## Import libraries

In [1]:
import pandas as pd
import datetime as dt

# Visualization libraries
# WordCloud and plt are required by the word-cloud cells further down.
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# %matplotlib inline
# import seaborn as sns

# Transformer model
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


## Import Reddit data

The data contains the top posts from three subreddits: r/datascience, r/MachineLearning and r/artificial. It was obtained with the Python Reddit API Wrapper (PRAW) library.

In [2]:
# Read the exported Reddit posts and comments from the local datasets folder.
posts_df = pd.read_csv('datasets/DS_ML_AI_posts.csv')
comments_df = pd.read_csv('datasets/DS_ML_AI_comments.csv')

# Headline counts (rows per frame, distinct subreddits) - for metric cells
number_of_posts = len(posts_df.index)
number_of_comments = len(comments_df.index)
count_subreddits = posts_df['subreddit'].nunique()

In [3]:
# Convert the epoch-seconds `created_utc` column to timestamps.
# pd.to_datetime(..., unit='s') is vectorized and always interprets the value as
# UTC, whereas datetime.fromtimestamp() shifts it into the local timezone of
# whichever machine runs the notebook (so the derived year could differ).
posts_df['created_date'] = pd.to_datetime(posts_df['created_utc'], unit='s')
posts_df['created_year'] = posts_df['created_date'].dt.year
# posts_df

In [5]:
# Left-join every post to its comments on post_id, then keep only the rows
# that actually carry a comment.
comments_posts_df = posts_df.merge(comments_df, how='left', on='post_id')

has_comment = comments_posts_df['comment'].notnull()
comments_posts_df = comments_posts_df[has_comment]

# EDA

In [6]:
# posts_df

In [9]:
# Bar chart of the number of posts per year, coloured and filled by subreddit.
from lets_plot import *

(
    ggplot()
    + geom_bar(
        aes(x="created_year", y="..count..", color="subreddit", fill="subreddit"),
        data=posts_df,
        # DataFrame.size is rows x columns; frames below 50 cells are plotted
        # without sampling, larger ones go through sampling_pick(n=50).
        sampling="none" if posts_df.size < 50 else sampling_pick(n=50),
    )
    + ggtitle("Number of posts by year")
)

## 🌤 Wordcloud post titles

In [10]:
# Build one lower-cased string out of every post title.
# Missing titles are dropped first: str.join() raises TypeError on the NaN
# floats pandas uses for empty cells (the per-year word cloud filters them too).
post_title_text = ' '.join(posts_df['post_title'].dropna().astype(str).str.lower())

word_cloud = WordCloud(
    collocation_threshold=2,
    width=1000,
    height=500,
    background_color='white',
).generate(post_title_text)

# Display the generated Word Cloud
plt.figure(figsize=(10, 5))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()

## 🌤 Wordcloud post titles by year

In [11]:
# Year whose post titles feed the per-year word cloud in the next cell.
selected_year = 2023

In [12]:
# Restrict the posts to the selected year.
posts_in_year = posts_df.loc[posts_df['created_year'] == selected_year]

# Join that year's titles into one string, leaving out missing titles.
titles_in_year = posts_in_year['post_title']
post_title_text_year = ' '.join(titles_in_year[titles_in_year.notna()])

word_cloud = WordCloud(
    collocation_threshold=2,
    width=1000,
    height=500,
    background_color='white',
).generate(post_title_text_year)

# Render the word cloud without axes
plt.figure(figsize=(10, 5))
plt.imshow(word_cloud)
plt.axis("off")
plt.show()

## 🤔 Sentiment analysis

In [7]:
# Search term: posts whose title contains this text are selected for the
# sentiment and emotion analysis below (matched with Series.str.contains).
word_input = "chatgpt"

In [8]:
# Transformers pipeline built from the finiteautomata/bertweet-base-sentiment-analysis
# checkpoint; get_sentiment() calls it once per comment.
# The labels visible in this notebook's output are POS / NEU / NEG.
sentiment_classifier = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [9]:
def get_sentiment(text):
    """Return the sentiment label predicted for one piece of text.

    Parameters
    ----------
    text : str
        Text handed to the module-level ``sentiment_classifier`` pipeline.

    Returns
    -------
    str
        The ``'label'`` of the first prediction, or ``'Not classified'`` when
        the classifier raises an error for this text.
    """
    try:
        sentiment = sentiment_classifier(text)[0]['label']
    except Exception:
        # Best-effort: one text the model rejects must not abort a whole
        # DataFrame.apply(). Catching Exception (not a bare except) still lets
        # KeyboardInterrupt / SystemExit through, so a long run can be stopped.
        sentiment = 'Not classified'

    return sentiment

In [10]:
# Keep the comments of posts whose title contains the search word.
# na=False counts a missing title as "no match" instead of putting NaN into the
# boolean mask, and .copy() gives the subset its own data so the later column
# assignments ('sentiment', 'emotion') no longer raise SettingWithCopyWarning.
title_matches = comments_posts_df['post_title'].str.contains(word_input, na=False)
comments_posts_df_sub = comments_posts_df[title_matches].copy()
# comments_posts_df_sub

In [None]:
# Classify each comment (cast to str first) and store the label per row.
comments_posts_df_sub['sentiment'] = (
    comments_posts_df_sub['comment']
    .astype(str)
    .apply(get_sentiment)
)
# comments_posts_df_sub

In [18]:
# Lets-plot library https://lets-plot.org/index.html
from lets_plot import *
from lets_plot.mapping import *

# Pie chart (with a hole) of the sentiment labels; each slice is labelled with
# the sentiment and its share formatted as a whole percentage.
(
    ggplot(comments_posts_df_sub)
    + geom_pie(
        aes(fill=as_discrete('sentiment', order_by='..count..')),
        size=30, hole=0.2, stroke=1.0,
        labels=layer_labels()
            .line('@sentiment')
            .line('(@{..prop..})')
            .format('..prop..', '.0%'),
    )
    + theme(line=element_blank(), axis_text=element_blank(),
            axis_title=element_blank(), legend_position='none')
    # Title typo fixed ("Sentiment of around the topic").
    + ggtitle('Sentiment around the topic')
)

## 🤯 Emotion recognition

In [19]:
# Text-classification pipeline for the bhadresh-savani/distilbert-base-uncased-emotion
# checkpoint. return_all_scores=True is requested because get_emotion() picks the
# highest-scoring entry from the list of label/score dicts it gets back.
emotion_classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)

In [27]:
def get_emotion(text):
    """Return the emotion label with the highest score for ``text``.

    The module-level ``emotion_classifier`` is called with the text; the first
    element of its result is treated as a list of ``{'label', 'score'}`` dicts
    and the label of the top-scoring one is returned.
    """
    label_scores = emotion_classifier(text)[0]
    top_prediction = max(label_scores, key=lambda candidate: candidate['score'])
    return top_prediction['label']

In [18]:
# Predict the dominant emotion of each comment (cast to str first), then show
# the frame with its new column.
comments_posts_df_sub['emotion'] = (
    comments_posts_df_sub['comment']
    .astype(str)
    .apply(get_emotion)
)
comments_posts_df_sub

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comments_posts_df_sub['emotion'] = comments_posts_df_sub['comment'].astype(str).apply(lambda x: get_emotion(x))


Unnamed: 0,post_id,subreddit,created_utc,selftext,post_url,post_title,link_flair_text,score,num_comments,upvote_ratio,created_date,created_year,comment,sentiment,emotion
97310,1174kud,artificial,1.676893e+09,,https://i.redd.it/g6c8lxiygdja1.jpg,"fine, let's just get chatgpt cancelledðŸ’€",News,252,54,0.79,2023-02-20 11:42:57,2023,How many time has this picture been screen cap...,NEG,sadness
97311,1174kud,artificial,1.676893e+09,,https://i.redd.it/g6c8lxiygdja1.jpg,"fine, let's just get chatgpt cancelledðŸ’€",News,252,54,0.79,2023-02-20 11:42:57,2023,Unbased gpt,NEU,fear
97312,1174kud,artificial,1.676893e+09,,https://i.redd.it/g6c8lxiygdja1.jpg,"fine, let's just get chatgpt cancelledðŸ’€",News,252,54,0.79,2023-02-20 11:42:57,2023,THIS IS BRAND NEW INFORMATION,POS,joy
97313,1174kud,artificial,1.676893e+09,,https://i.redd.it/g6c8lxiygdja1.jpg,"fine, let's just get chatgpt cancelledðŸ’€",News,252,54,0.79,2023-02-20 11:42:57,2023,Hereâ€™s some actual data on cgpt bias for anyon...,NEU,joy
97314,1174kud,artificial,1.676893e+09,,https://i.redd.it/g6c8lxiygdja1.jpg,"fine, let's just get chatgpt cancelledðŸ’€",News,252,54,0.79,2023-02-20 11:42:57,2023,This ethical censorship bs just needs to go al...,NEG,anger
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108261,104nxq2,artificial,1.672990e+09,"after using chatgpt for a couple of weeks, ive...",https://www.reddit.com/r/artificial/comments/1...,chatgpt has massively improved my productivity...,Question,104,18,0.93,2023-01-06 07:25:29,2023,"it would probably be able to do that, its a bi...",NEU,anger
108262,104nxq2,artificial,1.672990e+09,"after using chatgpt for a couple of weeks, ive...",https://www.reddit.com/r/artificial/comments/1...,chatgpt has massively improved my productivity...,Question,104,18,0.93,2023-01-06 07:25:29,2023,"You.com is built on GPT, so not exactly a comp...",NEU,anger
108263,104nxq2,artificial,1.672990e+09,"after using chatgpt for a couple of weeks, ive...",https://www.reddit.com/r/artificial/comments/1...,chatgpt has massively improved my productivity...,Question,104,18,0.93,2023-01-06 07:25:29,2023,interesitng thanks ill check it out.\n\nwhats ...,Not classified,anger
108264,104nxq2,artificial,1.672990e+09,"after using chatgpt for a couple of weeks, ive...",https://www.reddit.com/r/artificial/comments/1...,chatgpt has massively improved my productivity...,Question,104,18,0.93,2023-01-06 07:25:29,2023,For me it has been a game changer. Keep in min...,POS,joy


In [19]:
# Bar chart counting how often each emotion label occurs in the selected comments.
from lets_plot import *

(
    ggplot()
    + geom_bar(
        aes(x="emotion", y="..count.."),
        data=comments_posts_df_sub,
        # DataFrame.size is rows x columns; frames below 50 cells are plotted
        # without sampling, larger ones go through sampling_pick(n=50).
        sampling="none" if comments_posts_df_sub.size < 50 else sampling_pick(n=50),
    )
    + ggtitle("Emotions around the topic")
)

# 🤖 Ask-Me-Anything chatbot (ChatGPT API + Reddit data)

In [20]:
# Import modules from llama_index and langchain
from llama_index import SimpleDirectoryReader, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
from langchain import OpenAI
import os
from IPython.display import Markdown, display

In [29]:
# Combine all posts and comments: one row per (post_title, selftext) pair, with
# that pair's comments concatenated using '. ' as the separator.
comments_posts_df_tmp = comments_posts_df.loc[:, ['post_title', 'selftext', 'comment']].astype(str)
agg_comments = (
    comments_posts_df_tmp
    .groupby(['post_title', 'selftext'])['comment']
    .apply('. '.join)
    .reset_index()
)
agg_comments

Unnamed: 0,post_title,selftext,comment
0,"""Artificial Imagination"" - AI generated",,Why does everything look familiar but nothing ...
1,"""At least 40% of startups in Europe that claim...",**Read Article:** [https://www.theverge.com/20...,That's much smaller than I expected. I'd expec...
2,"""Do I need to know {insert advanced math} to g...","These posts occur with some regularity, and {i...",">These posts occur with some regularity, and {..."
3,"""Floraison d'hiver"" (Winter Bloom), creating d...",,"Hello everyone!\n\nThis is my ""Floraison d'hiv..."
4,"""Humans can decipher adversarial images"": A st...",,(spends a 100 years teaching computers to thin...
...,...,...,...
2961,"ðŸ“Œ[Searchcolab] ""Gotham during Recession"" Link ...",,These are fucking hilarious.. Spiderman in Got...
2962,ðŸ“Œ[Searchcolab] Text-To-4D Dynamic Scene Genera...,,I figured this must be around the corner with ...
2963,ðŸ˜± AI Senses People Through Walls - by MIT,,Wall hacks. Great for RoboCop prototypes. No l...
2964,ðŸ˜² Types of Artificial Intelligence,,Although this seems possible and is the roadma...


In [23]:
# (ONLY NEED TO RUN AND SAVE ONCE)
# agg_comments['combined_text'] = agg_comments.astype(str).agg('. '.join, axis=1)
# all_text = ' '.join(agg_comments['combined_text'])

# Save text to txt file
# f = open("textdata/all_text_reddit.txt", "w") 
# f.write(all_text)
# f.close()

In [21]:
# Adapted based on LlamaIndex documentation https://gpt-index.readthedocs.io/en/latest/index.html
# and Dan Shipper's work https://www.lennysnewsletter.com/p/i-built-a-lenny-chatbot-using-gpt

def construct_index(directory_path, index_path='index.json'):
    """Build a GPTSimpleVectorIndex from the files in a directory and save it.

    Parameters
    ----------
    directory_path : str
        Folder handed to SimpleDirectoryReader; the documents it loads are indexed.
    index_path : str, optional
        File the index is written to with ``save_to_disk``. Defaults to
        'index.json', the file ask_me_anything() loads.

    Returns
    -------
    GPTSimpleVectorIndex
        The index that was just built and saved.
    """
    # Prompt-sizing settings passed to PromptHelper
    max_input_size = 4096    # maximum input size
    num_outputs = 256        # number of output tokens (also the LLM's max_tokens)
    max_chunk_overlap = 20   # maximum chunk overlap
    chunk_size_limit = 600   # chunk size limit

    # define LLM (ChatGPT gpt-3.5-turbo); temperature=0 is passed through to OpenAI
    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo", max_tokens=num_outputs))
    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

    documents = SimpleDirectoryReader(directory_path).load_data()

    index = GPTSimpleVectorIndex(
        documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
    )

    # Persist the index so it can be reloaded later without rebuilding it.
    index.save_to_disk(index_path)

    return index


def ask_me_anything(question, index_path='index.json'):
    """Query the saved index with a question and display the exchange.

    Parameters
    ----------
    question : str
        Question sent to the index with ``response_mode="compact"``.
    index_path : str, optional
        File the index is loaded from. Defaults to 'index.json', the file
        construct_index() writes.

    Returns
    -------
    None
        The question and the answer are rendered as Markdown via ``display``.
    """
    # The index is reloaded from disk on every call.
    index = GPTSimpleVectorIndex.load_from_disk(index_path)
    response = index.query(question, response_mode="compact")

    display(Markdown(f"You asked: <b>{question}</b>"))
    display(Markdown(f"Bot says: <b>{response.response}</b>"))

In [32]:
# Take the OpenAI key from the OPENAI_API_KEY environment variable so that a real
# key never has to be pasted into (and saved with) the notebook. The original
# placeholder remains the fallback when the variable is not set.
openai_key = os.environ.get("OPENAI_API_KEY", "XXXXX")

In [23]:
# Set OpenAI key
# Exported as an environment variable of this kernel process; presumably this is
# where the OpenAI client used by the index code picks it up - TODO confirm.
os.environ["OPENAI_API_KEY"] = openai_key

In [7]:
# Construct our index (ONLY NEED TO RUN ONCE! BE CAREFUL THAT THIS COSTS MONEY)
# This will take every file in folder, split it into chunks, and embed it with OpenAI's embeddings API.
# The build is skipped when index.json (the file construct_index saves and
# ask_me_anything loads) already exists, so re-running the whole notebook does
# not pay for the embeddings again. Delete index.json to force a rebuild.
if not os.path.exists('index.json'):
    construct_index('/data/notebook_files/textdata')

In [29]:
# Question that the next cell passes to ask_me_anything().
question = "Is it hard to learn data science?"

In [30]:
# Run Reddit chatbot: loads index.json, queries it with `question` and renders
# the question and the answer as Markdown.
ask_me_anything(question)

INFO:root:> [query] Total LLM token usage: 637 tokens
INFO:root:> [query] Total embedding token usage: 8 tokens


You asked: <b>Is it hard to learn data science?</b>

Bot says: <b>
No, it is not hard to learn data science. Data science involves the use of data to solve problems and make decisions. It requires knowledge of mathematics, statistics, computer science, and other related fields. With the right resources and dedication, anyone can learn the basics of data science.</b>