In [3]:
import json
import pandas as pd
import re
from tqdm import tqdm
import langdetect as ld
from textblob import TextBlob
# Load messages.json. The file is opened explicitly as UTF-8 so decoding does
# not depend on the platform's default encoding, and the JSON is parsed
# directly from the file handle instead of from an intermediate string.
with open('messages.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Flatten the records stored under the top-level 'messages' key: one row each.
df_messages = pd.json_normalize(data, record_path=['messages'])
# Register tqdm's progress_apply on pandas objects (used by the cells below).
tqdm.pandas()

In [4]:
# Noise replaced with a space by clean_text, in alternation order:
#   @mentions | one non-alphanumeric character that is followed by a space |
#   URLs (scheme://...) | {...} spans | any single bracket character.
# Written as a raw string: the previous non-raw literal relied on invalid
# escape sequences (\w, \/, \S, \(, \]), which Python warns about.
_NOISE_PATTERN = re.compile(
    r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)|{.*?}|[\([{})\]]"
)


def clean_text(text):
    """Strip mentions, URLs, brace spans and brackets, then collapse whitespace.

    Args:
        text: Any value; it is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, with every run of whitespace reduced to a
        single space and no leading/trailing whitespace. Digits are kept.
    """
    # NOTE(review): the second alternative only removes punctuation that is
    # immediately followed by a space (there is a literal space after the
    # group). That looks unintentional, but it is preserved here because
    # changing it would alter every downstream result -- confirm intent.
    stripped = _NOISE_PATTERN.sub(" ", str(text))
    return ' '.join(stripped.split())

# Store the cleaned text in 'message_copy'; the 'text' column itself is not modified.
df_messages['message_copy'] = (
    df_messages['text']
    .progress_apply(clean_text)
)

100%|██████████| 49436/49436 [00:01<00:00, 45742.92it/s]


In [8]:
def is_english(txt):
    """Return True when langdetect classifies ``txt`` as English ('en').

    Used as a row filter: a detection failure is treated as "not English",
    so the row is dropped instead of aborting the whole pass.

    Args:
        txt: Text passed to ``ld.detect``.

    Returns:
        bool: True if the detected language code is 'en'; False otherwise,
        including when detection raises an exception.
    """
    try:
        return ld.detect(txt) == 'en'
    except Exception:
        # Deliberately best-effort, but narrower than a bare ``except:`` so
        # KeyboardInterrupt/SystemExit can still stop a long-running apply.
        return False

# Keep only the rows whose cleaned text is detected as English.
df_m_copy = df_messages.loc[
    df_messages['message_copy'].progress_apply(is_english)
]

100%|██████████| 49436/49436 [06:12<00:00, 132.88it/s]


In [17]:
# Case-sensitive pattern for the two tickers of interest.
words = re.compile('SHIB|DOGE')


def filter_messages(txt):
    """Tell whether a message contains 'SHIB' or 'DOGE'.

    Used as a row filter: rows for which this returns False are removed.
    The input is converted with ``str()`` before searching.
    """
    return words.search(str(txt)) is not None

# Keep only the rows that mention SHIB/DOGE. The explicit .copy() makes the
# result an independent frame, so adding columns to it afterwards does not
# raise pandas' SettingWithCopyWarning or depend on view-vs-copy behaviour.
df_m_copy = df_m_copy[df_m_copy['message_copy'].progress_apply(filter_messages)].copy()

100%|██████████| 408/408 [00:00<00:00, 163071.85it/s]


In [18]:
def sentiment_analysis(txt):
    """Classify a message's sentiment from its TextBlob polarity.

    Returns a two-element pandas Series: the label ('positive' when the
    polarity is above zero, 'neutral' when it is exactly zero, 'negative'
    otherwise) followed by the polarity value itself.
    """
    score = TextBlob(str(txt)).sentiment.polarity
    if score > 0:
        label = 'positive'
    elif score == 0:
        label = 'neutral'
    else:
        label = 'negative'
    return pd.Series([label, score])
# Target columns for the (label, polarity) pair returned by sentiment_analysis.
# Named sentiment_columns rather than `vars` so the builtin vars() is not
# shadowed for the rest of the notebook session.
sentiment_columns = ['sentiment', 'sentiment_score']
df_m_copy[sentiment_columns] = df_m_copy['message_copy'].progress_apply(sentiment_analysis)

100%|██████████| 408/408 [00:00<00:00, 1982.07it/s]


In [19]:
from collections import defaultdict
def extract_daywise_data(messages=None):
    """Aggregate per-message sentiment into per-day counters.

    Args:
        messages: DataFrame with 'date', 'sentiment' and 'sentiment_score'
            columns. 'date' values must be strings; the day is the part
            before the first 'T'. 'sentiment' values must be 'positive',
            'negative' or 'neutral'. When omitted, the notebook-level
            ``df_m_copy`` frame is used, which keeps the original
            zero-argument call working.

    Returns:
        defaultdict: Maps each day string to a dict with the keys
        'positive', 'negative', 'neutral' (message counts per label),
        'total' (number of messages) and 'total_score' (sum of scores).
        Days appear in order of first occurrence.
    """
    if messages is None:
        messages = df_m_copy

    hmap = defaultdict(dict)
    # Walk the three needed columns in lockstep instead of using iterrows(),
    # which builds a full Series object for every row.
    columns = zip(messages['date'], messages['sentiment'], messages['sentiment_score'])
    for timestamp, sentiment, score in columns:
        date = timestamp.partition('T')[0]
        if date not in hmap:
            # Key order is kept stable because it becomes the column order
            # of any DataFrame built from the result.
            hmap[date] = {
                'positive': 0,
                'negative': 0,
                'neutral': 0,
                'total': 0,
                'total_score': 0,
            }
        day = hmap[date]
        day[sentiment] += 1
        day['total'] += 1
        day['total_score'] += score
    return hmap

# Build one row per day from the aggregated counters, then derive the mean
# sentiment score per message and expose the day (the index) as a 'date' column.
day_map = extract_daywise_data()
daywise_df = (
    pd.DataFrame.from_dict(day_map, orient='index')
    .assign(
        avg_score=lambda frame: frame['total_score'] / frame['total'],
        date=lambda frame: frame.index,
    )
)

In [22]:
import plotly.express as px       
# 3D scatter: one point per day, positioned by date, message count and
# average sentiment score.
fig = px.scatter_3d(
    daywise_df,
    x='date',
    y='total',
    z='avg_score',
    title="3D plot of number of messages and average sentiment per day",
    labels={
        "date": "Date",
        "total": "Number of messages",
        "avg_score": "Average sentiment score",
    },
)
fig.show()

In [21]:
# 2D scatter of the average sentiment score for each day.
fig = px.scatter(
    daywise_df,
    x="date",
    y="avg_score",
    labels={
        "date": "Date",
        "avg_score": "Average sentiment per day",
    },
)
fig.show()