In [11]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool
import plotly.graph_objects as go
import plotly.graph_objs as go
import re
import plotly.express as px
from collections import Counter
from wordcloud import WordCloud


**Read data**

In [4]:
# Load the train and test splits of the Corona NLP tweet dataset.
# NOTE(review): the path points at a mounted Colab Drive; adjust DATA_DIR when running elsewhere.
DATA_DIR = "/content/drive/My Drive/DataSets/archive"
train = pd.read_csv(f"{DATA_DIR}/Corona_NLP_train.csv", encoding='latin1')
test = pd.read_csv(f"{DATA_DIR}/Corona_NLP_test.csv", encoding='latin1')

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. Row labels of each split are kept as they are (so labels
# repeat across the two splits), exactly as append did.
df = pd.concat([train, test], sort=False)

df.head()


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


**Data structure**

In [5]:
# Print the combined frame's columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44955 entries, 0 to 3797
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       44955 non-null  int64 
 1   ScreenName     44955 non-null  int64 
 2   Location       35531 non-null  object
 3   TweetAt        44955 non-null  object
 4   OriginalTweet  44955 non-null  object
 5   Sentiment      44955 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.4+ MB


**Sentiment Class Distribution**

In [6]:
# Number of tweets per sentiment label, most frequent first.
df_dist = df['Sentiment'].value_counts()

# One bar per sentiment label; bar height is the tweet count.
fig = go.Figure(data=[go.Bar(x=df_dist.index, y=df_dist.values)])
fig.update_layout(
    width=800,
    showlegend=False,
    xaxis_title="Sentiments",
    yaxis_title="Count",
)
fig.show()


**Tweet Locations**

In [7]:
# Ten most frequent values of the Location column.
loc = df['Location'].value_counts().nlargest(10)

# Vertical bars, one colour per location, with the count printed inside each bar.
fig = px.bar(
    x=loc.index,
    y=loc.values,
    color=loc.index,
    text=loc.values,
    orientation='v',
)

fig.update_traces(
    texttemplate='%{text:.2s}',
    textposition='inside',
    marker_line_color='rgb(8,48,107)',
    marker_line_width=2,
)

fig.update_layout(
    width=800,
    showlegend=False,
    xaxis_title="Location",
    yaxis_title="Count",
)
fig.show()

**Hashtags**

In [8]:
def extract_hash_tags(s):
    """Return the hashtags found in ``s`` as one space-separated string.

    A hashtag is a ``#`` followed by one or more word characters; the
    leading ``#`` is dropped. Returns an empty string when ``s`` has none.
    """
    return " ".join(match.group(1) for match in re.finditer(r"#(\w+)", s))

# Space-separated hashtags per tweet ("" when the tweet has none).
df['hashtags'] = df['OriginalTweet'].apply(extract_hash_tags)

# Count every hashtag on its own. Counting the joined per-tweet strings would
# treat a tweet tagged "#a #b" as the single word "a b" instead of two tags.
# str.split() on "" yields no items, so untagged tweets contribute nothing.
allHashTags = [tag.lower() for tags in df['hashtags'] for tag in tags.split()]
hash_df = dict(Counter(allHashTags))
top_hash_df = (
    pd.DataFrame(list(hash_df.items()), columns=['word', 'count'])
    .sort_values('count', ascending=False)
    .head(10)
)

# Bar chart of the ten most used hashtags, count printed inside each bar.
fig = px.bar(x=top_hash_df['word'], y=top_hash_df['count'],
       orientation='v',
       color=top_hash_df['word'],
       text=top_hash_df['count'],
       color_discrete_sequence=px.colors.qualitative.Bold)

fig.update_traces(texttemplate='%{text:.2s}', textposition='inside', marker_line_color='rgb(8,48,107)', marker_line_width=2)
fig.update_layout(width=800, showlegend=False, xaxis_title="Word", yaxis_title="Count")
fig.show()

**Mentions**

In [9]:
# Get all mentions

def get_mentions(s):
    """Return the @-mentions found in ``s`` as one space-separated string.

    A mention is an ``@`` that is not preceded by another ``@`` or a word
    character (which skips e-mail addresses such as ``a@b.com``), followed
    by 1 to 25 word characters. The leading ``@`` is dropped. Returns an
    empty string when ``s`` contains no mention.
    """
    # Raw string: "\w" in a plain string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    mentions = re.findall(r"(?<![@\w])@(\w{1,25})", s)
    return " ".join(mentions)
# Space-separated mentions per tweet ("" when the tweet has none).
df['mentions'] = df['OriginalTweet'].apply(get_mentions)

# Count every mentioned account on its own. Counting the joined per-tweet
# strings would treat "@a @b" as the single word "a b" instead of two accounts.
# str.split() on "" yields no items, so tweets without mentions add nothing.
allMentions = [tag.lower() for tags in df['mentions'] for tag in tags.split()]
mentions_df = dict(Counter(allMentions))
top_mentions_df = (
    pd.DataFrame(list(mentions_df.items()), columns=['word', 'count'])
    .sort_values('count', ascending=False)
    .head(20)
)

# Bar chart of the twenty most mentioned accounts, count printed inside each bar.
fig = px.bar(x=top_mentions_df['word'], y=top_mentions_df['count'],
       orientation='v',
       color=top_mentions_df['word'],
       text=top_mentions_df['count'],
       color_discrete_sequence=px.colors.qualitative.Bold)

fig.update_traces(texttemplate='%{text:.2s}', textposition='inside', marker_line_color='rgb(8,48,107)', marker_line_width=2)
fig.update_layout(width=800, showlegend=False, xaxis_title="Word", yaxis_title="Count")
fig.show()


**Commonly Used Words**

In [None]:
# Join every tweet into one long string for the word cloud.
temp = " ".join(df["OriginalTweet"].tolist())

# Cloud of at most 50 single words (collocations switched off).
wc = WordCloud(
    width=400,
    height=400,
    collocations=False,
    max_words=50,
)
wc = wc.generate(temp)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(12, 10))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

**Data description**

In [None]:
# Summary statistics of the combined frame. With default arguments pandas
# describes only the numeric columns of a mixed-dtype frame.
df.describe() 

In [None]:
# Cast the tweet text and the sentiment label to str in the combined frame
# and in both splits.
for frame in (df, train, test):
    for col in ('OriginalTweet', 'Sentiment'):
        frame[col] = frame[col].astype(str)

# Missing values per column of the training split, largest count first.
null_counts = train.isnull().sum()
display(null_counts.sort_values(ascending=False))

In [None]:
# Number of tweets per sentiment class, largest class first.
# The frame has no 'text' column and no ' The hasta' column, so the previous
# selection and sort key both raised KeyError. 'OriginalTweet' holds the tweet
# text and is non-null for every row, so its per-group count is the class size.
class_df = (
    df.groupby('Sentiment')
    .count()['OriginalTweet']
    .reset_index()
    .sort_values(by='OriginalTweet', ascending=False)
)
class_df.style.background_gradient(cmap='winter')