In [57]:
import numpy as np # linear algebra
import pandas as pd # data processing

import os

In [58]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.express as px
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [59]:
# Load the COVID-19 tweet export and preview its first rows.
COVID_TWEETS_CSV = '/content/covid19_tweets.csv'
covid_data = pd.read_csv(COVID_TWEETS_CSV)
covid_data.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [60]:
# Headline sizes: number of rows and number of distinct user names.
total_tweets = covid_data.shape[0]
unique_users = covid_data['user_name'].nunique()
print('Total tweets in this data: {}'.format(total_tweets))
print('Total Unique Users in this data: {}'.format(unique_users))

Total tweets in this data: 179108
Total Unique Users in this data: 92276


In [61]:
# Column dtypes and non-null counts, to see which columns have missing values.
covid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179108 entries, 0 to 179107
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_name         179108 non-null  object
 1   user_location     142337 non-null  object
 2   user_description  168822 non-null  object
 3   user_created      179108 non-null  object
 4   user_followers    179108 non-null  int64 
 5   user_friends      179108 non-null  int64 
 6   user_favourites   179108 non-null  int64 
 7   user_verified     179108 non-null  bool  
 8   date              179108 non-null  object
 9   text              179108 non-null  object
 10  hashtags          127774 non-null  object
 11  source            179031 non-null  object
 12  is_retweet        179108 non-null  bool  
dtypes: bool(2), int64(3), object(8)
memory usage: 15.4+ MB


In [62]:
# Use the text after the last comma of user_location as a rough country/region
# label. Splitting on ',' keeps the space that follows the comma
# ("New York, NY" -> " NY"), so strip surrounding whitespace to avoid
# near-duplicate labels. Missing locations stay NaN.
covid_data['country_name'] = (
    covid_data['user_location']
    .str.split(',')
    .str[-1]
    .str.strip()
)

# Calendar date (time of day dropped) of each tweet, used for per-day counts.
covid_data['only_date'] = pd.to_datetime(covid_data['date']).dt.date

In [63]:
# Tweets per user name, largest count first.
tweets_per_user = covid_data['user_name'].value_counts().sort_values(ascending=False)
user_analysis = pd.DataFrame(tweets_per_user).rename(columns={'user_name': 'count'})

# Bar chart of the 15 most active user names.
top_users = user_analysis.head(15)
trace = go.Bar(
    x=top_users.index,
    y=top_users['count'],
    marker={
        'color': 'rgba(255,155,128,0.5)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
)

layout = go.Layout(
    title="Top 15 user by no. of tweets",
    xaxis={
        'title': 'User Name',
        'zeroline': False,
        'gridcolor': 'rgb(183,183,183)',
        'showline': True,
    },
    yaxis={
        'title': 'Frequency of tweets',
        'zeroline': False,
        'gridcolor': 'rgb(183,183,183)',
        'showline': True,
    },
    font={'family': 'Courier New, monospace', 'size': 12, 'color': 'rgb(0,0,0)'},
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [64]:
# Tweets per raw user_location string. This frame was previously read without
# ever being defined in the notebook, so the cell failed on a fresh kernel.
# value_counts() already orders by descending count and skips missing values;
# naming the Series 'count' gives the same column name on every pandas version.
location_analysis = (
    covid_data['user_location']
    .value_counts()
    .rename('count')
    .to_frame()
)

# Donut chart of the 15 most common locations.
data = {
   "values": location_analysis['count'][:15],
   "labels": location_analysis.index[:15],
   "domain": {"column": 0},
   "name": "Location Name",
   "hoverinfo":"label+percent+name",
   "hole": .4,
   "type": "pie"
}
layout = go.Layout(
   {
      "title":"Location Ratio",
}
)

data = [data]
fig = go.Figure(data = data, layout = layout)
iplot(fig)


In [65]:
# Number of tweets posted on each calendar date.
tweets_per_date = covid_data['only_date'].value_counts()
tweet_analysis = pd.DataFrame(tweets_per_date).rename(columns={'only_date': 'count'})

trace = go.Bar(
    x=tweet_analysis.index,
    y=tweet_analysis['count'],
    marker={
        'color': 'rgba(150, 200, 100, 0.5)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
)

layout = go.Layout(
    barmode='group',
    title="Date wise no. of tweets",
    xaxis={
        'title': 'Date',
        'zeroline': False,
        'gridcolor': 'rgb(183,183,183)',
        'showline': True,
    },
    yaxis={
        'title': 'Frequency of tweets',
        'zeroline': False,
        'gridcolor': 'rgb(183,183,183)',
        'showline': True,
    },
    font={'family': 'Courier New, monospace', 'size': 12, 'color': 'rgb(0,0,0)'},
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [66]:
# Tweets per posting client ("source"), largest count first.
tweets_per_source = covid_data['source'].value_counts().sort_values(ascending=False)
source_analysis = pd.DataFrame(tweets_per_source).rename(columns={'source': 'count'})

# Bar chart of the 10 most used sources.
top_sources = source_analysis.head(10)
trace = go.Bar(
    x=top_sources.index,
    y=top_sources['count'],
    marker={
        'color': 'rgba(150, 125, 180, 0.5)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
)

layout = go.Layout(
    title="Top 10 Sources by no. of tweets",
    xaxis={
        'title': 'Source Name',
        'zeroline': False,
        'gridcolor': 'rgb(183,183,183)',
        'showline': True,
    },
    yaxis={
        'title': 'Frequency of tweets',
        'zeroline': False,
        'gridcolor': 'rgb(183,183,183)',
        'showline': True,
    },
    font={'family': 'Courier New, monospace', 'size': 12, 'color': 'rgb(0,0,0)'},
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [67]:
# Donut chart showing the share of tweets from each of the 15 most used sources.
source_pie = dict(
    values=source_analysis['count'][:15],
    labels=source_analysis.index[:15],
    domain=dict(column=0),
    name="Source Name",
    hoverinfo="label+percent+name",
    hole=.4,
    type="pie",
)
layout = go.Layout(title="Source Ratio of Top 15 sources")
data = [source_pie]
fig = go.Figure(data=data, layout=layout)
# Fixed canvas size so the legend with 15 entries has room.
fig.update_layout(autosize=False, width=1200, height=700)
iplot(fig)

In [68]:
def wordcloud(string):
    """Draw a word cloud of `string` on a new 16x8 matplotlib figure.

    Stop words are read from the module-level `stop_words` name, which must
    be defined before this function is called. Returns None; the image is
    drawn with the axes hidden.
    """
    cloud = WordCloud(
        width=800,
        height=500,
        mask=None,
        random_state=21,
        max_font_size=110,
        stopwords=stop_words,
    )
    cloud.generate(string)
    plt.figure(figsize=(16, 8))
    plt.axis('off')
    plt.imshow(cloud)

In [69]:
# Stop-word set consumed by the wordcloud() helper.
stop_words = set(STOPWORDS)


def _space_joined(values):
    """Concatenate an iterable of strings into one space-separated string."""
    return " ".join(values)


# One long string per column. Columns that can hold missing values are cast
# to str first so NaN becomes the text 'nan' instead of breaking the join.
country_string = _space_joined(covid_data['country_name'].astype('str'))
source_string = _space_joined(covid_data['source'].astype('str'))
text_string = _space_joined(covid_data['text'])
description_string = _space_joined(covid_data['user_description'].astype('str'))
hastage_string = _space_joined(covid_data['hashtags'].astype('str'))
location_string = _space_joined(covid_data['user_location'].astype('str'))

In [70]:
# Load the sentiment-annotated tweets used to train the classifiers.
SENTIMENT_CSV = '/content/finalSentimentdata2 - Copy.csv'
sentiment_data = pd.read_csv(SENTIMENT_CSV)

In [71]:
# Preview the first rows of the sentiment data.
sentiment_data.head()

Unnamed: 0,number,sentiment,text
0,3204,sad,agree the poor in india are treated badly thei...
1,1431,joy,if only i could have spent the with this cutie...
2,654,joy,will nature conservation remain a priority in ...
3,2530,sad,coronavirus disappearing in italy show this to...
4,2296,sad,uk records lowest daily virus death toll since...


In [72]:
# Distinct labels present in the 'sentiment' column (the classification targets).
sentiment_data['sentiment'].unique()

array(['sad', 'joy', 'fear', 'anger'], dtype=object)

In [73]:
import re
import string
def remove_punc(text):
    """Strip markup, links, punctuation and digit-bearing tokens from `text`.

    The substitutions run in this order (each later step sees the output of
    the earlier ones):

    1. square-bracketed spans such as ``[link]``;
    2. URLs starting with ``http://``, ``https://`` or ``www.``;
    3. angle-bracketed tags such as ``<b>``;
    4. every character in ``string.punctuation``;
    5. newline characters (removed outright, not replaced by a space);
    6. any word that contains at least one digit.

    Matches are deleted rather than replaced, so surrounding spaces are kept
    and runs of spaces can remain.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Patterns are raw strings: sequences like '\[' or '\S' in a normal string
    # literal are invalid escapes and warn on current Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Clean every tweet with remove_punc and write the result back to 'text'.
cleaned_text = sentiment_data['text'].apply(remove_punc)
sentiment_data['text'] = cleaned_text

In [74]:
from nltk import stem
from nltk.corpus import stopwords
# English Snowball stemmer used when normalising tweets.
stemmer = stem.SnowballStemmer(language='english')
# NOTE: this rebinds `stopwords` from the imported nltk corpus reader to a
# plain set of English stop words (set membership tests are O(1)).
english_stopword_list = stopwords.words('english')
stopwords = set(english_stopword_list)

def alternative_review_messages(msg):
    """Lower-case `msg`, drop stop words, and stem the remaining words.

    Words are whitespace-separated tokens of the lower-cased message. Tokens
    found in the module-level `stopwords` collection are discarded; the rest
    are stemmed with the module-level `stemmer` and joined with single spaces.

    Args:
        msg: The message text to normalise.

    Returns:
        The normalised message as a single string.
    """
    stemmed_tokens = []
    for token in msg.lower().split():
        if token in stopwords:
            continue
        stemmed_tokens.append(stemmer.stem(token))
    return " ".join(stemmed_tokens)

# Normalise every tweet (lower-case, stop words removed, stemmed) and write
# the result back to 'text'.
normalised_text = sentiment_data['text'].apply(alternative_review_messages)
sentiment_data['text'] = normalised_text

In [75]:
# Fixed seed so the 80/20 train/validation split is reproducible.
SEED = 2000
sentiment_texts = sentiment_data['text']
sentiment_labels = sentiment_data['sentiment']
x_train, x_validation, y_train, y_validation = train_test_split(
    sentiment_texts,
    sentiment_labels,
    test_size=.2,
    random_state=SEED,
)

In [76]:
from time import time
def prediction(pipeline, x_train, y_train,testtext):
    """Fit `pipeline` on the training data and predict labels for `testtext`.

    Args:
        pipeline: Object exposing ``fit(x, y)`` (returning the fitted
            estimator) and ``predict(x)``.
        x_train: Training inputs passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        testtext: Inputs to predict labels for.

    Returns:
        Whatever ``predict`` returns for `testtext`.
    """
    # The start timestamp previously captured here was never read, so it was dropped.
    sentiment_fit = pipeline.fit(x_train, y_train)
    return sentiment_fit.predict(testtext)

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline

# TF-IDF features (1- to 4-grams, at most 10,000 terms, no stop-word
# filtering) feeding a ridge classifier.
vectorizer=TfidfVectorizer()
checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', RidgeClassifier())
        ])
vectorizer.set_params(stop_words=None, max_features=10000, ngram_range=(1,4))

# Store the result under its own name. Assigning it to `prediction` replaced
# the helper function of that name with an array, so this cell could not be
# re-run without first re-running the cell that defines the function.
ridge_prediction = prediction(checker_pipeline, x_train, y_train, x_validation)

In [78]:
from sklearn.metrics import accuracy_score
def acc_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit `pipeline`, score it on the test split, and print a short report.

    The timing covers both fitting on the training data and predicting on
    the test data.

    Args:
        pipeline: Estimator exposing ``fit`` and ``predict``.
        x_train: Training inputs.
        y_train: Training labels.
        x_test: Inputs to evaluate on.
        y_test: True labels for `x_test`.

    Returns:
        Tuple ``(accuracy, train_test_time)``: the accuracy as a fraction and
        the elapsed fit-plus-predict time in seconds.
    """
    started = time()
    predicted = pipeline.fit(x_train, y_train).predict(x_test)
    elapsed = time() - started
    score = accuracy_score(y_test, predicted)
    report_lines = (
        "accuracy score: {0:.2f}%".format(score*100),
        "train and test time: {0:.2f}s".format(elapsed),
        "-"*80,
    )
    for line in report_lines:
        print(line)
    return score, elapsed
# Fit the ridge pipeline and report its accuracy on the validation split.
clf_acc = acc_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)

accuracy score: 68.28%
train and test time: 0.65s
--------------------------------------------------------------------------------


In [79]:
from sklearn.svm import SVC
def prediction2(pipeline, x_train, y_train,testtext):
    """Fit `pipeline` on the training data and predict labels for `testtext`.

    Args:
        pipeline: Object exposing ``fit(x, y)`` (returning the fitted
            estimator) and ``predict(x)``.
        x_train: Training inputs passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        testtext: Inputs to predict labels for.

    Returns:
        Whatever ``predict`` returns for `testtext`.
    """
    # The start timestamp previously captured here was never read, so it was dropped.
    sentiment_fit = pipeline.fit(x_train, y_train)
    return sentiment_fit.predict(testtext)
# Same TF-IDF vectorizer object as the ridge pipeline, classified with SVC(C=1000).
# NOTE(review): `vectorizer` is the very instance used by checker_pipeline, so
# fitting this pipeline refits that shared vectorizer as well.
checker_pipeline2 = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', SVC(C=1000))
        ])
vectorizer.set_params(stop_words=None, max_features=10000, ngram_range=(1,4))

# Store the result under its own name. Assigning it to `prediction` overwrote
# the helper function of that name with an array, which broke re-running the
# earlier cell that calls prediction(...).
svc_prediction = prediction2(checker_pipeline2, x_train, y_train, x_validation)

In [80]:
# Fit the SVC pipeline and report its accuracy on the validation split
# (this replaces the value previously held in clf_acc).
clf_acc = acc_summary(checker_pipeline2, x_train, y_train, x_validation, y_validation)

accuracy score: 69.58%
train and test time: 1.90s
--------------------------------------------------------------------------------
