# Twitter Sentiment Analysis of Covid-19   
* Coronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus.   
* Curfews, quarantines and lockdowns have been established worldwide to prevent further spread of the virus.   
* The goal of this project is to use hashtags and other Twitter components to analyse the sentiment of Indian citizens towards the overall lockdown situation.


### Timeline of lockdown :
Phase 1 : 25 March – 14 April   
Phase 2 : 15 April – 3 May   
Phase 3 : 4 May – 17 May   
Phase 4 : 18 May – 31 May   
Phase 5 : 1 June – 30 June     

We analyze the tweets posted on 16 April 2020, i.e. the day after Phase 2 began (15 April).

### A simple web app built with Streamlit is deployed to display the visualizations: https://covid19-sentiment-analysis.herokuapp.com/    

The source code and dataset for the same can be found here : https://github.com/kartik-mohan/Covid19-Sentiment-Analysis

# Importing Packages

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud,STOPWORDS
stopwords = set(STOPWORDS)

from textblob import TextBlob

import re

from collections import Counter

import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# Reading Data

In [None]:
# Load the 2020-04-16 tweet dump from the Kaggle input dataset into `df`
# and preview its first rows.
df=pd.read_csv('/kaggle/input/coronavirus-covid19-tweets-late-april/2020-04-16 Coronavirus Tweets.CSV')
df.head()

In [None]:
# Display the column labels of the raw frame (shown as the cell output).
df.columns

# Cleaning Data

In [None]:
# Build the working frame `tweet`: an independent copy of the raw data with
# the columns that the analysis below never reads removed.
unused_columns = [
    'status_id', 'user_id', 'screen_name', 'source',
    'reply_to_status_id', 'reply_to_user_id', 'is_retweet',
    'place_full_name', 'place_type', 'reply_to_screen_name', 'is_quote',
    'followers_count', 'friends_count', 'account_lang',
    'account_created_at', 'verified',
]
# Copy first so that later edits to `tweet` can never reach back into `df`.
tweet = df.copy().drop(columns=unused_columns)
tweet.head()

In [None]:
# Keep only rows with country_code "IN" and lang "en", renumber the rows
# from zero, then drop the two filter columns since they are now constant.
from_india = tweet['country_code'] == "IN"
in_english = tweet['lang'] == "en"
tweet = tweet.loc[from_india & in_english].reset_index(drop=True)
tweet = tweet.drop(columns=['country_code', 'lang'])
tweet.head()

In [None]:
# created_at column: replace each timestamp string with a fractional hour
# (hours + minutes / 60). Only the part after the "T" is read; seconds and
# anything after them are ignored.
def _fractional_hour(stamp):
    """Return hour + minute/60 parsed from the text after the 'T' in *stamp*."""
    clock_fields = stamp.split("T")[1].split(":")
    return int(clock_fields[0]) + int(clock_fields[1]) / 60


tweet["created_at"] = tweet["created_at"].apply(_fractional_hour)

In [None]:
# (rows, columns) of the filtered frame, shown as the cell output.
tweet.shape

In [None]:
# Count the missing (NaN) entries in each column.
tweet.isna().sum()

In [None]:
# data preprocessing: replace @mentions, URLs and every character that is not
# an ASCII letter, digit, space or tab with a space, then collapse runs of
# whitespace and lowercase the result.
# The alternatives are tried left to right, so a "#" is consumed by the
# single-character alternative before the hashtag alternative can match:
# the "#" sign is removed but the tag word itself stays in the text.
# Raw string so the regex escapes are not read as Python string escapes.
noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(#[A-Za-z0-9]+)")


def _normalize_text(text):
    """Return *text* with noise blanked out, whitespace collapsed, lowercased."""
    return ' '.join(noise_pattern.sub(" ", text).split()).lower()


# Assign the whole column at once instead of writing tweet['text'][i] row by
# row: that chained indexing can write to a temporary copy (and is a no-op
# under pandas Copy-on-Write), and it only worked with a 0..n-1 index.
tweet['text'] = tweet['text'].apply(_normalize_text)
tweet['text'].head()

## Top 5 most favourited tweets:

In [None]:
# The (up to) five rows with the largest favourites_count, highest first.
# NOTE(review): confirm favourites_count is a per-tweet like count and not an
# account-level figure before calling these the "most favourited tweets".
fav = tweet[['favourites_count','text']].sort_values('favourites_count',ascending = False)[:5].reset_index()
# Iterate over the rows that exist rather than range(5), which raised a
# KeyError whenever fewer than five tweets were available.
for i, text in enumerate(fav['text']):
    print(i,']', text,'\n')

## Top 5 most retweeted tweets:

In [None]:
# The (up to) five rows with the largest retweet_count, highest first.
retweet = tweet[['retweet_count','text']].sort_values('retweet_count',ascending = False)[:5].reset_index()
# Iterate over the rows that exist rather than range(5), which raised a
# KeyError whenever fewer than five tweets were available.
for i, text in enumerate(retweet['text']):
    print(i,']', text,'\n')

# Number of Tweets/Hour

In [None]:
# Histogram of the fractional-hour values in created_at, split into 24 bins.
label_size = 15
plt.figure(1, figsize=(10, 6))
plt.hist(tweet["created_at"], bins=24);
plt.ylabel('No. of Tweets', size=label_size)
plt.xlabel('Hours', size=label_size)
plt.title('No. of Tweets per Hour', size=label_size)

# Word Cloud : 

In [None]:
def show_wordcloud(data , title = None):
    """Render a word cloud of *data* with matplotlib.

    Args:
        data: A single string, or an iterable of strings such as a pandas
            Series of tweets. Iterables are joined with spaces so that every
            element contributes to the cloud.
        title: Optional title drawn above the image.

    Words in the module-level ``stopwords`` set are excluded from the cloud.
    """
    # str() on a pandas Series returns its display repr (index labels, a
    # "Name: ..., dtype: ..." footer and, for long Series, only the first and
    # last rows), so build the corpus from the actual elements instead.
    if isinstance(data, str):
        text = data
    else:
        try:
            text = ' '.join(str(item) for item in data)
        except TypeError:
            # Not iterable: keep the previous str() behaviour.
            text = str(data)

    wordcloud = WordCloud(background_color='black',stopwords=stopwords,max_words=200,max_font_size=40).generate(text)

    plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    plt.title(title, size = 25)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()

# Untitled word cloud built from the cleaned text column of the Indian tweets.
show_wordcloud(tweet['text'])

The following words can be seen: covid, fear, chinese, starving, strategy.

### Removing Stopwords

In [None]:
# Display the current stop-word set (built from wordcloud's STOPWORDS).
stopwords

In [None]:
# Removing Stop Words: keep only the whitespace-separated tokens of each
# tweet that are not in the `stopwords` set, re-joined with single spaces.
def _drop_stopwords(text):
    """Return *text* without the tokens found in the global stop-word set."""
    kept_tokens = [token for token in text.split() if token not in stopwords]
    return ' '.join(kept_tokens)


tweet['text'] = tweet['text'].apply(_drop_stopwords)
tweet['text'].head()

## Analyzing Text for Sentiment

#### Analyzing text using TextBlob to predict the sentiment of the text and categorise it as 'Positive', 'Negative' or 'Neutral'.

In [None]:
# Score every cleaned tweet with TextBlob and label it by the sign of the
# polarity: > 0 is 'positive', < 0 is 'negative', exactly 0 is 'neutral'.
polarity = tweet['text'].apply(lambda text: TextBlob(text).sentiment.polarity).astype(float)

# Fill the columns with whole-column / .loc assignments. The previous
# tweet['sentiment'][i] = ... form is chained indexing: it can write to a
# temporary copy (and is a no-op under pandas Copy-on-Write), and it left
# 'polarity' as an object column because it was initialised with None.
tweet['sentiment'] = 'neutral'
tweet.loc[polarity > 0, 'sentiment'] = 'positive'
tweet.loc[polarity < 0, 'sentiment'] = 'negative'
tweet['polarity'] = polarity
tweet.head()

In [None]:
# Print how many tweets fall in each sentiment class, then plot the same
# counts as a bar chart.
sentiment_totals = tweet['sentiment'].value_counts()
print(sentiment_totals)
sns.countplot(data=tweet, x='sentiment');

## Sentiment Distribution

In [None]:
# Distribution of TextBlob polarity scores: 30-bin histogram with a KDE curve.
plt.figure(figsize=(10,6))
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is the replacement. Cast to float because the polarity
# column may have object dtype (it is initialised with None).
sns.histplot(tweet['polarity'].astype(float), bins=30, kde=True, stat='density')
plt.title('Sentiment Distribution',size = 15)
plt.xlabel('Polarity',size = 15)
plt.ylabel('Frequency',size = 15)
plt.show();

## Using Word Clouds to see the higher-frequency words for each sentiment

In [None]:
# One word cloud per sentiment class, each built from the text of the
# tweets carrying that label.
pos = tweet.loc[tweet['sentiment'] == 'positive', 'text']
neg = tweet.loc[tweet['sentiment'] == 'negative', 'text']
neutral = tweet.loc[tweet['sentiment'] == 'neutral', 'text']

show_wordcloud(pos, title='POSITIVE')
show_wordcloud(neg, title='NEGATIVE')
show_wordcloud(neutral, title='NEUTRAL')

In [None]:
# Sum of favourites_count within each sentiment class, as a one-column frame.
favourites_by_sentiment = tweet.groupby('sentiment')['favourites_count'].sum()
count = favourites_by_sentiment.to_frame()
count.head()

# Most frequently appearing words

In [None]:
# Flatten every tweet into one list of whitespace-separated tokens.
words = []
for tweet_text in tweet.text:
    words.extend(tweet_text.split())

In [None]:
# The 30 most common tokens and their counts, most frequent first.
# Naming the columns in the constructor (instead of assigning .columns
# afterwards) also works when `words` is empty, where the old two-step form
# raised a length-mismatch ValueError.
freq = pd.DataFrame(Counter(words).most_common(30), columns=['word', 'frequency'])
freq.head()

In [None]:
# Horizontal bar chart of the word counts: one bar per row of `freq`.
plt.figure(figsize=(10, 10))
sns.barplot(data=freq, x="frequency", y="word");

In [None]:
# Save the processed frame to tweet.csv without writing the index column.
tweet.to_csv('tweet.csv',index=False)

# Conclusion  
We can conclude that most people had a positive or neutral sentiment towards the start of Lockdown Phase 2.

In [None]:
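# Big data project start
# Analysis of the United States (country code US)
# Analysis of Canada (country code CN)
# NOTE(review): "CN" is the ISO 3166-1 alpha-2 code for China; Canada is "CA" - confirm which country is intended.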

In [None]:
# To see which country codes are present, start from an independent copy of
# the raw dataset and drop the columns that are not needed for that.
dropped_columns = [
    'status_id', 'user_id', 'screen_name', 'source',
    'reply_to_status_id', 'reply_to_user_id', 'is_retweet',
    'place_full_name', 'place_type', 'reply_to_screen_name', 'is_quote',
    'followers_count', 'friends_count', 'account_lang',
    'account_created_at', 'verified',
]
country_view = df.copy().drop(columns=dropped_columns)
country_view.head()


In [None]:
# To view the country codes: keep only the rows that have no missing value
# in any remaining column, then preview them.
country_view = country_view.dropna()
country_view.head()

In [None]:
# Extend the shared stop-word set in place with extra tokens that should not
# appear in the word clouds, then display the updated set.
extra_stopwords = {"https", "name", "dtype", "text", "she", "whether", "ft", "in"}
stopwords |= extra_stopwords
stopwords

In [None]:
# Create a new dataset holding every raw row whose country_code is "US" and
# whose lang is "en". Boolean indexing already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapper is needed.
us_dataset = df[(df.country_code == "US") & (df.lang == "en")]
# index=False (as for tweet.csv): otherwise the old row labels are written as
# an unnamed first column, which comes back as a spurious "Unnamed: 0"
# column when us_data.csv is read again.
us_dataset.to_csv('us_data.csv', index=False)

In [None]:
# Create a new dataset holding every raw row whose country_code is "CN" and
# whose lang is "en".
# NOTE(review): "CN" is the ISO 3166-1 alpha-2 code for China; if Canada was
# intended the code should be "CA" - confirm before relying on this file.
cn_dataset = df[(df.country_code == "CN") & (df.lang == "en")]
# index=False (as for tweet.csv) so the file carries no unnamed index column.
cn_dataset.to_csv('cn_data.csv', index=False)

In [None]:
# View the contents of the us_data file: reload it from disk, which also
# gives the frame a fresh 0..n-1 row index, and display it.
us_dataset = pd.read_csv('./us_data.csv')
us_dataset

In [None]:
# (rows, columns) of the reloaded US dataset.
us_dataset.shape

In [None]:
# Build `us_tweet`: an independent copy of us_dataset without the columns
# that the US analysis below never reads.
us_unused_columns = [
    'status_id', 'user_id', 'screen_name', 'source',
    'reply_to_status_id', 'reply_to_user_id', 'is_retweet',
    'place_full_name', 'place_type', 'reply_to_screen_name', 'is_quote',
    'followers_count', 'friends_count', 'account_lang',
    'account_created_at', 'verified',
]
us_tweet = us_dataset.copy().drop(columns=us_unused_columns)
us_tweet.head()

In [None]:
# (rows, columns) of the trimmed US frame.
us_tweet.shape

In [None]:
# data preprocessing to make text uniform: replace @mentions, URLs and every
# character that is not an ASCII letter, digit, space or tab with a space,
# then collapse runs of whitespace and lowercase the result.
# The alternatives are tried left to right, so a "#" is consumed by the
# single-character alternative before the hashtag alternative can match:
# the "#" sign is removed but the tag word itself stays in the text.
# Raw string so the regex escapes are not read as Python string escapes.
noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(#[A-Za-z0-9]+)")


def _normalize_text(text):
    """Return *text* with noise blanked out, whitespace collapsed, lowercased."""
    return ' '.join(noise_pattern.sub(" ", text).split()).lower()


# Assign the whole column at once instead of writing us_tweet['text'][i] row
# by row: that chained indexing can write to a temporary copy (and is a
# no-op under pandas Copy-on-Write), and it only worked with a 0..n-1 index.
us_tweet['text'] = us_tweet['text'].apply(_normalize_text)
us_tweet['text'].head()

In [None]:
# Removing Stop Words: keep only the whitespace-separated tokens of each US
# tweet that are not in the `stopwords` set, re-joined with single spaces.
def _drop_stopwords(text):
    """Return *text* without the tokens found in the global stop-word set."""
    kept_tokens = [token for token in text.split() if token not in stopwords]
    return ' '.join(kept_tokens)


us_tweet['text'] = us_tweet['text'].apply(_drop_stopwords)
us_tweet['text'].head()

In [None]:
#first word cloud showing data without sentiment 
def show_wordcloud(data , title = None):
    """Render a word cloud of *data* with matplotlib.

    Args:
        data: A single string, or an iterable of strings such as a pandas
            Series of tweets. Iterables are joined with spaces so that every
            element contributes to the cloud.
        title: Optional title drawn above the image.

    Words in the module-level ``stopwords`` set are excluded from the cloud.
    """
    # str() on a pandas Series returns its display repr (index labels, a
    # "Name: ..., dtype: ..." footer and, for long Series, only the first and
    # last rows), so build the corpus from the actual elements instead.
    if isinstance(data, str):
        text = data
    else:
        try:
            text = ' '.join(str(item) for item in data)
        except TypeError:
            # Not iterable: keep the previous str() behaviour.
            text = str(data)

    wordcloud = WordCloud(background_color='black',stopwords=stopwords,max_words=200,max_font_size=40).generate(text)

    plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    plt.title(title, size = 25)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()

# Untitled word cloud built from the cleaned text column of the US tweets.
show_wordcloud(us_tweet['text'])


In [None]:
# Sentiment analysis on the us_tweet dataset: score every cleaned tweet with
# TextBlob and label it by the sign of the polarity (> 0 'positive',
# < 0 'negative', exactly 0 'neutral').
us_polarity = us_tweet['text'].apply(lambda text: TextBlob(text).sentiment.polarity).astype(float)

# Fill the columns with whole-column / .loc assignments. The previous
# us_tweet['sentiment'][i] = ... form is chained indexing: it can write to a
# temporary copy (and is a no-op under pandas Copy-on-Write), and it left
# 'polarity' as an object column because it was initialised with None.
us_tweet['sentiment'] = 'neutral'
us_tweet.loc[us_polarity > 0, 'sentiment'] = 'positive'
us_tweet.loc[us_polarity < 0, 'sentiment'] = 'negative'
us_tweet['polarity'] = us_polarity
us_tweet.head()

In [None]:
# Chart representation of sentiment for the US: print how many tweets fall
# in each class, then plot the same counts as a bar chart.
us_sentiment_totals = us_tweet['sentiment'].value_counts()
print(us_sentiment_totals)
sns.countplot(data=us_tweet, x='sentiment');

In [None]:
# Word cloud representation of the US sentiment analysis: one cloud per
# sentiment class, each built from the text of the tweets with that label.
pos = us_tweet.loc[us_tweet['sentiment'] == 'positive', 'text']
neg = us_tweet.loc[us_tweet['sentiment'] == 'negative', 'text']
neutral = us_tweet.loc[us_tweet['sentiment'] == 'neutral', 'text']

show_wordcloud(pos, title='POSITIVE')
show_wordcloud(neg, title='NEGATIVE')
show_wordcloud(neutral, title='NEUTRAL')