# Twitter sentiment analysis using the NLTK package and a Naive Bayes model
[Tweets related to Covid-19 (2020-07-24 to 2020-08-30)]
------------------

### Importing the Modules

In [None]:
import pandas as pd 
import numpy as np 
from IPython.display import display

import matplotlib.pyplot as plt 
import re
import string

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
nltk.download('stopwords')
nltk.download('vader_lexicon')


from collections import Counter

from matplotlib import pyplot as plt
from matplotlib import ticker
import seaborn as sns
import plotly.express as px

sns.set(style="darkgrid")

### Importing the Dataset

In [None]:
# Load the tweet CSV and preview its first rows.
df = pd.read_csv(
    '../input/covid19-tweets/covid19_tweets.csv',
)
df.head()

let's check the shape of the dataframe

In [None]:
df.shape

let's select the needed columns for our project

In [None]:
# Keep only the columns used by the rest of the notebook.
needed_columns=['user_name','date','text']
# .copy() detaches the selection from the original frame: later cells assign
# to these columns in place, which on a plain slice triggers pandas'
# SettingWithCopyWarning and may not write through reliably.
df=df[needed_columns].copy()
df.head()

change the type of some columns

In [None]:
# Replace each user name with the integer code of its category
# (one distinct code per distinct name).
df['user_name'] = df['user_name'].astype('category').cat.codes
# Parse the timestamps and keep only the calendar-date part.
df['date'] = pd.to_datetime(df['date']).dt.date

In [None]:
df.head(5)

### Picking out the tweet texts

In [None]:
texts=df.text
texts

### Removing URLs from tweets

In [None]:
def remove_url(x):
    """Return ``str(x)`` with every ``http`` + non-whitespace run deleted."""
    as_text = str(x)
    return re.sub(r'http\S+', '', as_text)

texts_lr=texts.apply(remove_url)
texts_lr

### Converting all tweets to lowercase

In [None]:
def to_lower(x):
    """Return *x* converted to lowercase via its ``lower()`` method."""
    return x.lower()

texts_lr_lc=texts_lr.apply(to_lower)
texts_lr_lc

### Removing punctuations

In [None]:
# Deletion table for every character in string.punctuation. Built once here
# instead of on every call, since it never changes between tweets.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_puncs(x):
    """Return *x* with all ``string.punctuation`` characters removed."""
    return x.translate(_PUNCT_TABLE)

texts_lr_lc_np=texts_lr_lc.apply(remove_puncs)
texts_lr_lc_np

### Removing stopwords

In [None]:
more_words=['say','going','like','U','u','#coronavirus', '#coronavirusoutbreak', '#coronavirusPandemic', '#covid19', '#covid_19','coronavirus', 'covid19']
stop_words=set(stopwords.words('english')) #nltk package
stop_words.update(more_words)

# The tweets reaching this step were already lower-cased and stripped of
# punctuation, so entries such as '#covid19', 'U' or "don't" could never match
# a token. Add the same normalised form of every stop word ('covid19', 'u',
# 'dont') so they are actually filtered out. The original spellings are kept.
_stop_word_punct_table = str.maketrans('', '', string.punctuation)
stop_words.update({word.lower().translate(_stop_word_punct_table) for word in stop_words})
stop_words.discard('')  # a stop word made only of punctuation normalises to ''


def remove_words(x):
    """Return *x* with every whitespace-separated token found in ``stop_words`` dropped.

    The remaining tokens are re-joined with single spaces.
    """
    return ' '.join(word for word in x.split() if word not in stop_words)


texts_lr_lc_np_ns=texts_lr_lc_np.apply(remove_words)
r=texts_lr_lc_np_ns  # NOTE(review): alias from the original chained assignment; looks accidental - confirm before removing
texts_lr_lc_np_ns

### Let's build one flat list of words from all the tweets

In [None]:
# Flatten every cleaned tweet into one list of whitespace-separated tokens.
words_list = []
for line in texts_lr_lc_np_ns:
    words_list.extend(line.split())
words_list[:5]

In [None]:
# Tally token frequencies and keep the 50 most frequent (word, count) pairs.
word_counts = Counter(words_list).most_common(50)
word_df = pd.DataFrame(word_counts, columns=['word', 'frq'])
display(word_df.head())
# px is plotly.express
px.bar(word_df, x='word', y='frq', title='Most common words')

### Put the cleaned text back into the main dataframe

In [None]:
# Show the frame before and after swapping in the cleaned tweet text.
display(df.head())
df['text'] = texts_lr_lc_np_ns
display(df.head())

### Additional cleaning

In [None]:
def clean_text(text):
    '''Strip bracketed text, links, tags, punctuation and digit-bearing words.

    The removals run in this order: text inside square brackets,
    ``http(s)://`` and ``www.`` links, angle-bracket tags, every
    ``string.punctuation`` character, newline characters, and any word that
    contains a digit. Letter case is not changed by this function.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Matches are deleted, not replaced, so whitespace
        that surrounded a removed span is left in place.
    '''
    # Raw string literals keep regex escapes such as \[ \S \w \d from being
    # read as (invalid) Python string escapes, which newer interpreters warn on.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Run clean_text over every tweet and show the resulting frame.
df['text'] = df['text'].apply(clean_text)
display(df)

In [None]:
# Compiled once at definition time rather than on every call; the pattern is
# identical for every tweet.
# NOTE: the final range runs from U+24C2 to U+1F251, so it also matches every
# code point in between, not only pictographic symbols.
_EMOJI_PATTERN = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)


def remove_emoji(text):
    """Return *text* with every run of characters in the emoji ranges deleted.

    Args:
        text: The string to filter.

    Returns:
        The string with matching characters removed; other characters,
        including the whitespace around a removed run, are untouched.
    """
    return _EMOJI_PATTERN.sub(r'', text)


In [None]:
# Strip emoji-range characters from every tweet and show the resulting frame.
df['text'] = df['text'].apply(remove_emoji)
display(df)

# Sentiment Analysis 

Getting the polarity scores for each tweet

In [None]:
# One shared analyzer instance scores every tweet.
sid = SentimentIntensityAnalyzer()


def ps(x):
    """Return the analyzer's polarity scores for text *x*."""
    return sid.polarity_scores(x)


sentiment_scores = df['text'].apply(ps)
sentiment_scores

In [None]:
# Expand the per-tweet score dictionaries into one column per score.
score_records = sentiment_scores.tolist()
sentiment_df = pd.DataFrame(score_records)
display(sentiment_df)

### Labeling the scores based on the compound polarity value

In [None]:
def labelize(x, threshold=0.0):
    """Map a compound polarity score to a sentiment label.

    Args:
        x: The compound score to classify.
        threshold: Half-width of the neutral band around zero. With the
            default of 0.0 only an exact zero is neutral, which matches the
            previous behaviour; a larger value treats scores with
            ``abs(x) <= threshold`` as neutral.

    Returns:
        ``'neutral'``, ``'positive'`` or ``'negative'``.
    """
    if abs(x) <= threshold:
        return 'neutral'
    if x > threshold:
        return 'positive'
    return 'negative'

sentiment_df['label']=sentiment_df.compound.apply(labelize)
display(sentiment_df.head(10))

### let's join two dataframes

In [None]:
# Attach the sentiment label column to the tweet frame (joined on the index).
display(df.head())
label_column = sentiment_df['label']
data = df.join(label_column)
display(data.head())

### Plotting the sentiment score counts

In [None]:
# Count tweets per sentiment label. The column names are set explicitly:
# a bare value_counts().reset_index() names its columns differently across
# pandas versions, while the plot below expects 'index' (the label) and
# 'label' (its count).
counts_df=data.label.value_counts().rename_axis('index').reset_index(name='label')
display(counts_df)

In [None]:
# Bar chart of tweet counts, one bar per row of counts_df.
plt.figure(figsize=(8, 5))
sns.barplot(data=counts_df, x='index', y='label')

# Naive Bayes 

In [None]:
# Work on an independent copy: adding the 'length' column to a plain
# column-slice of `data` triggers pandas' SettingWithCopyWarning.
tweets_df=data[['label','text']].copy()
# Character count of each cleaned tweet.
tweets_df['length']=tweets_df['text'].apply(len)
tweets_df

In [None]:
tweets_df.describe()

In [None]:
# hist plot for the length of tweets
tweets_df['length'].plot(bins=100,kind='hist')

In [None]:
# Split out the tweets labelled positive and negative.
positive = tweets_df.loc[tweets_df['label'] == 'positive']
negative = tweets_df.loc[tweets_df['label'] == 'negative']

In [None]:
positive.head()

In [None]:
sentence_as_one_string = " ".join(tweets_df.text)

In [None]:
from wordcloud import WordCloud

# Word cloud built from the text of all tweets.
plt.figure(figsize=(30, 30))
all_tweets_cloud = WordCloud().generate(sentence_as_one_string)
plt.imshow(all_tweets_cloud)

In [None]:
# Word cloud built from the negative-labelled tweets only.
negative_list = negative['text'].tolist()
negative_as_one_string = " ".join(negative_list)

plt.figure(figsize=(30, 30))
negative_cloud = WordCloud().generate(negative_as_one_string)
plt.imshow(negative_cloud)


# Count Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer= CountVectorizer()

sample_data=tweets_df.text[0:3]
X=vectorizer.fit_transform(sample_data)

In [None]:
# get_feature_names() is not available in newer scikit-learn releases, where
# get_feature_names_out() replaces it; fall back to the old name so the cell
# still runs on older installations.
_get_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
print(list(map(str, _get_feature_names())))
print(X.toarray())

# Pipeline: remove punctuation and stop words, tokenize, then count-vectorize

In [None]:
# English stop words, loaded on the first call and reused afterwards.
_ENGLISH_STOPWORDS = None


def message_cleaning(message):
    """Tokenise *message* after dropping punctuation and English stop words.

    Args:
        message: The text to tokenise.

    Returns:
        A list of the whitespace-separated words of *message* (punctuation
        characters removed first) whose lowercase form is not an English
        stop word. The words keep their original case.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loading the list for every word was the dominant cost of this
        # function; a frozenset also makes each membership test O(1).
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    punc_removed_join = ''.join(char for char in message if char not in string.punctuation)
    return [word for word in punc_removed_join.split() if word.lower() not in _ENGLISH_STOPWORDS]

In [None]:
tweets_df_clean=tweets_df.text.apply(message_cleaning)

In [None]:
print(tweets_df_clean[15])

In [None]:
# Fit one vectorizer and keep a reference to it, so its vocabulary stays
# available after the transform.
vectorizer=CountVectorizer(analyzer=message_cleaning,dtype='uint8')
# Keep the document-term matrix sparse: converting it to a dense array
# allocates n_tweets x vocabulary_size cells, and the steps below
# (shape, train_test_split, MultinomialNB) accept sparse input directly.
tweets_countvectorizer=vectorizer.fit_transform(tweets_df.text)

In [None]:
tweets_countvectorizer.shape

In [None]:
X=tweets_countvectorizer
y=tweets_df.label

# Build a Naive Bayes Classifier

In [None]:
from sklearn.model_selection import train_test_split
# A fixed random_state makes the split (and every metric below) reproducible
# across runs; stratify=y keeps the label proportions the same in both parts.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=42,stratify=y)

In [None]:
from sklearn.naive_bayes import MultinomialNB
NB_classifier=MultinomialNB()
NB_classifier.fit(X_train,y_train)

# NB Performance

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Fixed class order for the rows/columns of the confusion matrix.
labels = ['negative', 'neutral','positive']
y_predict_test=NB_classifier.predict(X_test)
# `labels` must be passed by keyword: current scikit-learn accepts only
# y_true and y_pred positionally.
cm=confusion_matrix(y_test,y_predict_test,labels=labels)
# fmt='d' prints integer counts; the tick labels name each class on both axes.
sns.heatmap(cm,annot=True,fmt='d',xticklabels=labels,yticklabels=labels)
print(cm)

In [None]:
import collections
collections.Counter(y_test)

In [None]:
print(classification_report(y_test,y_predict_test))

### Group the tweet counts by
- date
- label (positive, neutral, negative)

In [None]:
data_agg=data[['user_name','date','label']]
display(data_agg.head(5))

In [None]:
data_agg=data_agg.groupby(['date','label'])
display(data_agg.head(5))

In [None]:
data_agg=data_agg.count()
display(data_agg.head(5))

In [None]:
data_agg=data_agg.reset_index()
display(data_agg.head(5))

### After the count, the 'user_name' column holds the number of rows in each (date, label) group, so rename it to 'counts'

In [None]:
data_agg.columns=['date','label','counts']
display(data_agg.head())

In [None]:
px.line(data_agg,x='date',y='counts',color='label',
       title='Daily Tweet Sentimental Analysis')

In [None]:
# Applies remove_emoji to df['text'] again; the same call already ran earlier
# in this notebook. NOTE(review): looks redundant - confirm it can be dropped.
df['text']=df['text'].apply(lambda x: remove_emoji(x))
display(df)