# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import io

In [4]:
import neattext.functions as nfx                       # for cleaning the text
#from textblob import TextBlob


# Natural language toolkit
from nltk.stem import PorterStemmer                          
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import SnowballStemmer

# For extraction of features
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA


# For labelling the data
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

# For sentiment analysis depends on words 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [5]:
# For model
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import joblib

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

### Get the english tweets

In [2]:
tweets = pd.read_csv("small_dataset/English_dataset.csv")

In [5]:
for i in tweets.sample(3).Text:
    print(i)

All 4 suspected cases of Covid19 in Kitui, test -ve.
Though our #Hospitality industry is suffering right now, the takeaway sector is really busy.
This course could set you apart from the rest as it includes a Level 2 qualification! 👉 https://t.co/A1I4vfLr0K @JobSkilla #InspiraForLife #OnlineCourses #Covid19
Downward pressure on #FDI flows caused by the #coronavirus pandemic could range from -30% to -40% in 2020-21, significantly grimmer than previous projections and possibly worse than during the 2008 financial crisis, according to a report from 
@UNCTAD. (File Photo) https://t.co/X41fvvaBtM


#### Fill the **NAN** value to blank space or " "

In [4]:
tweets = tweets.fillna(" ")

# Cleaning and processing tweets

- Removing hashtag 
- Remove userhandles @
- Removing Urls
- Removing stopwords
- Removing multiples spaces and special_characters
- Removing numbers and dates
- Convert into lowercase

In [5]:
def process_tweets(tweets):
    # removing retweets rt word and hashtag only
    #remove_rt = lambda x : re.sub(r'^RT[\s]+',"",x)
    #tweets = tweets.map(remove_rt)
    remove_hashtag = lambda x : re.sub(r'#', '', x)
    tweets = tweets.map(remove_hashtag)
    #tweets = tweets.drop_duplicates(keep=False)                #removing duplicates
    tweets = tweets.apply(nfx.remove_userhandles)              #removing @
    tweets = tweets.apply(nfx.remove_urls)                     #removing urls
    tweets = tweets.apply(nfx.remove_stopwords)                #removing and the are etc.
    tweets = tweets.apply(nfx.remove_special_characters)       #removing !^&#@($|)                #removing punc
    tweets = tweets.apply(nfx.remove_multiple_spaces)          #removing multple_spaces
    tweets = tweets.apply(nfx.remove_currencies)
    tweets = tweets.apply(nfx.remove_numbers)
    tweets = tweets.apply(nfx.remove_dates)
    #tweets = tweets.apply(nfx.remove_emojis)
    tweets = tweets.str.lower()
    return tweets

In [6]:
tweets["Clean_text"] = process_tweets(tweets["Text"])

#### Saving the clean Text

In [7]:
#tweets.to_csv("en_clean_dataset.csv")

# Label the Dataset

- Get the Features from Tfidfvectorizer
- Then Cluster the tweets into 3 groups
- Then get the sentiments of group to find out which group is positive, negative and neutral
- Label the each tweet by sentiment analyzer and their cluster value'

In [90]:
vec1 = TfidfVectorizer(min_df=5, max_df = 0.95, sublinear_tf=True, use_idf =True,max_features=1500)

In [91]:
X_train = vec1.fit_transform(tweets["Clean_text"])

In [92]:
X_train = X_train.toarray()

In [93]:
pd.DataFrame(X_train,columns = vec1.get_feature_names()).sample(10)

Unnamed: 0,able,about,absolutely,access,according,account,accountable,act,action,actions,...,yemenis,yes,yesterday,yet,york,you,young,youre,youtube,zero
322549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
331204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
368119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296804,0.0,0.0
16955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.525723,0.0,0.0,0.0,0.0


## clustering 

In [94]:
km = MiniBatchKMeans(n_clusters=3, init_size=4096, batch_size=4096,max_iter=300)

In [95]:
labels = km.fit_predict(X_train)

In [96]:
tweets["cluster_labels"] = labels

In [97]:
#tweets.sample(10).get(["Clean_text","cluster_labels"])

Unnamed: 0,Clean_text,cluster_labels
201327,trumplimit gatherings people march,0
147521,inside italys hospitals disturbing look corona...,1
227792,shoutouts competition,0
21800,britons fear stranded morocco coronavirus trav...,1
408160,failed america trumpliesaboutcoronavirus trump...,0
178735,people great things like numbers science truth...,0
266000,ga coronavirus numbers rise deaths cases,1
195026,im saying news trump hold rnc convention coron...,1
77829,ssot apparently managed convince entire souths...,2
97326,chris please shed light nyc la amp cities addr...,2


## finding sentiments of cluster

In [98]:
def determine_sentiment(texts):
    analyzer = SentimentIntensityAnalyzer()
    c = 0
    for text in texts:
        #c += 1
        #print(c)
        score = analyzer.polarity_scores(text)
        c += score["compound"]
    return c/len(texts)

In [99]:
scores = [0,0,0]
for i in range(3):
    score = determine_sentiment(tweets[tweets["cluster_labels"] == i].Clean_text)
    scores[i] = score
print(scores)

[0.046592035087515526, -0.027193529358306174, 0.07625511002717983]


In [100]:
for i,v in enumerate(scores):
    print("Cluster ",i,"=",v)

Cluster  0 = 0.046592035087515526
Cluster  1 = -0.027193529358306174
Cluster  2 = 0.07625511002717983


- Highest score gives **positive** sentiment
- Lowest scores gives **negative** sentiment
- Remaining cluster shows **Neutral** sentiment

#### Saving the clustered tweets

In [19]:
#tweets.to_csv("Clustered_tweets.csv")

## label the tweets by sentiment analyzer

In [20]:
tweets.shape

(414545, 6)

### Get the vaderSentiment of individual tweet and check there belonging cluser
- If they match then add them to the final training dataset

In [101]:
analyzer = SentimentIntensityAnalyzer()
final_table = []
for tweet,text,label in zip(tweets.Text,tweets.Clean_text,tweets.cluster_labels):
    score = analyzer.polarity_scores(text)
    if (score["compound"] > 0.5) and (label == 2) :              # text scores > 0.5 and belong to positive cluster
        final_table.append([tweet,text,"pos",1])                 # text scores < 0.5 and belong to positive cluster
    elif (score["compound"] < 0.5) and (label == 1) :
        final_table.append([tweet,text,"neg",-1])
    elif (score["compound"] <= 0.6) and (score["compound"] >= 0.4) and (label == 0) : # text scores <= 0.6 and >= 0.3
        final_table.append([tweet,text,"neutral",0])            #  belong to neutral cluster

In [102]:
df = pd.DataFrame(final_table,columns=["Tweets","Clean_tweets","labels","Result"])

## Counting the labels tweets

In [103]:
print("Positive tweets =",df[df["Result"]==1].shape[0])
print("Negative tweets =",df[df["Result"]==-1].shape[0])
print("Neutral tweets =",df[df["Result"]==0].shape[0])

Positive tweets = 26658
Negative tweets = 103037
Neutral tweets = 22556


In [104]:
print("Total remaing training datasize =",df.shape[0])

Total remaing training datasize = 152251


In [105]:
for i in df[df["labels"]=="pos"].sample(5).Tweets:
    print(i)
    print()

Eight major mobile carriers have agreed to share customer location data with the European Commission in a bid to track the spread of COVID-19, the GSMA said in a statement on Wednesday.
#lockdown #COVID19 
https://t.co/GqDJvgSOXX

It is important to stay active every day while at home over #COVID19 outbreak. @WHO recommends at least 30 minutes #PhysicalActivity for adults and 60 minutes for children. #ActiveChallenge2020 #BeActive https://t.co/oTw1bLXOEC

These are not usual times. 
Our 3D printers are ready to support. 

#COVID19 #Daimler https://t.co/f2uyzn67Jw

Really appreciated this insightful and deeply human story from @AngelaKingKUOW. #COVID19

https://t.co/SnjSyNNnwQ

WEDNESDAY PODCAST: @VancouverFdn's @KevinMcCort, family doctor @DrMZeineddin, BC Today tech contributor and Vancouver Support . ca developer @AWSamuel discuss how people have been helping others amid the #COVID19 outbreak and the need. https://t.co/WORY4RBuWC



In [106]:
#df.to_csv("labelled_tweets.csv")

# Implementing Model 
## Naive bayes

In [107]:
# set the size according to count of label
X = df[df["Result"] == -1][:60000]          #neg
X = X.append(df[df["Result"] == 1][:22000]) #pos
X = X.append(df[df["Result"] == 0][:22000]) #neutral

In [108]:
training_data = df.sample(frac=1)    # mixing the sentiment

In [109]:
x_train,x_test,y_train,y_test = train_test_split(training_data["Clean_tweets"],training_data["Result"], 
                                                 stratify=training_data["Result"], test_size=0.2, random_state=42)

In [3]:
df = pd.read_csv("labelled_tweets.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Tweets,Clean_tweets,labels,Result
0,0,"Such news from SCMP, \nso interesting https://...",news scmp interesting,neutral,0
1,1,Good move but should of been done a couple of ...,good couple weeks,neutral,0
2,2,I hope these ventilators get built somehow. ht...,hope ventilators built somehow,neutral,0
3,3,.@scroll_in has been reporting extensively on ...,reporting extensively coronaviruscrisis follo...,neutral,0
4,4,This is fn preposterous. The richest country i...,fn preposterous richest country world covid ge...,pos,1


getting x_train ,x_test features of their training

In [110]:
vec2 = CountVectorizer(max_features=2000)
x = vec2.fit_transform(x_train).toarray()
x_test = vec2.transform(x_test).toarray()

In [111]:
model = MultinomialNB()
model.fit(x, y_train)

MultinomialNB()

In [112]:
model.score(x_test, y_test)

0.8983613017634888

In [113]:
model.predict(vec2.transform(['I lost my job after the pandemic']))

array([-1], dtype=int64)

In [114]:
model.predict(vec2.transform(['I get my job after the pandemic i am happy']))

array([1], dtype=int64)

In [115]:
model.predict(vec2.transform(['I am sad']))

array([-1], dtype=int64)

In [116]:
model.predict(vec2.transform(['I am happy']))

array([1], dtype=int64)

In [117]:
model.predict(vec2.transform(['Getting food and working in life']))

array([0], dtype=int64)

# Saving the model

In [119]:
# saving count vectorizer 
joblib.dump(vec2, 'Model_Folder/vec.pkl')

['Model_Folder/vec.pkl']

In [120]:
# saving model
joblib.dump(model, 'Model_Folder/model.pkl')

['Model_Folder/model.pkl']