In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")


**English Stopwords**

In [2]:
import nltk
# Fetch the NLTK stop-word corpus (network access is needed the first time).
nltk.download('stopwords')
# A set gives constant-time membership checks when filtering tokens later on.
stop_words=set(stopwords.words('english'))

**Load the Dataset**

In [3]:
# NOTE(review): read_csv uses the first row as the header by default. If this CSV
# has no header row, its first record is consumed as column names and then lost
# when the names are overwritten below -- confirm, and pass header=None if so.
data=pd.read_csv("../input/twitter-sentimental-analysis/bdaproj.csv",encoding='latin-1')

data.columns = ['sentiment','id','date','flag','user','tweet']
# Re-code labels: 4 -> 1 and 0 -> 0. Any other label value becomes NaN.
data.sentiment = data.sentiment.map({4:1,0:0})

In [4]:
# Display the loaded frame.
data

**Remove duplicate tweets**

In [5]:
init_count = len(data)

# keep=False flags *every* copy of a repeated tweet, so all copies are dropped
# (not just the extras), exactly as the original isin() round trip did.
is_repeated = data.tweet.duplicated(keep=False)
tweet_unique = data.tweet[~is_repeated]
records_with_same_tweet = data[is_repeated]
# .copy() makes `data` an independent frame, so later column assignments do not
# write into a slice of the original.
data = data[~is_repeated].copy()
curr_count = len(data)
print(f"{init_count - curr_count} duplicate rows dropped!\nCurrent row count: {curr_count}\n\n")

# value_counts() orders bars by frequency, so the colour list must be paired with
# an explicit label order; otherwise green/red can end up on the wrong class.
class_counts = (
    data.sentiment.map({1:"Positive(1)",0:"Negative(0)"})
    .value_counts()
    .reindex(["Positive(1)", "Negative(0)"], fill_value=0)
)
class_counts.plot(kind='bar', color=['green', 'red'])
plt.title("Data distribution", fontdict={"fontsize": 20})
plt.show()

data

**Drop Unwanted Columns**

In [6]:
# Preview the first five rows.
data.head(5)

In [7]:
# Preview the last five rows.
data.tail(5)

**Preprocessing the Tweet**

In [8]:
# Lowercase every tweet; the cleaning rules applied later are written in lowercase.

data["tweet"]=data["tweet"].str.lower()
data.head(5)

In [9]:
import seaborn as sns
# Visual check for missing values: the heatmap draws one cell per (row, column)
# of the boolean isnull() mask.
sns.heatmap(data.isnull())

**Remove Noise**

In [10]:
# Lookup table mapping lowercase chat abbreviations and symbols to their expansion; decontract() applies it token by token.
abbreviations = {
        "$" : " dollar ",
        "€" : " euro ",
        "4ao" : "for adults only",
        "a.m" : "before midday",
        "a3" : "anytime anywhere anyplace",
        "aamof" : "as a matter of fact",
        "acct" : "account",
        "adih" : "another day in hell",
        "afaic" : "as far as i am concerned",
        "afaict" : "as far as i can tell",
        "afaik" : "as far as i know",
        "afair" : "as far as i remember",
        "afk" : "away from keyboard",
        "app" : "application",
        "approx" : "approximately",
        "apps" : "applications",
        "asap" : "as soon as possible",
        "asl" : "age, sex, location",
        "atk" : "at the keyboard",
        "ave." : "avenue",
        "aymm" : "are you my mother",
        "ayor" : "at your own risk", 
        "b&b" : "bed and breakfast",
        "b+b" : "bed and breakfast",
        "b.c" : "before christ",
        "b2b" : "business to business",
        "b2c" : "business to customer",
        "b4" : "before",
        "b4n" : "bye for now",
        "b@u" : "back at you",
        "bae" : "before anyone else",
        "bak" : "back at keyboard",
        "bbbg" : "bye bye be good",
        "bbc" : "british broadcasting corporation",
        "bbias" : "be back in a second",
        "bbl" : "be back later",
        "bbs" : "be back soon",
        "be4" : "before",
        "bfn" : "bye for now",
        "blvd" : "boulevard",
        "bout" : "about",
        "brb" : "be right back",
        "bros" : "brothers",
        "brt" : "be right there",
        "bsaaw" : "big smile and a wink",
        "btw" : "by the way",
        "bwl" : "bursting with laughter",
        "c/o" : "care of",
        "cet" : "central european time",
        "cf" : "compare",
        "cia" : "central intelligence agency",
        "csl" : "can not stop laughing",
        "cu" : "see you",
        "cul8r" : "see you later",
        "cv" : "curriculum vitae",
        "cwot" : "complete waste of time",
        "cya" : "see you",
        "cyt" : "see you tomorrow",
        "dae" : "does anyone else",
        "dbmib" : "do not bother me i am busy",
        "diy" : "do it yourself",
        "dm" : "direct message",
        "dwh" : "during work hours",
        "e123" : "easy as one two three",
        "eet" : "eastern european time",
        "eg" : "example",
        "embm" : "early morning business meeting",
        "encl" : "enclosed",
        "encl." : "enclosed",
        "etc" : "and so on",
        "faq" : "frequently asked questions",
        "fawc" : "for anyone who cares",
        "fb" : "facebook",
        "fc" : "fingers crossed",
        "fig" : "figure",
        "fimh" : "forever in my heart", 
        "ft." : "feet",
        "ft" : "featuring",
        "ftl" : "for the loss",
        "ftw" : "for the win",
        "fwiw" : "for what it is worth",
        "fyi" : "for your information",
        "g9" : "genius",
        "gahoy" : "get a hold of yourself",
        "gal" : "get a life",
        "gcse" : "general certificate of secondary education",
        "gfn" : "gone for now",
        "gg" : "good game",
        "gl" : "good luck",
        "glhf" : "good luck have fun",
        "gmt" : "greenwich mean time",
        "gmta" : "great minds think alike",
        "gn" : "good night",
        "g.o.a.t" : "greatest of all time",
        "goat" : "greatest of all time",
        "goi" : "get over it",
        "gps" : "global positioning system",
        "gr8" : "great",
        "gratz" : "congratulations",
        "gyal" : "girl",
        "h&c" : "hot and cold",
        "hp" : "horsepower",
        "hr" : "hour",
        "hrh" : "his royal highness",
        "ht" : "height",
        "ibrb" : "i will be right back",
        "ic" : "i see",
        "icq" : "i seek you",
        "icymi" : "in case you missed it",
        "idc" : "i do not care",
        "idgadf" : "i do not give a damn fuck",
        "idgaf" : "i do not give a fuck",
        "idk" : "i do not know",
        "ie" : "that is",
        "i.e" : "that is",
        "ifyp" : "i feel your pain",
        "ig" : "instagram",
        "iirc" : "if i remember correctly",
        "ilu" : "i love you",
        "ily" : "i love you",
        "imho" : "in my humble opinion",
        "imo" : "in my opinion",
        "imu" : "i miss you",
        "iow" : "in other words",
        "irl" : "in real life",
        "j4f" : "just for fun",
        "jic" : "just in case",
        "jk" : "just kidding",
        "jsyk" : "just so you know",
        "l8r" : "later",
        "lb" : "pound",
        "lbs" : "pounds",
        "ldr" : "long distance relationship",
        "lmao" : "laugh my ass off",
        "lmfao" : "laugh my fucking ass off",
        "lol" : "laughing out loud",
        "ltd" : "limited",
        "ltns" : "long time no see",
        "m8" : "mate",
        "mf" : "motherfucker",
        "mfs" : "motherfuckers",
        "mfw" : "my face when",
        "mofo" : "motherfucker",
        "mph" : "miles per hour",
        "mr" : "mister",
        "mrw" : "my reaction when",
        "ms" : "miss",
        "mte" : "my thoughts exactly",
        "nagi" : "not a good idea",
        "nbc" : "national broadcasting company",
        "nbd" : "not big deal",
        "nfs" : "not for sale",
        "ngl" : "not going to lie",
        "nhs" : "national health service",
        "nrn" : "no reply necessary",
        "nsfl" : "not safe for life",
        "nsfw" : "not safe for work",
        "nth" : "nice to have",
        "nvr" : "never",
        "nyc" : "new york city",
        "oc" : "original content",
        "og" : "original",
        "ohp" : "overhead projector",
        "oic" : "oh i see",
        "omdb" : "over my dead body",
        "omg" : "oh my god",
        "omw" : "on my way",
        "p.a" : "per annum",
        "p.m" : "after midday",
        "pm" : "prime minister",
        "poc" : "people of color",
        "pov" : "point of view",
        "pp" : "pages",
        "ppl" : "people",
        "prw" : "parents are watching",
        "ps" : "postscript",
        "pt" : "point",
        "ptb" : "please text back",
        "pto" : "please turn over",
        "qpsa" : "what happens", 
        "ratchet" : "rude",
        "rbtl" : "read between the lines",
        "rlrt" : "real life retweet", 
        "rofl" : "rolling on the floor laughing",
        "roflol" : "rolling on the floor laughing out loud",
        "rotflmao" : "rolling on the floor laughing my ass off",
        "rt" : "retweet",
        "ruok" : "are you ok",
        "sfw" : "safe for work",
        "sk8" : "skate",
        "smh" : "shake my head",
        "sq" : "square",
        "srsly" : "seriously", 
        "ssdd" : "same stuff different day",
        "tbh" : "to be honest",
        "tbs" : "tablespooful",
        "tbsp" : "tablespooful",
        "tfw" : "that feeling when",
        "thks" : "thank you",
        "tho" : "though",
        "thx" : "thank you",
        "tia" : "thanks in advance",
        "til" : "today i learned",
        "tl;dr" : "too long i did not read",
        "tldr" : "too long i did not read",
        "tmb" : "tweet me back",
        "tntl" : "trying not to laugh",
        "ttyl" : "talk to you later",
        "u" : "you",
        "u2" : "you too",
        "u4e" : "yours for ever",
        "utc" : "coordinated universal time",
        "w/" : "with",
        "w/o" : "without",
        "w8" : "wait",
        "wassup" : "what is up",
        "wb" : "welcome back",
        "wtf" : "what the fuck",
        "wtg" : "way to go",
        "wtpa" : "where the party at",
        "wuf" : "where are you from",
        "wuzup" : "what is up",
        "wywh" : "wish you were here",
        "yd" : "yard",
        "ygtr" : "you got that right",
        "ynk" : "you never know",
        "zzz" : "sleeping bored and tired"
    }

# Garbled byte sequences found in the raw tweets. The first entry is the garbled
# apostrophe: it is turned into a real "'" (instead of being deleted) so that the
# contraction rules below can still recognise e.g. "it\x89Ûªs".
_MOJIBAKE_RULES = (
    (r"\x89Ûª", "'"),
    (r"\x89Û_", ""),
    (r"\x89ÛÒ", ""),
    (r"\x89ÛÓ", ""),
    (r"\x89ÛÏ", ""),
    (r"\x89Û÷", ""),
    (r"\x89Û\x9d", ""),
    (r"\x89Û¢åÊ", ""),
    (r"\x89Û¢", ""),
    (r"å_", ""),
    (r"åÊ", ""),
    (r"åÈ", ""),
    (r"mÌ¼sica", "music"),
    (r"Ì©", "e"),
    (r"å¨", ""),
    (r"åÇ", ""),
    (r"åÀ", ""),
)

# Contractions. Irregular forms come first, then apostrophe-less spellings (word
# boundaries stop "cant" from firing inside "significant"), then possessives,
# and finally the generic suffix rules that cover every regular contraction.
_CONTRACTION_RULES = (
    (r"donå«t", "do not"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"ain't", "am not"),
    (r"let's", "let us"),
    (r"y'all", "you all"),
    (r"\bwont\b", "will not"),
    (r"\bcant\b", "cannot"),
    (r"\bwasnt\b", "was not"),
    (r"\bdont\b", "do not"),
    (r"\bdidnt\b", "did not"),
    (r"\byouve\b", "you have"),
    (r"\bshoulda\b", "should have"),
    (r"^ill$", "i will"),
    (r"mother's", "mother"),
    (r"mom's", "mom"),
    (r"dad's", "dad"),
    (r"n't", " not"),
    (r"'re", " are"),
    (r"'s", " is"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'t", " not"),
    (r"'ve", " have"),
    (r"'m", " am"),
)

# Slang, shorthand and elongated words. Rules that would otherwise match inside
# ordinary words ("soon", "away", "evolving", "roflol") are anchored with \b.
_SLANG_RULES = (
    (r"some1", "someone"),
    (r"\byrs\b", "years"),
    (r"\bhrs\b", "hours"),
    (r"\b2(?:morrow|morow|moro)\b", "tomorrow"),
    (r"2day", "today"),
    (r"\b4got(?:ten)?\b", "forget"),
    (r"b-day|bday", "b-day"),
    (r"\ba?(?:ha){2,}h?\b", "haha"),
    (r"\b(?:lmao|lolz|rofl)\b", "lol"),
    (r"thanx|thnx|thx", "thanks"),
    (r"all[l]+", "all"),
    (r"\bso{2,}\b", "so"),
    (r"\baw+\b", "awww"),
    (r"why[y]+", "why"),
    (r"way[y]+", "way"),
    (r"will[l]+", "will"),
    (r"oo[o]+h", "ooh"),
    (r"hey[y]+", "hey"),
    (r"boo[o]+m", "boom"),
    (r"co[o]+ld", "cold"),
    (r"goo[o]+d", "good"),
    (r"luckigrrl", "lucky girl"),
    (r"\bevolvin\b", "evolving"),
    (r"\bw/e\b", "whatever"),
    (r"usagov", "usa government"),
    (r"trfc", "traffic"),
)


def decontract(text, abbreviation_map=None):
    """Normalise one (already lowercased) tweet into plain, punctuation-free text.

    Steps, in order:
      1. drop ``http...`` URLs;
      2. repair or remove garbled byte sequences;
      3. expand contractions;
      4. normalise slang and elongated words;
      5. decode ``&gt;``/``&lt;``/``&amp;`` and turn ``<3`` into "love";
      6. drop a leading ``@mention`` and remaining ``@``/``#`` characters;
      7. replace runs of dots with a space so the words around them stay separate;
      8. expand abbreviations token by token;
      9. strip all remaining punctuation and collapse whitespace.

    Steps 5-7 run *before* punctuation is stripped; run afterwards they could
    never match anything.

    Parameters
    ----------
    text : str
        The tweet text.
    abbreviation_map : dict, optional
        Token -> expansion lookup. Defaults to the module-level ``abbreviations``.

    Returns
    -------
    str
        The cleaned text, with tokens separated by single spaces.
    """
    if abbreviation_map is None:
        abbreviation_map = abbreviations

    # URLs go first so no later rule can split one into leftover fragments.
    text = re.sub(r"http\S+", "", text)

    for pattern, replacement in _MOJIBAKE_RULES + _CONTRACTION_RULES + _SLANG_RULES:
        text = re.sub(pattern, replacement, text)

    # Character entity references; "&amp;" is decoded last to avoid double decoding.
    for entity, char in (("&gt;", ">"), ("&lt;", "<"), ("&amp;", "&")):
        text = text.replace(entity, char)
    text = text.replace("<3", "love")

    # A leading @mention is removed entirely; for any other mention or hashtag
    # only the marker character is dropped and the word is kept.
    text = re.sub(r"^@[0-9a-zA-Z_]+", "", text)
    text = text.replace("@", "").replace("#", "")

    # "wait...what" must not collapse into "waitwhat" once punctuation is removed.
    text = re.sub(r"\.{2,}", " ", text)

    # Expand abbreviations: try the raw token first (keys such as "w/" contain
    # punctuation), then the token with surrounding punctuation stripped ("omg!").
    tokens = []
    for word in text.split():
        if word in abbreviation_map:
            tokens.append(abbreviation_map[word])
            continue
        bare = word.strip(string.punctuation)
        tokens.append(abbreviation_map.get(bare, bare))
    text = " ".join(tokens)

    # Remove every remaining punctuation character, then collapse whitespace.
    text = text.translate(str.maketrans("", "", string.punctuation))
    return " ".join(text.split())

def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``stop_words``.

    The input is coerced to ``str``, split on whitespace, filtered, and
    re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, str(text).split())
    return " ".join(kept_tokens)

def lemmatize_data(text):
    """Lemmatize each whitespace-separated token of ``text`` with WordNet.

    Returns the lemmatized tokens joined with single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return " ".join(map(lemmatize, str(text).split()))

def remove_repeating_char(text):
    """Collapse every run of one repeated character into a single occurrence."""
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(lambda run: run.group(1), text)

def remove_email(text):
    """Replace every ``@`` followed by non-whitespace characters with a space.

    Despite the name, this removes ``@mentions`` as well as the domain part of
    e-mail addresses (everything from ``@`` up to the next whitespace).
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return re.sub(r'@[^\s]+', ' ', text)

def remove_URLs(text):
    """Replace ``www.`` and ``http(s)://`` URLs with a single space."""
    # Raw string: '\.' and '\s' in a plain literal are invalid escape sequences
    # and trigger a SyntaxWarning on recent Python versions.
    return re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)

def remove_numbers(text):
    """Delete every run of ASCII digits from ``text``."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', text)

def clean_tweets(text):
    """Run the full cleaning pipeline on one tweet and return the cleaned text.

    URLs and @mentions are removed *first*. decontract() strips '@' and all
    punctuation, so running remove_URLs()/remove_email() after it (as before)
    could never match anything and left usernames in the text.
    """
    text = remove_URLs(text)
    text = remove_email(text)
    text = decontract(text)
    # NOTE(review): NLTK's English stop-word list is believed to contain
    # negations such as "not"; dropping them may hurt sentiment -- confirm.
    text = remove_stopwords(text)
    return text

init_rowcount = len(data)
data['tokens'] = data.tweet.apply(clean_tweets)
# Count whitespace-separated words. The message below reports "words", whereas
# .str.len() on the string counted characters.
data['length'] = data.tokens.str.split().str.len()
# Drop tweets that have no words left after cleaning.
data = data[data.length != 0]
curr_rowcount = len(data)
# Report the arithmetic mean; the previous value was the mode, labelled "Avg".
avg_tweet_length = data.length.mean()
min_tweet_length = data.length.min()
print(f"functions applied!\n\n{init_rowcount - curr_rowcount} rows with zero-length tweets dropped!\nMin tweet length observed: {min_tweet_length} words\nAvg tweet length observed: {avg_tweet_length:.1f} words")

In [11]:
# Inspect the new `tokens` and `length` columns.
data.head()

In [12]:
# data = data[['sentiment', 'tokens']]
# data.rename(columns={'tokens': 'tweet'}, inplace=True)
# data.to_csv()

In [13]:
# Keep only the label and the cleaned text, exposing the latter as `tweet`.
# rename() returns a new frame here rather than mutating a slice in place.
data = data[['sentiment', 'tokens']].rename(columns={'tokens': 'tweet'})
data.head()

**Wordcloud by class (positive/Negative)**

In [14]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [15]:
plt.figure(figsize = (15,15))
# Word cloud built from all cleaned tweets labelled positive (sentiment == 1),
# concatenated into one string.
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(data[data.sentiment == 1].tweet))
plt.imshow(wc , interpolation = 'bilinear')
plt.title('Tweets positifs')

In [16]:
plt.figure(figsize = (15,15))
# Word cloud built from all cleaned tweets labelled negative (sentiment == 0),
# concatenated into one string.
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(data[data.sentiment == 0].tweet))
plt.imshow(wc , interpolation = 'bilinear')
plt.title('Tweets négatifs')

**Splitting dataset in train and test**

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the tweets for testing; random_state fixes the shuffle so the
# split is reproducible. The split is not stratified by label.
x_train,x_test,y_train,y_test=train_test_split(data['tweet'].values, data['sentiment'].values, test_size=0.25, random_state=1)
print("x_train : ", x_train.shape)
print("y_train : ", y_train.shape)
print("x_test : ", x_test.shape)
print("y_test : ", y_test.shape)

**Vectorization with CountVectorizer and classification**

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer=CountVectorizer()

# Learn the vocabulary on the training tweets only, then reuse it for the test
# tweets so no test information leaks into the features.
countVec_train=count_vectorizer.fit_transform(x_train)
countVec_test=count_vectorizer.transform(x_test)
print("count_train : ", countVec_train.shape)
print("count_test : ", countVec_test.shape)

**Random Forest**

In [19]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn import metrics
# import seaborn as sns


# rf_clf_countVec = RandomForestClassifier(n_estimators = 100)
# rf_clf_countVec = rf_clf_countVec.fit(countVec_train, y_train)
# y_pred = rf_clf_countVec.predict(countVec_test)

# print("Accuracy:",metrics.accuracy_score(y_test,y_pred))
# print("precision:",metrics.precision_score(y_test,y_pred))
# print("recall:",metrics.recall_score(y_test,y_pred))
# print("Confusion Matrix")
# sns.heatmap(metrics.confusion_matrix(y_test,y_pred),annot=True)

# from sklearn.neighbors import KNeighborsClassifier
# from sklearn import metrics
# import seaborn as sns

# np.random.seed(0)
# svm_clf_countVec = #svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# svm_clf_countVec = svm_clf_countVec.fit(countVec_train, y_train)
# y_pred = svm_clf_countVec.predict(countVec_test)

# print("Accuracy:",metrics.accuracy_score(y_test,y_pred))
# print("precision:",metrics.precision_score(y_test,y_pred))
# print("recall:",metrics.recall_score(y_test,y_pred))
# print("Confusion Matrix")
# sns.heatmap(metrics.confusion_matrix(y_test,y_pred),annot=True)

**MultinomialNB**

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import seaborn as sns


# Multinomial naive Bayes on the bag-of-words counts.
nb_clf_countVec =MultinomialNB()
nb_clf_countVec = nb_clf_countVec.fit(countVec_train, y_train)
# Hard 0/1 class predictions for the held-out tweets.
y_pred_nb_clf_countVec = nb_clf_countVec.predict(countVec_test)

print("Accuracy:",metrics.accuracy_score(y_test,y_pred_nb_clf_countVec))
print("precision:",metrics.precision_score(y_test,y_pred_nb_clf_countVec))
print("recall:",metrics.recall_score(y_test,y_pred_nb_clf_countVec))
print("Confusion Matrix")
sns.heatmap(metrics.confusion_matrix(y_test,y_pred_nb_clf_countVec),annot=True)

**Logistic Regression**

In [21]:
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn import metrics

# Logistic regression (liblinear solver) on the bag-of-words counts.
lr_clf_countVec = LogisticRegression(max_iter=100, solver='liblinear')
lr_clf_countVec = lr_clf_countVec.fit(countVec_train, y_train)
# Hard 0/1 class predictions for the held-out tweets.
y_pred_lr_clf_countVec = lr_clf_countVec.predict(countVec_test)

print("Accuracy:",metrics.accuracy_score(y_test,y_pred_lr_clf_countVec))
print("precision:",metrics.precision_score(y_test,y_pred_lr_clf_countVec))
print("recall:",metrics.recall_score(y_test,y_pred_lr_clf_countVec))
print("Confusion Matrix")
sns.heatmap(metrics.confusion_matrix(y_test,y_pred_lr_clf_countVec),annot=True)

**Calculating False Positive Rate(fpr), True Positive Rate(tpr) , threshold for different models using roc curve**

In [22]:
from sklearn.metrics import roc_curve

# roc_curve sweeps a threshold over continuous scores. Hard 0/1 predictions give
# only a single operating point, so use the positive-class probability
# (column 1 of predict_proba, since the classes are [0, 1]).
y_score_nb_clf_countVec = nb_clf_countVec.predict_proba(countVec_test)[:, 1]
y_score_lr_clf_countVec = lr_clf_countVec.predict_proba(countVec_test)[:, 1]

fpr1, tpr1, thresh1 = roc_curve(y_test, y_score_nb_clf_countVec, pos_label=1)
fpr2, tpr2, thresh2 = roc_curve(y_test, y_score_lr_clf_countVec, pos_label=1)

# Constant scores produce the diagonal "no skill" reference line.
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

**Calculating ROC AUC scores for all models**

In [23]:
from sklearn.metrics import roc_auc_score

# AUC must be computed from continuous scores (positive-class probabilities);
# with hard 0/1 predictions it degenerates to a single-threshold value.
# Order: [MultinomialNB, LogisticRegression].
Roc = [
    roc_auc_score(y_test, nb_clf_countVec.predict_proba(countVec_test)[:, 1]),
    roc_auc_score(y_test, lr_clf_countVec.predict_proba(countVec_test)[:, 1]),
]

print(*Roc)

**Plotting all the ROC curves**

In [24]:
import matplotlib.pyplot as plt
# The bundled 'seaborn' style was renamed 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name was removed in 3.8; fall back to the old name on older releases.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='MultinomialNB')
plt.plot(fpr2, tpr2, linestyle='--',color='green', label='Logistic Regression')

# No-skill reference line.
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')

plt.title('ROC curve')

plt.xlabel('False Positive Rate')

plt.ylabel('True Positive rate')

plt.legend(loc='best')
# NOTE(review): the other ROC plot cells also save to 'ROC', so each run
# overwrites the previous image.
plt.savefig('ROC',dpi=300)
plt.show();

**Predicting best model based on roc score**

In [25]:
# Roc holds [MultinomialNB, LogisticRegression]; index 0 winning means naive Bayes.
print("MultinomialNB performs best" if Roc.index(max(Roc)) == 0 else "LogisticRegression performs best")

**Vectorization with TF-IDF (Term Frequency–Inverse Document Frequency) and classification**

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer=TfidfVectorizer()

# Fit vocabulary and IDF weights on the training tweets only, then apply the
# same transformation to the test tweets.
tfidf_train=tfidf_vectorizer.fit_transform(x_train)
tfidf_test=tfidf_vectorizer.transform(x_test)
print("tfidf_train : ", tfidf_train.shape)
print("tfidf_test : ", tfidf_test.shape)

**Random Forest**

In [27]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn import metrics
# import seaborn as sns

# rf_clf_tfidf = RandomForestClassifier(n_estimators = 100)
# rf_clf_tfidf = rf_clf_tfidf.fit(tfidf_train, y_train)
# y_pred = rf_clf_tfidf.predict(tfidf_test)

# print("Accuracy:",metrics.accuracy_score(y_test,y_pred))
# print("precision:",metrics.precision_score(y_test,y_pred))
# print("recall:",metrics.recall_score(y_test,y_pred))
# print("Confusion Matrix")
# sns.heatmap(metrics.confusion_matrix(y_test,y_pred),annot=True)

# from sklearn import svm
# from sklearn import metrics
# import seaborn as sns

# np.random.seed(0)
# svm_clf_tfidf = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# svm_clf_tfidf = svm_clf_tfidf.fit(tfidf_train, y_train)
# y_pred = svm_clf_tfidf.predict(tfidf_test)

# print("Accuracy:",metrics.accuracy_score(y_test,y_pred))
# print("precision:",metrics.precision_score(y_test,y_pred))
# print("recall:",metrics.recall_score(y_test,y_pred))
# print("Confusion Matrix")
# sns.heatmap(metrics.confusion_matrix(y_test,y_pred),annot=True)

**Multinomial NB**

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import seaborn as sns

# Multinomial naive Bayes on the TF-IDF features.
nb_clf_tfidf = MultinomialNB()
nb_clf_tfidf = nb_clf_tfidf.fit(tfidf_train, y_train)
# Hard 0/1 class predictions for the held-out tweets.
y_pred_nb_clf_tfidf = nb_clf_tfidf.predict(tfidf_test)

print("Accuracy:",metrics.accuracy_score(y_test,y_pred_nb_clf_tfidf))
print("precision:",metrics.precision_score(y_test,y_pred_nb_clf_tfidf))
print("recall:",metrics.recall_score(y_test,y_pred_nb_clf_tfidf))
print("Confusion Matrix")
sns.heatmap(metrics.confusion_matrix(y_test,y_pred_nb_clf_tfidf),annot=True)

**Logistic Regression**

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns

# Logistic regression (liblinear solver) on the TF-IDF features.
lr_clf_tfidf = LogisticRegression(max_iter=100, solver='liblinear')
lr_clf_tfidf = lr_clf_tfidf.fit(tfidf_train, y_train)
# Hard 0/1 class predictions for the held-out tweets.
y_pred_lr_clf_tfidf = lr_clf_tfidf.predict(tfidf_test)

print("Accuracy:",metrics.accuracy_score(y_test,y_pred_lr_clf_tfidf))
print("precision:",metrics.precision_score(y_test,y_pred_lr_clf_tfidf))
print("recall:",metrics.recall_score(y_test,y_pred_lr_clf_tfidf))
print("Confusion Matrix")
sns.heatmap(metrics.confusion_matrix(y_test,y_pred_lr_clf_tfidf),annot=True)

**Calculating False Positive Rate(fpr), True Positive Rate(tpr) , threshold for different models using roc curve**

In [30]:
from sklearn.metrics import roc_curve

# roc_curve sweeps a threshold over continuous scores. Hard 0/1 predictions give
# only a single operating point, so use the positive-class probability
# (column 1 of predict_proba, since the classes are [0, 1]).
y_score_nb_clf_tfidf = nb_clf_tfidf.predict_proba(tfidf_test)[:, 1]
y_score_lr_clf_tfidf = lr_clf_tfidf.predict_proba(tfidf_test)[:, 1]

fpr1, tpr1, thresh1 = roc_curve(y_test, y_score_nb_clf_tfidf, pos_label=1)
fpr2, tpr2, thresh2 = roc_curve(y_test, y_score_lr_clf_tfidf, pos_label=1)

# Constant scores produce the diagonal "no skill" reference line.
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

**Calculating ROC AUC scores for all models**

In [31]:
from sklearn.metrics import roc_auc_score

# AUC must be computed from continuous scores (positive-class probabilities);
# with hard 0/1 predictions it degenerates to a single-threshold value.
# Order: [MultinomialNB, LogisticRegression].
Roc = [
    roc_auc_score(y_test, nb_clf_tfidf.predict_proba(tfidf_test)[:, 1]),
    roc_auc_score(y_test, lr_clf_tfidf.predict_proba(tfidf_test)[:, 1]),
]

print(*Roc)

**Plotting all the ROC curves**

In [32]:
import matplotlib.pyplot as plt
# The bundled 'seaborn' style was renamed 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name was removed in 3.8; fall back to the old name on older releases.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='MultinomialNB')
plt.plot(fpr2, tpr2, linestyle='--',color='green', label='Logistic Regression')

# No-skill reference line.
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')

plt.title('ROC curve')

plt.xlabel('False Positive Rate')

plt.ylabel('True Positive rate')

plt.legend(loc='best')
# NOTE(review): the other ROC plot cells also save to 'ROC', so each run
# overwrites the previous image.
plt.savefig('ROC',dpi=300)
plt.show();

**Predicting best model based on roc score**

In [33]:
# Roc holds [MultinomialNB, LogisticRegression]; index 0 winning means naive Bayes.
print("MultinomialNB performs best" if Roc.index(max(Roc)) == 0 else "LogisticRegression performs best")

**Vectorization with Word2Vec trained with Gensim on this corpus**

In [34]:
# Upgrade gensim to the latest release and show which version is active.
# NOTE(review): the version is not pinned, so results may differ between runs on
# different dates; `%pip` would target the kernel's own environment -- confirm.
! pip install --upgrade gensim
import gensim
print(gensim.__version__)

In [35]:
from gensim.models import Word2Vec

class MyCorpus:
    """Re-iterable corpus: each pass yields every cleaned tweet as a list of tokens."""

    def __iter__(self):
        # Reads the module-level `data` frame on every pass, so the corpus can
        # be iterated more than once.
        return (tweet.split() for tweet in data['tweet'].values)

sentences = MyCorpus()
# Tokens seen fewer than 10 times are dropped; embeddings have 100 dimensions.
# NOTE(review): the corpus iterates over all of `data`, i.e. train and test
# tweets alike, and no seed is passed -- confirm this is intended.
trained_w2v = Word2Vec(sentences=sentences, min_count=10, vector_size=100)
print(trained_w2v)

In [36]:
from sklearn.decomposition import IncrementalPCA
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import random
np.random.seed(0)

def visualize_embeddings(wv, num_labels=25):
    """Project word vectors to 2-D with t-SNE and scatter-plot them.

    Parameters
    ----------
    wv : object exposing ``vectors`` and ``index_to_key``
        The word vectors to plot (e.g. ``trained_w2v.wv``).
    num_labels : int, default 25
        How many randomly chosen points get a text label. It is capped at the
        vocabulary size, because ``random.sample`` raises ``ValueError`` when
        asked for more items than exist.
    """
    num_dimensions = 2

    vectors = np.asarray(wv.vectors)
    labels = np.asarray(wv.index_to_key)

    tsne = TSNE(n_components=num_dimensions, random_state=0)
    embedded = tsne.fit_transform(vectors)

    x_vals = [point[0] for point in embedded]
    y_vals = [point[1] for point in embedded]

    # Fixed seed so the same words are labelled on every run.
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    indices = list(range(len(labels)))
    sample_size = min(num_labels, len(indices))
    for i in random.sample(indices, sample_size):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

In [37]:
# Plot the embeddings learned on this corpus.
visualize_embeddings(trained_w2v.wv)

**Sentences embeddings**

In [38]:
def w2v_of_sentences(sentences, wv):
    """Embed each sentence as the average of its word vectors.

    Parameters
    ----------
    sentences : sequence of str
        Whitespace-tokenised sentences.
    wv : mapping-like with a ``vector_size`` attribute
        ``wv[token]`` returns the token's vector and raises ``KeyError`` for
        out-of-vocabulary tokens.

    Returns
    -------
    numpy.ndarray of shape (len(sentences), wv.vector_size)
        Row ``i`` is the sum of the known tokens' vectors divided by the
        *total* token count of sentence ``i`` (unknown tokens contribute
        zeros). Sentences without tokens map to the zero vector.
    """
    vectors = np.zeros((len(sentences), wv.vector_size))
    for i, sentence in enumerate(sentences):
        tokens = sentence.split()
        if not tokens:
            continue  # row is already all zeros
        sum_vector = np.zeros(wv.vector_size)
        for token in tokens:
            try:
                sum_vector += wv[token]
            except KeyError:
                # Only out-of-vocabulary tokens are skipped; any other error
                # (e.g. a shape mismatch) is no longer silently swallowed.
                continue
        vectors[i] = sum_vector / len(tokens)
    return vectors

In [39]:
# Sentence embeddings built from the Word2Vec model trained on this corpus.
w2v_train=w2v_of_sentences(x_train, trained_w2v.wv)
w2v_test=w2v_of_sentences(x_test, trained_w2v.wv)
print("w2v_train : ", w2v_train.shape)
print("w2v_test : ", w2v_test.shape)

**Logistic Regression**

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns

# Logistic regression (liblinear solver) on the averaged Word2Vec embeddings.
lr_clf_trained_w2v = LogisticRegression(max_iter=100, solver='liblinear')
lr_clf_trained_w2v = lr_clf_trained_w2v.fit(w2v_train, y_train)
# Hard 0/1 class predictions for the held-out tweets.
y_pred_lr_clf_trained_w2v = lr_clf_trained_w2v.predict(w2v_test)

print("Accuracy:",metrics.accuracy_score(y_test,y_pred_lr_clf_trained_w2v))
print("precision:",metrics.precision_score(y_test,y_pred_lr_clf_trained_w2v))
print("recall:",metrics.recall_score(y_test,y_pred_lr_clf_trained_w2v))
print("Confusion Matrix")
sns.heatmap(metrics.confusion_matrix(y_test,y_pred_lr_clf_trained_w2v),annot=True)

**Support Vector Machine**

In [41]:
# from sklearn.naive_bayes import RandomForestClassifier
# from sklearn import metrics
# import seaborn as sns

# rf_clf_trained_w2v = RandomForestClassifier(n_estimators = 100)
# rf_clf_trained_w2v = rf_clf_trained_w2v.fit(w2v_train, y_train)
# y_pred = rf_clf_trained_w2v.predict(w2v_test)

# print("Accuracy:",metrics.accuracy_score(y_test,y_pred))
# print("precision:",metrics.precision_score(y_test,y_pred))
# print("recall:",metrics.recall_score(y_test,y_pred))
# print("Confusion Matrix")
# sns.heatmap(metrics.confusion_matrix(y_test,y_pred),annot=True)

# from sklearn import svm
# from sklearn import metrics
# import seaborn as sns

# np.random.seed(0)
# svm_clf_trained_w2v = svm.SVC(C=1.0, kernel='linear')
# svm_clf_trained_w2v = svm_clf_trained_w2v.fit(w2v_train, y_train)
# y_pred_svm_clf_trained_w2v = svm_clf_trained_w2v.predict(w2v_test)

# print("Accuracy:",metrics.accuracy_score(y_test,y_pred_svm_clf_trained_w2v))
# print("precision:",metrics.precision_score(y_test,y_pred_svm_clf_trained_w2v))
# print("recall:",metrics.recall_score(y_test,y_pred_svm_clf_trained_w2v))
# print("Confusion Matrix")
# sns.heatmap(metrics.confusion_matrix(y_test,y_pred_svm_clf_trained_w2v),annot=True)


**Calculating False Positive Rate(fpr), True Positive Rate(tpr) , threshold for different models using roc curve**

In [42]:
from sklearn.metrics import roc_curve

# roc_curve sweeps a threshold over continuous scores. Hard 0/1 predictions give
# only a single operating point, so use the positive-class probability
# (column 1 of predict_proba, since the classes are [0, 1]).
y_score_lr_clf_trained_w2v = lr_clf_trained_w2v.predict_proba(w2v_test)[:, 1]

fpr1, tpr1, thresh1 = roc_curve(y_test, y_score_lr_clf_trained_w2v, pos_label=1)
# fpr2, tpr2, thresh2 = roc_curve(y_test, y_pred_svm_clf_trained_w2v, pos_label=1)

# Constant scores produce the diagonal "no skill" reference line.
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

**Calculating ROC AUC scores for all models**

In [43]:
from sklearn.metrics import roc_auc_score

# AUC must be computed from continuous scores (positive-class probabilities);
# with hard 0/1 predictions it degenerates to a single-threshold value.
Roc = [
    roc_auc_score(y_test, lr_clf_trained_w2v.predict_proba(w2v_test)[:, 1]),
]
#Roc.append(roc_auc_score(y_test, y_pred_svm_clf_trained_w2v))

print(*Roc)

**Plotting all the ROC curves**

In [44]:
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and newer
# releases no longer accept the old name, so use whichever this install offers.
roc_plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(roc_plot_style)

plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Logistic Regression')
# plt.plot(fpr2, tpr2, linestyle='--',color='green', label='Support Vector Machine')

# Unlabelled reference line built from constant scores
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')

plt.title('ROC curve')

plt.xlabel('False Positive Rate')

plt.ylabel('True Positive rate')

plt.legend(loc='best')
# The GloVe section later saves its figure as 'ROC' as well; a distinct file name
# keeps this Word2Vec plot from being overwritten.
plt.savefig('ROC_word2vec',dpi=300)
plt.show();

**Selecting the best model based on ROC AUC score**

In [45]:
# Roc is ordered [Logistic Regression, SVM]; announce whichever has the top score.
top_model_position = Roc.index(max(Roc))
print("LogisticRegression performs best" if top_model_position == 0 else "SVM performs best")

**Vectorization with pre-trained word embeddings (GloVe)**

Pre-trained GloVe models built from tweets are available at https://nlp.stanford.edu/data/glove.twitter.27B.zip. These models were trained with a vocabulary of 1.2 million tokens.

After decompressing the archive, I test the model whose vectors have 50 dimensions.

I load the model with Gensim's `load_word2vec_format` function, passing `no_header=True`.

**Loading word embeddings with Gensim**

In [46]:
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip glove.twitter.27B.zip

In [47]:
# Read the 50-dimensional GloVe Twitter vectors as gensim KeyedVectors.
# no_header=True tells gensim the text file has no leading header line.
pretrained_glove_w2v = gensim.models.KeyedVectors.load_word2vec_format(
    fname='glove.twitter.27B.50d.txt',
    binary=False,
    no_header=True,
)

In [48]:
# Build sentence-level feature matrices for both splits with the GloVe vectors.
pretrained_glove_w2v_train, pretrained_glove_w2v_test = (
    w2v_of_sentences(split, pretrained_glove_w2v) for split in (x_train, x_test)
)
print("w2v_train : ", pretrained_glove_w2v_train.shape)
print("w2v_test : ", pretrained_glove_w2v_test.shape)

**Logistic Regression**

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns

# Train a liblinear logistic regression on the GloVe features. fit() returns
# the estimator itself, so construction and training are chained.
lr_clf_pretrained_glove_w2v = LogisticRegression(
    max_iter=100,
    solver='liblinear',
).fit(pretrained_glove_w2v_train, y_train)
y_pred_lr_clf_pretrained_glove_w2v = lr_clf_pretrained_glove_w2v.predict(
    pretrained_glove_w2v_test
)

# Headline metrics on the test split
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred_lr_clf_pretrained_glove_w2v),
)
print(
    "precision:",
    metrics.precision_score(y_true=y_test, y_pred=y_pred_lr_clf_pretrained_glove_w2v),
)
print(
    "recall:",
    metrics.recall_score(y_true=y_test, y_pred=y_pred_lr_clf_pretrained_glove_w2v),
)

# Confusion matrix rendered as an annotated heatmap (the cell's displayed output)
print("Confusion Matrix")
glove_lr_confusion = metrics.confusion_matrix(
    y_true=y_test, y_pred=y_pred_lr_clf_pretrained_glove_w2v
)
sns.heatmap(glove_lr_confusion, annot=True)

**Support Vector Machine**

In [50]:
# Disabled experiments on the pre-trained GloVe features
# (pretrained_glove_w2v_train / pretrained_glove_w2v_test): a Random Forest
# classifier followed by a linear-kernel SVM. Every line in this cell is
# commented out, so running it has no effect.
# from sklearn.ensemble import RandomForestClassifier
# import seaborn as sns

# rf_clf_pretrained_glove_w2v = RandomForestClassifier(n_estimators = 100)
# rf_clf_pretrained_glove_w2v = rf_clf_pretrained_glove_w2v.fit(pretrained_glove_w2v_train, y_train)
# y_pred = rf_clf_pretrained_glove_w2v.predict(pretrained_glove_w2v_test)

# print("Accuracy:",metrics.accuracy_score(y_test,y_pred))
# print("precision:",metrics.precision_score(y_test,y_pred))
# print("recall:",metrics.recall_score(y_test,y_pred))
# print("Confusion Matrix")
# sns.heatmap(metrics.confusion_matrix(y_test,y_pred),annot=True)


# Linear-kernel SVM variant of the same experiment (also disabled).
# from sklearn import svm
# from sklearn import metrics
# import seaborn as sns

# np.random.seed(0)
# svm_clf_pretrained_glove_w2v = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# svm_clf_pretrained_glove_w2v = svm_clf_pretrained_glove_w2v.fit(pretrained_glove_w2v_train, y_train)
# y_pred_svm_clf_pretrained_glove_w2v= svm_clf_pretrained_glove_w2v.predict(pretrained_glove_w2v_test)

# print("Accuracy:",metrics.accuracy_score(y_test,y_pred_svm_clf_pretrained_glove_w2v))
# print("precision:",metrics.precision_score(y_test,y_pred_svm_clf_pretrained_glove_w2v))
# print("recall:",metrics.recall_score(y_test,y_pred_svm_clf_pretrained_glove_w2v))
# print("Confusion Matrix")
# sns.heatmap(metrics.confusion_matrix(y_test,y_pred_svm_clf_pretrained_glove_w2v),annot=True)

**Calculating the False Positive Rate (FPR), True Positive Rate (TPR), and thresholds for each model using the ROC curve**

In [51]:
from sklearn.metrics import roc_curve

# A ROC curve is traced by sweeping a threshold over continuous scores. Hard 0/1
# labels from .predict() leave only one threshold to sweep, so the "curve" is a
# single operating point. Use the predicted probability of the positive class
# (column 1, because the labels are 0/1 and classes_ is sorted) instead.
y_score_lr_clf_pretrained_glove_w2v = lr_clf_pretrained_glove_w2v.predict_proba(
    pretrained_glove_w2v_test
)[:, 1]
fpr1, tpr1, thresh1 = roc_curve(y_test, y_score_lr_clf_pretrained_glove_w2v, pos_label=1)
#fpr2, tpr2, thresh2 = roc_curve(y_test, y_pred_svm_clf_pretrained_glove_w2v, pos_label=1)

# Reference line: a constant score for every sample yields the (0,0)-(1,1) diagonal.
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

**Calculating ROC AUC scores for all models**

In [52]:
from sklearn.metrics import roc_auc_score

# ROC AUC per model, in a fixed order: index 0 is Logistic Regression; the SVM
# entry (index 1) is currently disabled.
# AUC is a ranking metric, so it is computed from the positive-class probability
# rather than from thresholded 0/1 predictions.
Roc=[]
Roc.append(
    roc_auc_score(
        y_test,
        lr_clf_pretrained_glove_w2v.predict_proba(pretrained_glove_w2v_test)[:, 1],
    )
)
#Roc.append(roc_auc_score(y_test, y_pred_svm_clf_pretrained_glove_w2v))

print(*Roc)

**Plotting all the ROC curves**

In [53]:
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and newer
# releases no longer accept the old name, so use whichever this install offers.
roc_plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(roc_plot_style)

plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Logistic Regression')
#plt.plot(fpr2, tpr2, linestyle='--',color='green', label='Support Vector Machine')

# Unlabelled reference line built from constant scores
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')

plt.title('ROC curve')

plt.xlabel('False Positive Rate')

plt.ylabel('True Positive rate')

plt.legend(loc='best')
plt.savefig('ROC',dpi=300)
plt.show();

**Selecting the best model based on ROC AUC score**

In [54]:
# Roc is ordered [Logistic Regression, SVM]; announce whichever has the top score.
best_roc_position = Roc.index(max(Roc))
print("LogisticRegression performs best" if best_roc_position == 0 else "SVM performs best")

In [55]:
# Prints a fixed marker string; no condition is checked before printing it.
print("success")

**Bar-Graph**

In [56]:
import matplotlib.pyplot as plt
from sklearn import metrics

# Technique labels (x) and their test-set accuracies (h), in matching order.
# Both names are read again by the summary cell that follows.
x=["CountVec","Tfidf","Word2Vec","Glove"]
# NOTE(review): the Tfidf entry is scored from y_pred_nb_clf_tfidf, whose name
# suggests Naive Bayes predictions, while the axis label and the summary cell
# describe Logistic Regression for every technique — confirm which predictions
# belong here.
h=[
    metrics.accuracy_score(y_test, predictions)
    for predictions in (
        y_pred_lr_clf_countVec,
        y_pred_nb_clf_tfidf,
        y_pred_lr_clf_trained_w2v,
        y_pred_lr_clf_pretrained_glove_w2v,
    )
]
c=["red","green","blue","yellow"]
plt.bar(x,h,width=0.5,color=c)
# Spelling of "technique" corrected in the rendered labels
plt.xlabel("Different Vectorization Techniques")
plt.ylabel("Accuracy attained by Logistic Regression")
plt.title("Finding best technique with best accuracy")
plt.show()

In [57]:
# x and h come from the bar-graph cell: technique labels and their accuracies.
# The leading space keeps the label from running into the sentence
# (previously printed e.g. "CountVecis predicted ...").
print(x[h.index(max(h))]+" is predicted as the best technique, provided logistic regression as the common predictive model among all techniques")

In [58]:
!pip install joblib
import joblib

In [66]:
# Train the logistic regression that is written to model.pkl further down,
# using the bag-of-words matrix countVec_train.
model = LogisticRegression(solver="liblinear")
model.fit(X=countVec_train, y=y_train)

# Earlier evaluation code for the same classifier, kept commented out:
# from sklearn.linear_model import LogisticRegression
# import seaborn as sns
# from sklearn import metrics

# lr_clf_countVec = LogisticRegression(max_iter=100, solver='liblinear')
# lr_clf_countVec = lr_clf_countVec.fit(countVec_train, y_train)
# y_pred_lr_clf_countVec = lr_clf_countVec.predict(countVec_test)

# print("Accuracy:",metrics.accuracy_score(y_test,y_pred_lr_clf_countVec))
# print("precision:",metrics.precision_score(y_test,y_pred_lr_clf_countVec))
# print("recall:",metrics.recall_score(y_test,y_pred_lr_clf_countVec))
# print("Confusion Matrix")
# sns.heatmap(metrics.confusion_matrix(y_test,y_pred_lr_clf_countVec),annot=True)

In [68]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vocabulary on x_train only, then apply the same mapping to x_test.
count_vectorizer = CountVectorizer()
countVec_train = count_vectorizer.fit_transform(raw_documents=x_train)
countVec_test = count_vectorizer.transform(raw_documents=x_test)

# Shape of each resulting document-term matrix
print("count_train : ", countVec_train.shape)
print("count_test : ", countVec_test.shape)

In [70]:
# Write the English stop-word set, the fitted model and the fitted vectorizer
# to pickle files in the working directory.
en_stopwords = set(stopwords.words("english"))
joblib.dump(value=en_stopwords, filename="stopwords.pkl")
joblib.dump(value=model, filename="model.pkl")
joblib.dump(value=count_vectorizer, filename="vectorizer.pkl")