In [None]:
import csv
import random
import re
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
from textblob import Word

In [None]:
#Enter pathname to dataset
df = pd.read_csv("pathname", header=None)

In [None]:
df.head()

## Data Exploration

In [None]:
df.columns = ['target','id','datetime','query','user','tweet']

In [None]:
#the flag ('query') column contains only the "no query" value - this column adds no information
#bracket indexing is used because `query` is also the name of a DataFrame method
df.groupby('query')['query'].unique()

In [None]:
#drop irrelevant columns
df = df.drop(['datetime','query','user'], axis=1)

In [None]:
#no missing values present
df.isnull().sum()

In [None]:
df.count()

In [None]:
df[df['target']==0].count()

In [None]:
df[df['target']==4].count()

## Data Sampling: processing all 1.6 million rows takes too long, so only a sample is used (note: the code below draws `frac = 0.001`, i.e. 0.1% of each class, not 1%)

In [None]:
#get tweets of negative polarity
neg_df = df[df['target'] == 0]

#get tweets of positive polarity
pos_df = df[df['target'] == 4]

In [None]:
neg_subset = neg_df.sample(frac = 0.001, random_state = 55)

pos_subset = pos_df.sample(frac = 0.001, random_state = 55)

In [None]:
df_subset = pd.concat([neg_subset, pos_subset])

In [None]:
#create an independent (deep) copy so that preprocessing cannot alter the sampled data in df_subset
test_df = df_subset.copy(deep=True)

In [None]:
test_df.head(10)

### Data preprocessing

#### Case Folding

In [None]:
#apply case folding
test_df ['clean_tweet'] = test_df['tweet'].str.lower()

#### Removing Elements

In [None]:
#remove unwanted elements
def remove_ele(input_str, target_element):
    """Strip every match of a regular expression from a string.

    Args:
        input_str: Text to clean.
        target_element: Regular-expression pattern whose matches are removed.

    Returns:
        ``input_str`` with each non-overlapping match of ``target_element``
        deleted. Text that does not match is returned unchanged.
    """
    # Substitute with the pattern itself: the matched text is never
    # re-interpreted as a pattern, so regex metacharacters inside a match are
    # harmless and a short match cannot eat the prefix of a longer one.
    return re.sub(target_element, '', input_str)

In [None]:
#unwanted elements: @user, punctuations, numbers, hashtags

#removing @user
test_df['clean_tweet'] = test_df.apply(lambda row: remove_ele(row['clean_tweet'], r"@[\w]*"),axis=1)
#removing punctuations, numbers, hashtags (non-letter characters)
#regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default
test_df['clean_tweet'] = test_df['clean_tweet'].str.replace(r"[^a-zA-Z\s_]", "", regex=True)

#### Spelling Correction

In [None]:
from autocorrect import Speller

spell = Speller(lang='en')

def spellingCorrection(input_str):
    """Return ``input_str`` after running it through the module-level ``spell`` corrector."""
    return spell(input_str)

In [None]:
test_df['clean_tweet'] = test_df.apply(lambda row: spellingCorrection(row['clean_tweet']),axis=1)


#### Negation Handling

In [None]:
# Tokens that open a negation scope. "musnt" is kept for backward
# compatibility next to the correctly spelled "mustnt".
_SYNTACTIC_NEGATIONS = frozenset({
    "no", "not", "rather", "couldnt", "wasnt", "didnt", "wouldnt", "shouldnt",
    "werent", "dont", "doesnt", "havent", "hasnt", "wont", "hadnt",
    "never", "none", "nobody", "nothing", "neither", "nor", "nowhere", "isnt",
    "cant", "cannot", "musnt", "mustnt", "mightnt", "shant", "without", "neednt",
})


def NegationHandling(input_str):
    """Append ``_NEG`` to negation words and to the words inside their scope.

    A scope opens at any token found in ``_SYNTACTIC_NEGATIONS`` (exact,
    case-sensitive match) and covers the tokens that follow until it is closed
    by either

    * the word "but" (compared case-insensitively), which is left untagged, or
    * a token whose last character is in ``string.punctuation``; that token is
      still tagged.

    Negation words met while a scope is already open are tagged like any other
    word in the scope and do not open a new one.

    Args:
        input_str: Whitespace-separated text.

    Returns:
        The tokens joined by single spaces, with ``_NEG`` suffixes added.
    """
    tokens = input_str.split()
    in_scope = False

    for idx, token in enumerate(tokens):
        if not in_scope:
            # A negation word is tagged wherever it occurs (including as the
            # final token) and opens a scope for the tokens after it.
            if token in _SYNTACTIC_NEGATIONS:
                tokens[idx] = token + "_NEG"
                in_scope = True
        elif token.lower() == 'but':
            # "but" ends the scope and is not itself negated.
            in_scope = False
        else:
            tokens[idx] = token + "_NEG"
            # Trailing punctuation marks the end of the negated clause.
            if token[-1] in string.punctuation:
                in_scope = False

    return " ".join(tokens)

In [None]:
#implement negation handling
test_df['clean_tweet'] = test_df.apply(lambda row: NegationHandling(row['clean_tweet']),axis=1)

#### Duplicated Word Normalisation

In [None]:
#collapse over-repeated letters (e.g. "soooo" -> "soo")
# Matches any letter that occurs three or more times in a row.
_REPEATED_LETTER_RUN = re.compile(r'([^\W\d_])\1{2,}')


def duplicateNormalisation(input_str):
    """Shorten every run of three or more identical letters to two letters.

    Only letters are affected: runs of digits, underscores and non-word
    characters are left as they are, as are runs of length one or two.

    Args:
        input_str: Text to normalise.

    Returns:
        The text with each long letter run reduced to a double letter.
    """
    return _REPEATED_LETTER_RUN.sub(r'\1\1', input_str)

In [None]:
test_df['clean_tweet'] = test_df.apply(lambda row: duplicateNormalisation(row['clean_tweet']),axis=1)

In [None]:
test_df.head()

In [None]:
test_df['tweet'].iloc[3]

In [None]:
test_df['clean_tweet'].iloc[3]

## Feature Extraction

### N-Grams Analysis

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
stopwords_set = set(stopwords.words('english'))
stopwords_set.add('')
stopwords_set.add(' ')

In [None]:
def generate_N_grams(text,ngram=1):
    """Build word n-grams from ``text`` after dropping stop words.

    ``text`` is split on single spaces and tokens present in ``stopwords_set``
    are discarded. Every run of ``ngram`` consecutive remaining tokens is then
    joined with a space and returned, in order of appearance.
    """
    kept = []
    for token in text.split(" "):
        if token not in stopwords_set:
            kept.append(token)

    # One shifted view of the token list per position inside the n-gram;
    # zipping them yields the sliding windows.
    shifted = [kept[offset:] for offset in range(ngram)]
    return [' '.join(window) for window in zip(*shifted)]

In [None]:
test_df.head()

#### UNIGRAMS

In [None]:
# NOTE(review): counts are built from the raw `tweet` column rather than `clean_tweet` - confirm this is intended.
positiveValues1=Counter(word
                        for text in test_df[test_df.target==4].tweet
                        for word in generate_N_grams(text,1))
negativeValues1=Counter(word
                        for text in test_df[test_df.target==0].tweet
                        for word in generate_N_grams(text,1))

# most_common() lists (n-gram, count) pairs from the most to the least frequent
df_positive1=pd.DataFrame(positiveValues1.most_common())
df_negative1=pd.DataFrame(negativeValues1.most_common())

# top 10 n-grams (column 0) and their counts (column 1), used by the plots
pd1uni=df_positive1[0][:10]
pd2uni=df_positive1[1][:10]

ned1uni=df_negative1[0][:10]
ned2uni=df_negative1[1][:10]


In [None]:
plt.figure(1,figsize=(16,4))
plt.bar(pd1uni,pd2uni, color ='green',
        width = 0.4)
plt.xlabel("Words in positive dataframe")
plt.ylabel("Count")
plt.title("Top 10 words in positive dataframe-UNIGRAM ANALYSIS")
plt.savefig('unigram_pos.png')
plt.show()

In [None]:
plt.figure(1,figsize=(16,4))
plt.bar(ned1uni,ned2uni, color ='orange',
        width = 0.4)
plt.xlabel("Words in negative dataframe")
plt.ylabel("Count")
plt.title("Top 10 words in negative dataframe-UNIGRAM ANALYSIS")
plt.savefig('unigram_neg.png')
plt.show()

#### BIGRAMS

In [None]:
# NOTE(review): counts are built from the raw `tweet` column rather than `clean_tweet` - confirm this is intended.
positiveValues2=Counter(word
                        for text in test_df[test_df.target==4].tweet
                        for word in generate_N_grams(text,2))
negativeValues2=Counter(word
                        for text in test_df[test_df.target==0].tweet
                        for word in generate_N_grams(text,2))

# most_common() lists (n-gram, count) pairs from the most to the least frequent
df_positive2=pd.DataFrame(positiveValues2.most_common())
df_negative2=pd.DataFrame(negativeValues2.most_common())

# top 10 n-grams (column 0) and their counts (column 1), used by the plots
pd1bi=df_positive2[0][:10]
pd2bi=df_positive2[1][:10]

ned1bi=df_negative2[0][:10]
ned2bi=df_negative2[1][:10]


In [None]:
plt.figure(1,figsize=(16,4))
plt.bar(pd1bi,pd2bi, color ='green',
        width = 0.4)
plt.xlabel("Words in positive dataframe")
plt.ylabel("Count")
plt.title("Top 10 words in positive dataframe-BIGRAM ANALYSIS")
plt.savefig('bigram_pos.png')
plt.show()

In [None]:
plt.figure(1,figsize=(16,4))
plt.bar(ned1bi,ned2bi, color ='orange',
        width = 0.4)
plt.xlabel("Words in negative dataframe")
plt.ylabel("Count")
plt.title("Top 10 words in negative dataframe-BIGRAM ANALYSIS")
plt.savefig('bigram_neg.png')
plt.show()

#### Trigrams

In [None]:
# NOTE(review): counts are built from the raw `tweet` column rather than `clean_tweet` - confirm this is intended.
positiveValues3=Counter(word
                        for text in test_df[test_df.target==4].tweet
                        for word in generate_N_grams(text,3))
negativeValues3=Counter(word
                        for text in test_df[test_df.target==0].tweet
                        for word in generate_N_grams(text,3))

# most_common() lists (n-gram, count) pairs from the most to the least frequent
df_positive3=pd.DataFrame(positiveValues3.most_common())
df_negative3=pd.DataFrame(negativeValues3.most_common())

# top 10 n-grams (column 0) and their counts (column 1), used by the plots
pd1tri=df_positive3[0][0:10]
pd2tri=df_positive3[1][0:10]

ned1tri=df_negative3[0][0:10]
ned2tri=df_negative3[1][0:10]

In [None]:
plt.figure(1,figsize=(30,4))
plt.bar(pd1tri,pd2tri, color ='green',
        width = 0.4)
plt.xlabel("Positive Trigram Words")
plt.ylabel("Count")
plt.title("Top 10 words in positive dataframe-TRIGRAM ANALYSIS")
plt.savefig('trigram_pos.png')
plt.show()

In [None]:
plt.figure(1,figsize=(16,4))
plt.bar(ned1tri,ned2tri, color ='orange',
        width = 0.4)
plt.xlabel("Negative Trigram Words")
plt.ylabel("Count")
plt.title("Top 10 words in negative dataframe-TRIGRAM ANALYSIS")
plt.savefig('trigram_neg.png')
plt.show()

## Data Partition (80:20)

In [None]:
df.head()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(test_df['clean_tweet'], test_df['target'], test_size = 0.2, random_state = 50)

### N-Grams Vectorisation

In [None]:
#unigram vectorisation
from sklearn.feature_extraction.text import CountVectorizer
ngrams_vect = CountVectorizer(ngram_range = (1,1)).fit(X_train)
X_train_ngrams_vect_uni = ngrams_vect.transform(X_train)

In [None]:
#uni-bi-trigram vectorisation
#stored under its own name so the unigram `ngrams_vect` is not overwritten:
#later cells transform X_test with `ngrams_vect` for models trained on X_train_ngrams_vect_uni
from sklearn.feature_extraction.text import CountVectorizer
ngrams_vect_unitri = CountVectorizer(ngram_range = (1,3)).fit(X_train)
X_train_ngrams_vect_unitri = ngrams_vect_unitri.transform(X_train)

In [None]:
len(ngrams_vect.get_feature_names_out()) #unigrams - 20455, #bigrams - 85908, #trigrams - 114527, #uni, bi, trigrams - 220890

### TF-IDF Vectorisation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(min_df = 1).fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)

In [None]:
len(tfidf_vect.get_feature_names_out())

## Sentiment Classification

### Model A: SVM with N-Grams

In [None]:
from sklearn.svm import SVC
modelA = SVC()

# fit on n-grams vectorised
modelA.fit(X_train_ngrams_vect_uni, y_train)

predictionsA = modelA.predict(ngrams_vect.transform(X_test))

In [None]:
modelA._gamma

In [None]:
print(predictionsA)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictionsA))

In [None]:
#compute AUC
from sklearn.metrics import roc_auc_score

print("AUC: ", roc_auc_score(y_test, predictionsA))

### Model B: SVM with TF-IDF

In [None]:
from sklearn.svm import SVC
modelB = SVC()

modelB.fit(X_train_tfidf_vect, y_train)

predictionsB = modelB.predict(tfidf_vect.transform(X_test))

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictionsB))

In [None]:
print("AUC: ", roc_auc_score(y_test, predictionsB)) #75.67%

### Model C: Naive Bayes with N-Grams

In [None]:
from sklearn.naive_bayes import MultinomialNB
modelC = MultinomialNB()

modelC.fit(X_train_ngrams_vect_uni,y_train)

predictionsC = modelC.predict(ngrams_vect.transform(X_test))

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictionsC))

In [None]:
print("AUC: ", roc_auc_score(y_test, predictionsC)) #73.89%

### Model D: Naive Bayes with TF-IDF

In [None]:
from sklearn.naive_bayes import MultinomialNB
modelD = MultinomialNB()

modelD.fit(X_train_tfidf_vect,y_train)
predictionsD = modelD.predict(tfidf_vect.transform(X_test))

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictionsD))

In [None]:
print("AUC: ", roc_auc_score(y_test, predictionsD)) #73.4%

### Model E and F: N-Grams + TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(ngram_range=(1,1), min_df=1).fit(X_train)
X_train_tfidf_vect_E = tfidf_vect.transform(X_train)

In [None]:
len(tfidf_vect.get_feature_names_out())

In [None]:
#svm with n-grams and tfidf
modelE = SVC()
#train on the matrix produced by the `tfidf_vect` that is used to transform X_test below
modelE.fit(X_train_tfidf_vect_E, y_train)
predictionsE = modelE.predict(tfidf_vect.transform(X_test))

print(classification_report(y_test, predictionsE))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(ngram_range=(1,3), min_df=1).fit(X_train)
X_train_tfidf_vect_F = tfidf_vect.transform(X_train)

In [None]:
#nb with n-grams and tfidf
modelF = MultinomialNB()
#train on the uni-to-trigram TF-IDF matrix and predict with the same fitted model
modelF.fit(X_train_tfidf_vect_F,y_train)
predictionsF = modelF.predict(tfidf_vect.transform(X_test))

print(classification_report(y_test, predictionsF))

In [None]:
#AUC for the two models of this section: E (SVM) first, then F (Naive Bayes)
print("AUC: ", roc_auc_score(y_test, predictionsE))
print("AUC: ", roc_auc_score(y_test, predictionsF))

## Optimising with Information Gain

### Information Gain
#### Feature Selection using Mutual Information

In [None]:
#calculate information gain value for each unigram count feature of X_train
from sklearn.feature_selection import mutual_info_classif

#a dedicated unigram vectoriser guarantees that the feature names line up with the
#scored columns (`ngrams_vect` is re-fitted with other n-gram ranges in other cells)
ig_ngrams_vect = CountVectorizer(ngram_range = (1,1)).fit(X_train)
res_ngrams = dict(zip(ig_ngrams_vect.get_feature_names_out(),
              mutual_info_classif(ig_ngrams_vect.transform(X_train),y_train,discrete_features=True)))

#print a summary only: the dict holds one entry per vocabulary term
print(len(res_ngrams), "terms scored")

In [None]:
#calculate information gain value for each unigram TF-IDF feature of X_train
from sklearn.feature_selection import mutual_info_classif

#a dedicated vectoriser guarantees that the feature names line up with the scored
#columns (`tfidf_vect` is re-fitted with other n-gram ranges in other cells)
#NOTE(review): TF-IDF weights are continuous but are scored with discrete_features=True - confirm this is intended
ig_tfidf_vect = TfidfVectorizer(min_df = 1).fit(X_train)
res_tfidf = dict(zip(ig_tfidf_vect.get_feature_names_out(),
              mutual_info_classif(ig_tfidf_vect.transform(X_train),y_train,discrete_features=True)))

#print a summary only: the dict holds one entry per vocabulary term
print(len(res_tfidf), "terms scored")

In [None]:
#calculate information gain value for the TF-IDF features configured as for Model E (ngram_range=(1,1))
from sklearn.feature_selection import mutual_info_classif

#a dedicated vectoriser guarantees that the feature names line up with the scored
#columns (`tfidf_vect` is re-fitted with other n-gram ranges in other cells)
#NOTE(review): TF-IDF weights are continuous but are scored with discrete_features=True - confirm this is intended
ig_tfidf_vect_E = TfidfVectorizer(ngram_range=(1,1), min_df=1).fit(X_train)
res_tfidf = dict(zip(ig_tfidf_vect_E.get_feature_names_out(),
              mutual_info_classif(ig_tfidf_vect_E.transform(X_train),y_train,discrete_features=True)))

#print a summary only: the dict holds one entry per vocabulary term
print(len(res_tfidf), "terms scored")

In [None]:
ngrams_items = res_ngrams.items()
ngrams_list = list(ngrams_items)

df_ngrams = pd.DataFrame(ngrams_list)
df_ngrams = df_ngrams.sort_values(1, ascending=False)

In [None]:
tfidf_items = res_tfidf.items()
tfidf_list = list(tfidf_items)

df_tfidf = pd.DataFrame(tfidf_list)
df_tfidf = df_tfidf.sort_values(1, ascending=False)

In [None]:
df_ngrams.head(10)

In [None]:
df_tfidf.head(10)

In [None]:
#Get threshold value to retain top 20%
def ig_threshold(vectoriser):
    """Return the information-gain score found 20% of the way down a score table.

    ``'n-grams'`` selects the module-level ``df_ngrams`` table; any other value
    selects ``df_tfidf``. The score is read from column 1 at the row position
    equal to 20% of the non-null count of column 0, rounded down.
    """
    ranked = df_ngrams if vectoriser == 'n-grams' else df_tfidf
    cutoff_pos = int(0.2 * ranked.count()[0])
    return ranked[1].iloc[cutoff_pos]

In [None]:
#remove words whose information gain is lower than top20_threshold
def ig_selection(input_str,top20_threshold, res):
    """Drop low-information words from a space-separated string.

    Args:
        input_str: Text whose words are separated by single spaces.
        top20_threshold: Words scoring strictly below this value are removed.
        res: Mapping from word to information-gain score.

    Returns:
        The remaining words joined by single spaces. Words without a score in
        ``res`` are kept; a removed word is dropped as a whole token, so it
        never alters other words that merely contain it.
    """
    kept = []
    for word in input_str.split(" "):
        ig_val = res.get(word)
        if ig_val is None:
            # Keys produced by a default scikit-learn vectoriser are
            # lower-cased; retry in lower case so tokens such as "not_NEG"
            # can still be matched.
            ig_val = res.get(word.lower())
        # Compare against None explicitly: a score of exactly 0 is a valid
        # (and the lowest possible) value, not a missing one.
        if ig_val is not None and ig_val < top20_threshold:
            continue
        kept.append(word)
    return " ".join(kept)

In [None]:
print(ig_threshold('n-grams'))

In [None]:
print(ig_threshold('tfidf'))

In [None]:
#filter according to ig_ngrams
top20_threshold = ig_threshold('n-grams')

test_df['ngrams_ig_tweet'] = test_df.apply(lambda row: 
                                               ig_selection(row['clean_tweet'],top20_threshold, res_ngrams),axis=1)

In [None]:
#filter according to ig_tfidf
top20_threshold = ig_threshold('tf-idf')

test_df['tfidf_ig_tweet'] = test_df.apply(lambda row: 
                                                ig_selection(row['clean_tweet'],top20_threshold, res_tfidf),axis=1)

In [None]:
#Repeat vectorisation again with n-grams and tf-idf and apply SVM and NB
X_train, X_test, y_train, y_test = train_test_split(test_df['ngrams_ig_tweet'], test_df['target'], test_size = 0.2, random_state = 50)

#tfidf
tfidf_vect = TfidfVectorizer().fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)

#uni-grams
ngrams_vect = CountVectorizer(ngram_range = (1,1)).fit(X_train)
X_train_ngrams_vect = ngrams_vect.transform(X_train)

#svm with n-grams
modelF = SVC()
modelF.fit(X_train_ngrams_vect, y_train)
predictionsF = modelF.predict(ngrams_vect.transform(X_test))

print(classification_report(y_test, predictionsF))

#svm with tfidf
modelG = SVC()
modelG.fit(X_train_tfidf_vect, y_train)
predictionsG = modelG.predict(tfidf_vect.transform(X_test))

print(classification_report(y_test, predictionsG))

#uni-tri-grams
ngrams_vect = CountVectorizer(ngram_range = (1,3)).fit(X_train)
X_train_ngrams_vect = ngrams_vect.transform(X_train)


#nb with n-grams
modelH = MultinomialNB()
modelH.fit(X_train_ngrams_vect,y_train)
predictionsH = modelH.predict(ngrams_vect.transform(X_test))

print(classification_report(y_test, predictionsH))

#nb with tfidf
modelI = MultinomialNB()
modelI.fit(X_train_tfidf_vect,y_train)
predictionsI = modelI.predict(tfidf_vect.transform(X_test))

print(classification_report(y_test, predictionsI))

In [None]:
print("AUC: ", roc_auc_score(y_test, predictionsF))
print("AUC: ", roc_auc_score(y_test, predictionsG))
print("AUC: ", roc_auc_score(y_test, predictionsH))
print("AUC: ", roc_auc_score(y_test, predictionsI))

In [None]:
#SVM/NB + ngrams + tfidf + information gain
X_train, X_test, y_train, y_test = train_test_split(test_df['tfidf_ig_tweet'], test_df['target'], test_size = 0.2, random_state = 50)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=1).fit(X_train)
X_train_tfidf_vect_F = tfidf_vect.transform(X_train)

#svm with tfidf
modelL = SVC()
modelL.fit(X_train_tfidf_vect_F, y_train)
predictionsL = modelL.predict(tfidf_vect.transform(X_test))

print(classification_report(y_test, predictionsL))

#nb with tfidf
modelM = MultinomialNB()
modelM.fit(X_train_tfidf_vect_F,y_train)
predictionsM = modelM.predict(tfidf_vect.transform(X_test))

print(classification_report(y_test, predictionsM))

print("AUC: ", roc_auc_score(y_test, predictionsL))
print("AUC: ", roc_auc_score(y_test, predictionsM))

In [None]:
#Repeat vectorisation again with n-grams and tf-idf and apply SVM and NB
X_train, X_test, y_train, y_test = train_test_split(test_df['tfidf_ig_tweet'], test_df['target'], test_size = 0.2, random_state = 50)

#n-grams
ngrams_vect = CountVectorizer(ngram_range = (1,2)).fit(X_train)
X_train_ngrams_vect = ngrams_vect.transform(X_train)

#tfidf
tfidf_vect = TfidfVectorizer().fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)

#svm with n-grams
modelE = SVC()
modelE.fit(X_train_ngrams_vect, y_train)
predictionsE = modelE.predict(ngrams_vect.transform(X_test))

print(classification_report(y_test, predictionsE))

#svm with tfidf
modelF = SVC()
modelF.fit(X_train_tfidf_vect, y_train)
predictionsF = modelF.predict(tfidf_vect.transform(X_test))

print(classification_report(y_test, predictionsF))

#nb with n-grams
modelG = MultinomialNB()
modelG.fit(X_train_ngrams_vect,y_train)
predictionsG = modelG.predict(ngrams_vect.transform(X_test))

print(classification_report(y_test, predictionsG))

#nb with tfidf
modelH = MultinomialNB()
modelH.fit(X_train_tfidf_vect,y_train)
predictionsH = modelH.predict(tfidf_vect.transform(X_test))

print(classification_report(y_test, predictionsH))


In [None]:
print("AUC: ", roc_auc_score(y_test, predictionsE))
print("AUC: ", roc_auc_score(y_test, predictionsF))
print("AUC: ", roc_auc_score(y_test, predictionsG))
print("AUC: ", roc_auc_score(y_test, predictionsH))

#### Hyperparameter Tuning

SVM + N-Grams + HT

In [None]:
X_train, X_test, y_train, y_test = train_test_split(test_df['clean_tweet'], test_df['target'], test_size = 0.2, random_state = 50)

#n-grams
ngrams_vect = CountVectorizer(ngram_range = (1,1)).fit(X_train)
X_train_ngrams_vect = ngrams_vect.transform(X_train)

#tfidf
tfidf_vect = TfidfVectorizer().fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C':[0.1, 1, 10, 100, 1000],
             'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
             'kernel':['rbf']}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

grid.fit(X_train_ngrams_vect, y_train)

In [None]:
print(grid.best_params_)

In [None]:
print(grid.best_estimator_)

In [None]:
grid_predictions = grid.predict(ngrams_vect.transform(X_test))

In [None]:
print(classification_report(y_test, grid_predictions))

In [None]:
print("AUC: ", roc_auc_score(y_test, grid_predictions))

SVM + TF-IDF + HT

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C':[0.1, 1, 10, 100, 1000],
             'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
             'kernel':['rbf']}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

grid.fit(X_train_tfidf_vect, y_train)

In [None]:
print(grid.best_params_)

In [None]:
print(grid.best_estimator_)

In [None]:
grid_predictions = grid.predict(tfidf_vect.transform(X_test))

In [None]:
print(classification_report(y_test, grid_predictions))

In [None]:
print("AUC: ", roc_auc_score(y_test, grid_predictions))

#### Optimisation by combining NGrams+TFIDF and Hyperparameter Tuning

Perform hyperparameter tuning for SVM + N-Grams + TFIDF

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C':[0.1, 1, 10, 100, 1000],
             'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
             'kernel':['rbf']}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

grid.fit(X_train_tfidf_vect_E, y_train)

grid_predictions = grid.predict(tfidf_vect.transform(X_test))

In [None]:
print(grid.best_params_)

In [None]:
print(classification_report(y_test, grid_predictions))

In [None]:
print("AUC: ", roc_auc_score(y_test, grid_predictions))