### SUBMITTED BY : AKANSHA RAJ (D22003)

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
import warnings
warnings.filterwarnings("ignore")
import nltk
from gensim.models import KeyedVectors

### Tweet Data

In [3]:
# Reading the csv file into pandas dataframe
tweets = pd.read_csv(r"C:\Users\Akansha Raj\Downloads\tweets.csv")

In [4]:
# Checking the length of the dataframe
len(tweets)

1181

In [5]:
# Removing the reviews with Average rating zero since they make no sense
tweets = tweets[tweets.Avg!=0]

In [6]:
# Checking the length of the dataframe
len(tweets)

844

In [7]:
# Resetting the indexes of the dataframe and dropping the original index
tweets.reset_index(drop=True,inplace=True)

In [None]:
tweets.Avg = tweets.Avg.apply(lambda x: -1 if x<0 else 1)

In [8]:
# Converting the Average rating(target) column into binary classes
for i in range(len(tweets)):
    if tweets.Avg[i] <0 :
        tweets.Avg[i] = -1
    else:
        tweets.Avg[i] = 1

In [None]:
a,b = 5,9

In [9]:
# Checking the first and last 5 rows of the data-frame to get an overall feel of the data
tweets.head(), tweets.tail()

(                                               Tweet  Avg
 0  I have to say, Apple has by far the best custo...  1.0
 1  iOS 7 is so fricking smooth & beautiful!! #Tha...  1.0
 2                                      LOVE U @APPLE  1.0
 3  Thank you @apple, loving my new iPhone 5S!!!!!...  1.0
 4  .@apple has the best customer service. In and ...  1.0,
                                                  Tweet  Avg
 839                                       freak @apple -1.0
 840  WHY CANT I freakING SEE PICTURES ON MY TL IM A... -1.0
 841                 @APPLE YOU freakING COWS freak YOU -1.0
 842  @apple I hate you why is my phone not working ... -1.0
 843  @aGounalakis that's nasty! @apple is a nasty brat -1.0)

### Test Pre-processing

#### 1. Converting the text into lower cases

In [36]:
#Convert a string into lower case
twt = tweets.Tweet[1]
twt.lower()

'ios 7 is so fricking smooth & beautiful!! #thanxapple @apple'

In [37]:
#normalizing all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = tweets.Tweet[i].lower()

In [38]:
tweets.head()

Unnamed: 0,Tweet,Avg
0,"i have to say, apple has by far the best custo...",1.0
1,ios 7 is so fricking smooth & beautiful!! #tha...,1.0
2,love u @apple,1.0
3,"thank you @apple, loving my new iphone 5s!!!!!...",1.0
4,.@apple has the best customer service. in and ...,1.0


#### 2. Remove Punctuations/special symbols and 

In [39]:
#Removing punctuation from a single tweet
import string
p = string.punctuation
remv_punc = str.maketrans("", "", p)
twt.translate(remv_punc)

'iOS 7 is so fricking smooth  beautiful ThanxApple Apple'

In [40]:
#Removing punctuation from all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = tweets.Tweet[i].translate(remv_punc)
tweets.head()

Unnamed: 0,Tweet,Avg
0,i have to say apple has by far the best custom...,1.0
1,ios 7 is so fricking smooth beautiful thanxap...,1.0
2,love u apple,1.0
3,thank you apple loving my new iphone 5s apple...,1.0
4,apple has the best customer service in and out...,1.0


#### 3. Remove stopwords (and the word "apple")

In [41]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aswin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
#NLTK stopword list
stop_words = stopwords.words("english")
len(stop_words)

179

In [43]:
#Adding the word apple to the list of stopwords
stop_words.append("apple")
len(stop_words)

180

In [44]:
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [45]:
#Remove stop words from a single tweet
" ".join([w for w in tweets.Tweet[0].split() if w not in stop_words])

'say far best customer care service ever received appstore'

In [46]:
#Removing stopwords from all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = " ".join([w for w in tweets.Tweet[i].split() if w not in stop_words])
    
tweets.head()

Unnamed: 0,Tweet,Avg
0,say far best customer care service ever receiv...,1.0
1,ios 7 fricking smooth beautiful thanxapple,1.0
2,love u,1.0
3,thank loving new iphone 5s iphone5s pictwitter...,1.0
4,best customer service new phone 10min,1.0


#### 4. Remove white spaces

In [47]:
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = tweets.Tweet[i].replace("  ", " ").strip()
tweets.head()

Unnamed: 0,Tweet,Avg
0,say far best customer care service ever receiv...,1.0
1,ios 7 fricking smooth beautiful thanxapple,1.0
2,love u,1.0
3,thank loving new iphone 5s iphone5s pictwitter...,1.0
4,best customer service new phone 10min,1.0


#### Embeddings

In [59]:
embeddings = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary = True)

In [60]:
def vec(a):
    return(embeddings[a])

In [91]:
len(tweets)

844

In [116]:
# Converting each document into a vector
dict_1 = {}
index_list = []
for tweet,index in zip(tweets.Tweet,tweets.index):
    rev_list = []
    for word in tweet.split():
        if word in embeddings.index_to_key:
            index_list.append(index)
            rev_list.append(vec(word))
    dict_1[tweet] = np.sum(np.array(rev_list),axis=0)
index_set = set(index_list)

In [118]:
#len(index_set)

837

In [111]:
document_matrix = pd.DataFrame(dict_1).T
document_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
say far best customer care service ever received appstore,-0.493164,-0.095337,0.296143,0.540649,-0.170532,0.215515,0.673096,-0.081055,1.091064,0.981018,...,-0.279327,0.084229,-1.071533,0.579533,0.664551,0.483276,-0.215881,-0.071289,0.313599,-0.570312
ios 7 fricking smooth beautiful thanxapple,0.261780,-0.117188,0.264160,-0.001953,-0.431274,0.064209,0.576843,-0.428833,0.376465,0.780273,...,-0.202026,0.261108,-0.687988,-0.214279,-0.215454,0.129150,-0.666016,-0.610046,-0.109131,0.459717
love u,-0.150879,-0.105713,0.189941,0.146729,-0.448242,-0.060059,0.079102,-0.544922,-0.199707,0.302246,...,0.082275,0.326172,-0.358398,-0.585938,-0.205322,-0.469727,-0.130859,-0.369141,-0.176514,0.253418
thank loving new iphone 5s iphone5s pictwittercomxmhjcu4pcb,-0.685974,-0.154663,0.053223,0.654785,-0.353271,-0.230469,0.068115,-0.525146,-0.131836,0.474243,...,0.090332,0.766357,-0.982422,-0.664551,-0.309082,-0.283630,-0.598259,0.078613,-0.448792,0.520508
best customer service new phone 10min,-0.210693,0.010620,0.281250,0.090088,0.067139,-0.029053,-0.178711,0.016846,0.424316,0.662659,...,0.328705,0.305664,-0.824219,0.041870,0.265381,0.335083,-0.138672,0.325439,-0.018555,-0.471680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
freak u,-0.048828,0.039551,0.169617,0.110596,-0.368652,0.201172,-0.050781,-0.541016,-0.139648,-0.018066,...,0.467773,0.381836,-0.319824,-0.239258,-0.363281,-0.510742,-0.338379,-0.149780,-0.075073,0.107422
cant freaking see pictures tl im annoyed freak twitter,0.735931,0.201782,0.265518,1.291504,-1.499268,0.956970,0.374115,-1.322784,0.530029,0.329895,...,0.783691,1.047729,-0.649292,-0.248840,-1.650391,-1.156738,-0.439453,-0.505722,-0.418335,0.028564
freaking cows freak,0.586914,-0.016846,0.001648,0.423340,-0.325684,0.740234,0.166382,-0.296600,0.048584,-0.116699,...,0.114258,0.328918,-0.202637,0.109863,-0.231445,-0.142578,-0.358887,0.240845,0.085541,-0.060547
hate phone working im going freak,0.347351,0.309204,0.310913,0.426514,-0.717896,0.939087,0.136658,-0.739258,0.257324,0.149170,...,-0.155029,0.922363,-0.669189,-0.155579,-0.644287,-0.572754,-0.270996,-0.055420,0.218994,-0.158936


In [122]:
# There are duplicate reviews in my dataset
tweets.Tweet.nunique()

827

In [120]:
document_matrix=document_matrix.reset_index()

In [125]:
document_matrix['Avg']=np.nan

In [126]:
for i in range(len(tweets.Tweet)):
    for j in range(len(document_matrix)):
        if document_matrix['index'][j]==tweets['Tweet'][i]:
            document_matrix['Avg'][j]=tweets['Avg'][i]

In [133]:
document_matrix.set_index('index',inplace=True)

In [134]:
y = document_matrix.Avg

In [138]:
document_matrix.iloc[:,:-1]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
say far best customer care service ever received appstore,-0.493164,-0.095337,0.296143,0.540649,-0.170532,0.215515,0.673096,-0.081055,1.091064,0.981018,...,-0.279327,0.084229,-1.071533,0.579533,0.664551,0.483276,-0.215881,-0.071289,0.313599,-0.570312
ios 7 fricking smooth beautiful thanxapple,0.261780,-0.117188,0.264160,-0.001953,-0.431274,0.064209,0.576843,-0.428833,0.376465,0.780273,...,-0.202026,0.261108,-0.687988,-0.214279,-0.215454,0.129150,-0.666016,-0.610046,-0.109131,0.459717
love u,-0.150879,-0.105713,0.189941,0.146729,-0.448242,-0.060059,0.079102,-0.544922,-0.199707,0.302246,...,0.082275,0.326172,-0.358398,-0.585938,-0.205322,-0.469727,-0.130859,-0.369141,-0.176514,0.253418
thank loving new iphone 5s iphone5s pictwittercomxmhjcu4pcb,-0.685974,-0.154663,0.053223,0.654785,-0.353271,-0.230469,0.068115,-0.525146,-0.131836,0.474243,...,0.090332,0.766357,-0.982422,-0.664551,-0.309082,-0.283630,-0.598259,0.078613,-0.448792,0.520508
best customer service new phone 10min,-0.210693,0.010620,0.281250,0.090088,0.067139,-0.029053,-0.178711,0.016846,0.424316,0.662659,...,0.328705,0.305664,-0.824219,0.041870,0.265381,0.335083,-0.138672,0.325439,-0.018555,-0.471680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
freak u,-0.048828,0.039551,0.169617,0.110596,-0.368652,0.201172,-0.050781,-0.541016,-0.139648,-0.018066,...,0.467773,0.381836,-0.319824,-0.239258,-0.363281,-0.510742,-0.338379,-0.149780,-0.075073,0.107422
cant freaking see pictures tl im annoyed freak twitter,0.735931,0.201782,0.265518,1.291504,-1.499268,0.956970,0.374115,-1.322784,0.530029,0.329895,...,0.783691,1.047729,-0.649292,-0.248840,-1.650391,-1.156738,-0.439453,-0.505722,-0.418335,0.028564
freaking cows freak,0.586914,-0.016846,0.001648,0.423340,-0.325684,0.740234,0.166382,-0.296600,0.048584,-0.116699,...,0.114258,0.328918,-0.202637,0.109863,-0.231445,-0.142578,-0.358887,0.240845,0.085541,-0.060547
hate phone working im going freak,0.347351,0.309204,0.310913,0.426514,-0.717896,0.939087,0.136658,-0.739258,0.257324,0.149170,...,-0.155029,0.922363,-0.669189,-0.155579,-0.644287,-0.572754,-0.270996,-0.055420,0.218994,-0.158936


#### Train-test-validation split

In [151]:
# Splitting the data into train-validation-test
X_train, X_test, y_train, y_test = train_test_split(document_matrix.iloc[:,:-1], y, test_size = 0.2)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.1)

#### 1. Using  fully grown decision tree model

In [152]:
# Instantiating the decision tree classifier model, fitting the model on training data and making predictions on validation data
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
y_pred_valid = dt.predict(X_valid)
fg_ac = accuracy_score(y_valid,y_pred_valid)
fg_f1 = f1_score(y_valid,y_pred_valid)
fg_auroc = roc_auc_score(y_valid,y_pred_valid)
print('The accuracy of the fully-grown decision tree is:',fg_ac)
print('The F1-score of the fully-grown decision tree is:',fg_f1)
print('The AUROC of the fully-grown decision tree is:',fg_auroc)

The accuracy of the fully-grown decision tree is: 0.7014925373134329
The F1-score of the fully-grown decision tree is: 0.6
The AUROC of the fully-grown decision tree is: 0.684593023255814


#### 2. Using pruned decision tree model

In [153]:
path = dt.cost_complexity_pruning_path(X_train,y_train)
alphas = path['ccp_alphas']

In [154]:
# Fitting the decision tree model for different values of tuning parameter and making predictions on validation data 
valid_f1_score = {}
valid_auc = {}
for i in alphas:
    dt = DecisionTreeClassifier(ccp_alpha=i)
    dt.fit(X_train,y_train)
    y_pred_valid = dt.predict(X_valid)
    valid_f1_score[i] = f1_score(y_valid,y_pred_valid)
    valid_auc[i] = roc_auc_score(y_valid,y_pred_valid)

In [155]:
# Finding the optimal value of alpha(tuning parameter) for which the modified cost is minimal (using f1_score as the performance metric)
optimal_alpha = max(valid_f1_score,key=valid_f1_score.get)
optimal_alpha

0.003250899802623938

In [156]:
# Instatiating a pruned decision tree classifier with the optimal value of alpha, fitting the model on training data and making predictions on the validation data
dt = DecisionTreeClassifier(ccp_alpha=optimal_alpha)
dt.fit(X_train,y_train)
y_pred_valid = dt.predict(X_valid)
pr_ac = accuracy_score(y_valid,y_pred_valid)
pr_f1 = f1_score(y_valid,y_pred_valid)
pr_auroc = roc_auc_score(y_valid,y_pred_valid)
print('The accuracy of the pruned decision tree with optimal ccp is:',pr_ac)
print('The F1-score of the pruned decision tree with optimal ccp is:',pr_f1)
print('The AUROC of the pruned decision tree with optimal ccp is:',pr_auroc)

The accuracy of the pruned decision tree with optimal ccp is: 0.6268656716417911
The F1-score of the pruned decision tree with optimal ccp is: 0.4897959183673469
The AUROC of the pruned decision tree with optimal ccp is: 0.5988372093023256


#### 3.Using random forest classifier model

In [157]:
# Instantiating the random forest classifier model, fitting the model on training data and making predictions on validation data
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_valid_pred_rf = rf.predict(X_valid)
rf_ac = accuracy_score(y_valid,y_valid_pred_rf)
rf_f1 = f1_score(y_valid,y_valid_pred_rf)
rf_auroc = roc_auc_score(y_valid,y_valid_pred_rf)
print('The accuracy of the Random Forest Classifier is:',rf_ac)
print('The F1-score of the Random Forest Classifier is::',rf_f1)
print('The AUROC of the Random Forest Classifier is:',rf_auroc)

The accuracy of the Random Forest Classifier is: 0.7164179104477612
The F1-score of the Random Forest Classifier is:: 0.3870967741935483
The AUROC of the Random Forest Classifier is: 0.6133720930232558


#### 4.Using bagged classifier model

In [158]:
# Instantiating the bagged classifier model, fitting the model on training data and making predictions on validation data
bcf = BaggingClassifier()
bcf.fit(X_train,y_train)
y_valid_pred_bcf = bcf.predict(X_valid)
bcf_ac = accuracy_score(y_valid,y_valid_pred_bcf)
bcf_f1 = f1_score(y_valid,y_valid_pred_bcf)
bcf_auroc = roc_auc_score(y_valid,y_valid_pred_bcf)
print('The accuracy of the Random Forest Classifier is:',bcf_ac)
print('The F1-score of the Random Forest Classifier is::',bcf_f1)
print('The AUROC of the Random Forest Classifier is:',bcf_auroc)

The accuracy of the Random Forest Classifier is: 0.7611940298507462
The F1-score of the Random Forest Classifier is:: 0.5555555555555556
The AUROC of the Random Forest Classifier is: 0.6850775193798451


#### 5.Using adaboost classifier model

In [159]:
# Instantiating the adaboost classifier model, fitting the model on training data and making predictions on validation data
abc = AdaBoostClassifier()
abc.fit(X_train,y_train)
y_valid_pred_abc = abc.predict(X_valid)
abc_ac = accuracy_score(y_valid,y_valid_pred_abc)
abc_f1 = f1_score(y_valid,y_valid_pred_abc)
abc_auroc = roc_auc_score(y_valid,y_valid_pred_abc)
print('The accuracy of the ADABoost model is:',abc_ac)
print('The F1-score of the ADABoost model is:',abc_f1)
print('The AUROC of the ADABoost model is:',abc_auroc)

The accuracy of the ADABoost model is: 0.6567164179104478
The F1-score of the ADABoost model is: 0.4102564102564102
The AUROC of the ADABoost model is: 0.5852713178294573


#### 6. Comparing the performance of the various models used

In [160]:
performance = pd.DataFrame(columns=['Standard Decision Tree','Pruned Decision Tree','Bagging Classifier','Random Forest','AdaBoost'], index = ['Accuracy','F-1 score','AUROC'])
performance

Unnamed: 0,Standard Decision Tree,Pruned Decision Tree,Bagging Classifier,Random Forest,AdaBoost
Accuracy,,,,,
F-1 score,,,,,
AUROC,,,,,


In [161]:
performance_list = np.array([[fg_ac,fg_f1,fg_auroc],[pr_ac,pr_f1,pr_auroc],[bcf_ac,bcf_f1,bcf_auroc],[rf_ac,rf_f1,rf_auroc],[abc_ac,abc_f1,abc_auroc]])
performance_list

array([[0.70149254, 0.6       , 0.68459302],
       [0.62686567, 0.48979592, 0.59883721],
       [0.76119403, 0.55555556, 0.68507752],
       [0.71641791, 0.38709677, 0.61337209],
       [0.65671642, 0.41025641, 0.58527132]])

In [162]:
for i,j in zip(list(performance.columns),np.arange(0,5)):
    performance[i] = performance_list[j]
performance

Unnamed: 0,Standard Decision Tree,Pruned Decision Tree,Bagging Classifier,Random Forest,AdaBoost
Accuracy,0.701493,0.626866,0.761194,0.716418,0.656716
F-1 score,0.6,0.489796,0.555556,0.387097,0.410256
AUROC,0.684593,0.598837,0.685078,0.613372,0.585271


### TF-IDF METHOD

#### 1. Get the Bag-Of-Words (BOW) Dataframe with TF-IDF vectorizor

In [182]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

tfidf_vectors = tfidf_vectorizer.fit_transform(tweets.Tweet)

names = tfidf_vectorizer.get_feature_names()

tfidf_vectors = tfidf_vectors.toarray()
tfidf_vectors = pd.DataFrame(tfidf_vectors, columns=names)

In [183]:
tfidf_vectors

Unnamed: 0,075,10,100,1085,10min,110,12,13,13apple,16,...,yikes,yldthng,yo,yooo,youd,youre,youve,z10,zimmerman,zippos
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.558987,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
840,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
841,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
842,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 2. Dimension Reduction

In [184]:
#using count vectorizer to create a document-term matrix (to select the features)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(tweets.Tweet).toarray()

# Documnet-Term Matrix
DTM = pd.DataFrame(X, columns=cv.get_feature_names())

# Remove terms that is contained in less than 0.5% of the documents
for col in list(DTM):
    prop = DTM[col].sum()/DTM.shape[0]
    if prop*100 < 1:
        DTM = DTM.drop([col], axis=1)

print(DTM.shape)

selected_variables = list(DTM)

(844, 140)


#### 3.Train-Validation-Test Split

In [185]:
#Creating the target variable
y = tweets.Avg <= -1
tfidf_vectors = tfidf_vectors[selected_variables]

#For test-train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectors, y, test_size = 0.2, stratify=y)

# Train-Validation Split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, stratify=y_train)

X_train.head()

Unnamed: 0,5c,5s,already,amazon,android,anyone,app,apples,apps,back,...,well,wont,work,would,wow,wtf,yall,year,yet,youre
694,0.0,0.0,0.0,0.0,0.724737,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.431516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
486,0.0,0.17822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### i) Using  fully grown decision tree model

In [186]:
# Instantiating the decision tree classifier model, fitting the model on training data and making predictions on validation data
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
y_pred_valid = dt.predict(X_valid)
fg_ac = accuracy_score(y_valid,y_pred_valid)
fg_f1 = f1_score(y_valid,y_pred_valid)
fg_auroc = roc_auc_score(y_valid,y_pred_valid)
print('The accuracy of the fully-grown decision tree is:',fg_ac)
print('The F1-score of the fully-grown decision tree is:',fg_f1)
print('The AUROC of the fully-grown decision tree is:',fg_auroc)

The accuracy of the fully-grown decision tree is: 0.6153846153846154
The F1-score of the fully-grown decision tree is: 0.7161572052401747
The AUROC of the fully-grown decision tree is: 0.5599574984820886


#### ii) Using pruned decision tree model

In [187]:
path = dt.cost_complexity_pruning_path(X_train,y_train)
alphas = path['ccp_alphas']

In [188]:
# Fitting the decision tree model for different values of tuning parameter and making predictions on validation data 
valid_f1_score = {}
valid_auc = {}
for i in alphas:
    dt = DecisionTreeClassifier(ccp_alpha=i)
    dt.fit(X_train,y_train)
    y_pred_valid = dt.predict(X_valid)
    valid_f1_score[i] = f1_score(y_valid,y_pred_valid)
    valid_auc[i] = roc_auc_score(y_valid,y_pred_valid)

In [189]:
# Finding the optimal value of alpha(tuning parameter) for which the modified cost is minimal (using f1_score as the performance metric)
optimal_alpha = max(valid_f1_score,key=valid_f1_score.get)
optimal_alpha

0.008484204384498345

In [190]:
# Instatiating a pruned decision tree classifier with the optimal value of alpha, fitting the model on training data and making predictions on the validation data
dt = DecisionTreeClassifier(ccp_alpha=optimal_alpha)
dt.fit(X_train,y_train)
y_pred_valid = dt.predict(X_valid)
pr_ac = accuracy_score(y_valid,y_pred_valid)
pr_f1 = f1_score(y_valid,y_pred_valid)
pr_auroc = roc_auc_score(y_valid,y_pred_valid)
print('The accuracy of the pruned decision tree with optimal ccp is:',pr_ac)
print('The F1-score of the pruned decision tree with optimal ccp is:',pr_f1)
print('The AUROC of the pruned decision tree with optimal ccp is:',pr_auroc)

The accuracy of the pruned decision tree with optimal ccp is: 0.650887573964497
The F1-score of the pruned decision tree with optimal ccp is: 0.7822878228782287
The AUROC of the pruned decision tree with optimal ccp is: 0.5235276259866424


#### iii) Using random forest classifier model

In [191]:
# Instantiating the random forest classifier model, fitting the model on training data and making predictions on validation data
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_valid_pred_rf = rf.predict(X_valid)
rf_ac = accuracy_score(y_valid,y_valid_pred_rf)
rf_f1 = f1_score(y_valid,y_valid_pred_rf)
rf_auroc = roc_auc_score(y_valid,y_valid_pred_rf)
print('The accuracy of the Random Forest Classifier is:',rf_ac)
print('The F1-score of the Random Forest Classifier is::',rf_f1)
print('The AUROC of the Random Forest Classifier is:',rf_auroc)

The accuracy of the Random Forest Classifier is: 0.6449704142011834
The F1-score of the Random Forest Classifier is:: 0.7169811320754716
The AUROC of the Random Forest Classifier is: 0.6223436551305404


#### iv) Using bagged classifier model

In [192]:
# Instantiating the bagged classifier model, fitting the model on training data and making predictions on validation data
bcf = BaggingClassifier()
bcf.fit(X_train,y_train)
y_valid_pred_bcf = bcf.predict(X_valid)
bcf_ac = accuracy_score(y_valid,y_valid_pred_bcf)
bcf_f1 = f1_score(y_valid,y_valid_pred_bcf)
bcf_auroc = roc_auc_score(y_valid,y_valid_pred_bcf)
print('The accuracy of the Random Forest Classifier is:',bcf_ac)
print('The F1-score of the Random Forest Classifier is::',bcf_f1)
print('The AUROC of the Random Forest Classifier is:',bcf_auroc)

The accuracy of the Random Forest Classifier is: 0.6153846153846154
The F1-score of the Random Forest Classifier is:: 0.6829268292682927
The AUROC of the Random Forest Classifier is: 0.602762598664238


#### vi) Using adaboost classifier model

In [193]:
# Instantiating the adaboost classifier model, fitting the model on training data and making predictions on validation data
abc = AdaBoostClassifier()
abc.fit(X_train,y_train)
y_valid_pred_abc = abc.predict(X_valid)
abc_ac = accuracy_score(y_valid,y_valid_pred_abc)
abc_f1 = f1_score(y_valid,y_valid_pred_abc)
abc_auroc = roc_auc_score(y_valid,y_valid_pred_abc)
print('The accuracy of the ADABoost model is:',abc_ac)
print('The F1-score of the ADABoost model is:',abc_f1)
print('The AUROC of the ADABoost model is:',abc_auroc)

The accuracy of the ADABoost model is: 0.6627218934911243
The F1-score of the ADABoost model is: 0.7710843373493975
The AUROC of the ADABoost model is: 0.5755919854280509


#### Comparing the performance of the various models used

In [194]:
performance = pd.DataFrame(columns=['Standard Decision Tree','Pruned Decision Tree','Bagging Classifier','Random Forest','AdaBoost'], index = ['Accuracy','F-1 score','AUROC'])
performance

Unnamed: 0,Standard Decision Tree,Pruned Decision Tree,Bagging Classifier,Random Forest,AdaBoost
Accuracy,,,,,
F-1 score,,,,,
AUROC,,,,,


In [195]:
performance_list = np.array([[fg_ac,fg_f1,fg_auroc],[pr_ac,pr_f1,pr_auroc],[bcf_ac,bcf_f1,bcf_auroc],[rf_ac,rf_f1,rf_auroc],[abc_ac,abc_f1,abc_auroc]])
performance_list

array([[0.61538462, 0.71615721, 0.5599575 ],
       [0.65088757, 0.78228782, 0.52352763],
       [0.61538462, 0.68292683, 0.6027626 ],
       [0.64497041, 0.71698113, 0.62234366],
       [0.66272189, 0.77108434, 0.57559199]])

In [196]:
for i,j in zip(list(performance.columns),np.arange(0,5)):
    performance[i] = performance_list[j]
performance

Unnamed: 0,Standard Decision Tree,Pruned Decision Tree,Bagging Classifier,Random Forest,AdaBoost
Accuracy,0.615385,0.650888,0.615385,0.64497,0.662722
F-1 score,0.716157,0.782288,0.682927,0.716981,0.771084
AUROC,0.559957,0.523528,0.602763,0.622344,0.575592


#### INFERENCE : 
Based on F-1 score, **the pruned decision tree machine learning model using TF-IDF(term-frequency inverse 
document frequency)** has outperformed all the machine learning models that used word2vec technique.

Based on AUROC score, **the bagging classifier machine learning model using word2vec** has outperformed all the machine learning models that used TFIDF technique.