In [1]:
import warnings
warnings.filterwarnings("ignore")

import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import word_tokenize
from mlxtend.frequent_patterns import apriori
import requests
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt

In [3]:
import os

In [4]:
os.chdir("E:\\DOWNLOads")

# TF-IDF

### Tweet Data

In [7]:
tweets = pd.read_csv("tweets.csv")

In [8]:
tweets.head()

Unnamed: 0,Tweet,Avg
0,"I have to say, Apple has by far the best custo...",2.0
1,iOS 7 is so fricking smooth & beautiful!! #Tha...,2.0
2,LOVE U @APPLE,1.8
3,"Thank you @apple, loving my new iPhone 5S!!!!!...",1.8
4,.@apple has the best customer service. In and ...,1.8


### Test Pre-processing

#### 1. Converting the text into lower cases

In [9]:
#normalizing all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = tweets.Tweet[i].lower()

In [10]:
tweets.head()

Unnamed: 0,Tweet,Avg
0,"i have to say, apple has by far the best custo...",2.0
1,ios 7 is so fricking smooth & beautiful!! #tha...,2.0
2,love u @apple,1.8
3,"thank you @apple, loving my new iphone 5s!!!!!...",1.8
4,.@apple has the best customer service. in and ...,1.8


#### 2. Remove stopwords (and the word "apple")

In [11]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\vset
[nltk_data]     Info\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
#NLTK stopword list
stop_words = stopwords.words("english")
len(stop_words)

179

In [13]:
#Removing stopwords from all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = " ".join([w for w in tweets.Tweet[i].split() if w not in stop_words])

In [14]:
tweets.head()

Unnamed: 0,Tweet,Avg
0,"say, apple far best customer care service ever...",2.0
1,ios 7 fricking smooth & beautiful!! #thanxappl...,2.0
2,love u @apple,1.8
3,"thank @apple, loving new iphone 5s!!!!! #apple...",1.8
4,.@apple best customer service. new phone 10min!,1.8


#### 3. Remove Punctuations/special symbols and 

In [15]:
#Removing punctuation from a single tweet
import string
p = string.punctuation
remv_punc = str.maketrans("", "", p)

In [16]:
#Removing punctuation from all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = tweets.Tweet[i].translate(remv_punc)

In [17]:
tweets.head()

Unnamed: 0,Tweet,Avg
0,say apple far best customer care service ever ...,2.0
1,ios 7 fricking smooth beautiful thanxapple apple,2.0
2,love u apple,1.8
3,thank apple loving new iphone 5s apple iphone5...,1.8
4,apple best customer service new phone 10min,1.8


#### 4. Remove the word "apple"

In [18]:
#Removing the word "apple" from all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = " ".join([w for w in tweets.Tweet[i].split() if w != "apple"])

#### 5. Remove white spaces

In [19]:
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = tweets.Tweet[i].replace("  ", " ").strip()

#### 5. Stemming

In [20]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [21]:
#Remove stopwords from all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = " ".join([ps.stem(w) for w in tweets.Tweet[i].split()])

In [22]:
tweets.head()

Unnamed: 0,Tweet,Avg
0,say far best custom care servic ever receiv ap...,2.0
1,io 7 frick smooth beauti thanxappl,2.0
2,love u,1.8
3,thank love new iphon 5s iphone5 pictwittercomx...,1.8
4,best custom servic new phone 10min,1.8


In [23]:
tweets.shape

(1181, 2)

### Document-Term Matrix

#### 1. Get the Bag-Of-Words (BOW) Dataframe with TF-IDF vectorizor

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
#TF-IDF vectorizer for Train, Validation and Test data
tfidf_vectorizer=TfidfVectorizer()
tfidf_vectors=tfidf_vectorizer.fit_transform(tweets.Tweet)


In [26]:
names=tfidf_vectorizer.get_feature_names()

In [27]:
#Converting the above compressed objects into numpy array

tfidf_vectors=tfidf_vectors.toarray()


In [28]:
#Converting the above arrays into data frames with proper column names
tfidf_vectors=pd.DataFrame(tfidf_vectors,columns=names)


In [29]:
tfidf_vectors

Unnamed: 0,000,075,0909,0910,099,10,100,100m,1085,10min,...,yourthought,youtub,yu,yurbud,z10,ze,zifmstereo,zimmerman,zippo,zostac
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.569149,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 2. Dimension Reduction

In [30]:
#using count vectoroizer to create a document-term matrix
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [31]:
X=cv.fit_transform(tweets.Tweet).toarray()
X.shape

(1181, 3475)

In [32]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
DTM=pd.DataFrame(X,columns=cv.get_feature_names())

#### REDUCING COLUMNS

In [34]:
for col in list(DTM):
    prop = DTM[col].sum()/DTM.shape[0]
    if prop*100 < 1:
        DTM = DTM.drop([col],axis=1)

In [35]:
DTM.shape

(1181, 135)

In [36]:
seperated_variables=list(DTM)
#seperated_variables


#### 3. Train-Validation-Test Split

In [37]:
#Creating the target variable
y = tweets.Avg <= -1
tfidf_vectors=tfidf_vectors[seperated_variables]

In [38]:
#For test-train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectors, y, test_size = 0.2, stratify=y)

In [39]:
#Train-Validation Split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, stratify=y_train)

In [40]:
X_valid.head(3)

Unnamed: 0,5c,5s,android,announc,anyon,app,appl,back,batteri,best,...,way,well,wish,work,would,wtf,yall,year,yet,you
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Classification Model

#### 2. Classification Tree

In [41]:
#For fitting regression tree
from sklearn.tree import DecisionTreeClassifier

#For plotting a decision tree
from sklearn import tree

In [42]:
#Classification Tree Model Fitting
DC=DecisionTreeClassifier()


In [43]:
DC.fit(X_train,y_train)

DecisionTreeClassifier()

In [44]:
y_pred=DC.predict(X_valid)

In [45]:
from sklearn.metrics import accuracy_score

In [46]:
accuracy_score(y_pred,y_valid)

0.8347457627118644

### Prunned DTC

In [47]:
#Classification Tree Model Fitting
DPC=DecisionTreeClassifier(ccp_alpha=0.05)


In [48]:
DPC.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.05)

In [49]:
y_pred=DPC.predict(X_valid)

In [50]:
accuracy_score(y_pred,y_valid)

0.8686440677966102

# WordToVec Word Embedding

### 2. Get word embeddings

In [51]:
tweets = pd.read_csv("tweets.csv")

In [52]:
tweets.head()

Unnamed: 0,Tweet,Avg
0,"I have to say, Apple has by far the best custo...",2.0
1,iOS 7 is so fricking smooth & beautiful!! #Tha...,2.0
2,LOVE U @APPLE,1.8
3,"Thank you @apple, loving my new iPhone 5S!!!!!...",1.8
4,.@apple has the best customer service. In and ...,1.8


### Test Pre-processing

#### 1. Converting the text into lower cases

In [53]:
#normalizing all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = tweets.Tweet[i].lower()

In [54]:
tweets.head()

Unnamed: 0,Tweet,Avg
0,"i have to say, apple has by far the best custo...",2.0
1,ios 7 is so fricking smooth & beautiful!! #tha...,2.0
2,love u @apple,1.8
3,"thank you @apple, loving my new iphone 5s!!!!!...",1.8
4,.@apple has the best customer service. in and ...,1.8


#### 2. Remove stopwords (and the word "apple")

In [55]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\vset
[nltk_data]     Info\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
#NLTK stopword list
stop_words = stopwords.words("english")
len(stop_words)

179

In [57]:
#Removing stopwords from all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = " ".join([w for w in tweets.Tweet[i].split() if w not in stop_words])

In [58]:
tweets.head()

Unnamed: 0,Tweet,Avg
0,"say, apple far best customer care service ever...",2.0
1,ios 7 fricking smooth & beautiful!! #thanxappl...,2.0
2,love u @apple,1.8
3,"thank @apple, loving new iphone 5s!!!!! #apple...",1.8
4,.@apple best customer service. new phone 10min!,1.8


#### 3. Remove Punctuations/special symbols and 

In [59]:
#Removing punctuation from a single tweet
import string
p = string.punctuation
remv_punc = str.maketrans("", "", p)

In [60]:
#Removing punctuation from all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = tweets.Tweet[i].translate(remv_punc)

In [61]:
tweets.head()

Unnamed: 0,Tweet,Avg
0,say apple far best customer care service ever ...,2.0
1,ios 7 fricking smooth beautiful thanxapple apple,2.0
2,love u apple,1.8
3,thank apple loving new iphone 5s apple iphone5...,1.8
4,apple best customer service new phone 10min,1.8


#### 4. Remove the word "apple"

In [62]:
#Removing the word "apple" from all the tweets
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = " ".join([w for w in tweets.Tweet[i].split() if w != "apple"])

#### 5. Remove white spaces

In [63]:
for i in range(len(tweets.Tweet)):
    tweets.Tweet[i] = tweets.Tweet[i].replace("  ", " ").strip()

In [64]:
import nltk
from gensim.models import KeyedVectors

In [65]:
embeddings = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary = True)

In [66]:
embeddings.index_to_key


['</s>',
 'in',
 'for',
 'that',
 'is',
 'on',
 '##',
 'The',
 'with',
 'said',
 'was',
 'the',
 'at',
 'not',
 'as',
 'it',
 'be',
 'from',
 'by',
 'are',
 'I',
 'have',
 'he',
 'will',
 'has',
 '####',
 'his',
 'an',
 'this',
 'or',
 'their',
 'who',
 'they',
 'but',
 '$',
 'had',
 'year',
 'were',
 'we',
 'more',
 '###',
 'up',
 'been',
 'you',
 'its',
 'one',
 'about',
 'would',
 'which',
 'out',
 'can',
 'It',
 'all',
 'also',
 'two',
 'after',
 'first',
 'He',
 'do',
 'time',
 'than',
 'when',
 'We',
 'over',
 'last',
 'new',
 'other',
 'her',
 'people',
 'into',
 'In',
 'our',
 'there',
 'A',
 'she',
 'could',
 'just',
 'years',
 'some',
 'U.S.',
 'three',
 'million',
 'them',
 'what',
 'But',
 'so',
 'no',
 'like',
 'if',
 'only',
 'percent',
 'get',
 'did',
 'him',
 'game',
 'back',
 'because',
 'now',
 '#.#',
 'before',
 'company',
 'any',
 'team',
 'against',
 'off',
 'This',
 'most',
 'made',
 'through',
 'make',
 'second',
 'state',
 'well',
 'day',
 'season',
 'says',
 'w

In [67]:
#((embeddings[tweets.Tweet[0].split()[0]]+embeddings[tweets.Tweet[0].split()[1]]+embeddings[tweets.Tweet[0].split()[6]]))
dfs=[]
for i in tweets.Tweet:
    x=(i.split())
    ar=np.zeros(300)
    for j in x:
        if j in embeddings.index_to_key:
            ar+=embeddings[j]
    dfs.append(ar)  

In [68]:
X=pd.DataFrame(np.row_stack(dfs))


In [69]:
Y=tweets.Avg<=-1


# Training data On Word to Vec

In [70]:
#For test-train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, stratify=y)

In [71]:
#Train-Validation Split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.25, stratify=y_train)

In [72]:
X_valid.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
855,0.220276,0.484375,-0.74585,1.414673,-1.508675,0.848999,0.460205,0.095398,1.107239,0.522034,...,0.601898,1.650513,-1.20343,-0.603271,0.001587,-0.078861,-1.232574,-1.668365,-0.560394,-0.015301
1068,-0.192078,-0.180664,0.414368,0.233368,-1.099243,0.47052,1.20459,-0.255371,0.713501,0.567871,...,-0.279114,0.543091,-0.394531,-0.001953,0.326782,-0.140198,-0.232544,-0.772156,-0.406494,0.697754
159,-0.461777,0.339691,-1.227615,0.425171,0.017281,-0.073486,-0.335052,-0.890015,0.677124,1.119141,...,-0.065186,-0.349365,-0.094604,0.211533,0.040161,-1.178711,-0.295959,-0.617432,-0.524658,-0.081726


### Classification Model

#### 2. Classification Tree

In [73]:
#Classification Tree Model Fitting
DC_word_vec=DecisionTreeClassifier()


In [74]:
DC_word_vec.fit(X_train,y_train)

DecisionTreeClassifier()

In [75]:
y_pred=DC_word_vec.predict(X_valid)

In [76]:
accuracy_score(y_pred,y_valid)

0.7627118644067796

# WITH PRUNNING

In [77]:
#Classification Tree Model Fitting
DC_word_vec_prun=DecisionTreeClassifier(ccp_alpha=0.05)


In [78]:
DC_word_vec_prun.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.05)

In [79]:
y_pred=DC_word_vec_prun.predict(X_valid)

In [80]:
accuracy_score(y_pred,y_valid)

0.847457627118644

# CONCLUSION

#### Accuracy of TF-IDF without prunning model is greater than accuracy of WordToVec without prunning model

#### Accuracy of TF-IDF prunning model is greater than accuracy of WordToVec  prunning model