### Importing Libraries

In [101]:
import pandas as pd
import re
import numpy as np

from nltk.tokenize import TweetTokenizer
from nltk import PorterStemmer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

### Reading Training Dataset



We are using _ as a seperator/ delimeter

In [102]:
trainData = pd.read_csv("TrainData.csv","_")

trainData

Unnamed: 0,Index,Comment,Polarity
0,0,time to eat with my best buddy! #lunch,Happy
1,1,@user @user if they want reflection money. #ksleg,Happy
2,2,---Good job but I’ will expect a lot more in f...,Happy
3,3,totally dissatisfied with the service###%%@@ n...,Sad
4,4,loved my work!!!!!,Happy
5,5,Worst customer care service......@@$$$angry,Sad
6,6,Brilliant effort guys!!!,Happy
7,7,@user @user you point one finger @user million...,Sad
8,8,"words r free, it's how u use that can cost you...",Happy
9,9,you might be a libtard if... #libtard #sjw #li...,Sad


### Reading Test Dataset

In [103]:
testData = pd.read_csv("TestData.csv","_")

testData

Unnamed: 0,Index,Comment,Polarity
0,0,@use the pic says otherwise for young girls co...,Sad
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,Happy
2,2,@user when you're blocked by a troll because y...,Sad
3,3,dinner with sister!!,Happy
4,4,who else is planning on watching @user tomorrow?,happy


# PreProcessing Begins

In [104]:
#Function to remove @user in the data

def remove_pattern(text,pattern):
    
    # re.findall() finds the pattern i.e @user and creating list
    r = re.findall(pattern,text)
    
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    
    return text



In [105]:
#Applying function to both datasets for removing @user

trainData['Processed_Comment'] = np.vectorize(remove_pattern)(trainData['Comment'], "@[\w]*")

testData['Processed_Comment'] = np.vectorize(remove_pattern)(testData['Comment'], "@[\w]*")


In [106]:
# Removing everything except text i.e letters/words

trainData['Processed_Comment'] = trainData['Processed_Comment'].str.replace("[^a-zA-Z]", " ")

trainData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,time to eat with my best buddy! #lunch,Happy,time to eat with my best buddy lunch
1,1,@user @user if they want reflection money. #ksleg,Happy,if they want reflection money ksleg
2,2,---Good job but I’ will expect a lot more in f...,Happy,Good job but I will expect a lot more in f...
3,3,totally dissatisfied with the service###%%@@ n...,Sad,totally dissatisfied with the service nev...
4,4,loved my work!!!!!,Happy,loved my work
5,5,Worst customer care service......@@$$$angry,Sad,Worst customer care service angry
6,6,Brilliant effort guys!!!,Happy,Brilliant effort guys
7,7,@user @user you point one finger @user million...,Sad,you point one finger millions are pointed r...
8,8,"words r free, it's how u use that can cost you...",Happy,words r free it s how u use that can cost you...
9,9,you might be a libtard if... #libtard #sjw #li...,Sad,you might be a libtard if libtard sjw li...


In [107]:
# Removing everything except text i.e letters/words

testData['Processed_Comment'] = testData['Processed_Comment'].str.replace("[^a-zA-Z]", " ")

testData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,@use the pic says otherwise for young girls co...,Sad,the pic says otherwise for young girls confin...
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,Happy,good night faith ever vaitacacommafiasdv
2,2,@user when you're blocked by a troll because y...,Sad,when you re blocked by a troll because you pr...
3,3,dinner with sister!!,Happy,dinner with sister
4,4,who else is planning on watching @user tomorrow?,happy,who else is planning on watching tomorrow


In [108]:
#Removing Short Words
trainData['Processed_Comment'] = trainData['Processed_Comment'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

trainData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,time to eat with my best buddy! #lunch,Happy,time with best buddy lunch
1,1,@user @user if they want reflection money. #ksleg,Happy,they want reflection money ksleg
2,2,---Good job but I’ will expect a lot more in f...,Happy,Good will expect more future
3,3,totally dissatisfied with the service###%%@@ n...,Sad,totally dissatisfied with service never used t...
4,4,loved my work!!!!!,Happy,loved work
5,5,Worst customer care service......@@$$$angry,Sad,Worst customer care service angry
6,6,Brilliant effort guys!!!,Happy,Brilliant effort guys
7,7,@user @user you point one finger @user million...,Sad,point finger millions pointed right back jewis...
8,8,"words r free, it's how u use that can cost you...",Happy,words free that cost verbal abuse love adult teen
9,9,you might be a libtard if... #libtard #sjw #li...,Sad,might libtard libtard liberal politics


In [109]:
#Removing Short Words
testData['Processed_Comment'] = testData['Processed_Comment'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

testData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,@use the pic says otherwise for young girls co...,Sad,says otherwise young girls confined that kitch...
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,Happy,good night faith ever vaitacacommafiasdv
2,2,@user when you're blocked by a troll because y...,Sad,when blocked troll because promise blacklivesm...
3,3,dinner with sister!!,Happy,dinner with sister
4,4,who else is planning on watching @user tomorrow?,happy,else planning watching tomorrow


In [110]:
trainData['Polarity'] = trainData['Polarity'].apply(lambda x: x.capitalize())

trainData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,time to eat with my best buddy! #lunch,Happy,time with best buddy lunch
1,1,@user @user if they want reflection money. #ksleg,Happy,they want reflection money ksleg
2,2,---Good job but I’ will expect a lot more in f...,Happy,Good will expect more future
3,3,totally dissatisfied with the service###%%@@ n...,Sad,totally dissatisfied with service never used t...
4,4,loved my work!!!!!,Happy,loved work
5,5,Worst customer care service......@@$$$angry,Sad,Worst customer care service angry
6,6,Brilliant effort guys!!!,Happy,Brilliant effort guys
7,7,@user @user you point one finger @user million...,Sad,point finger millions pointed right back jewis...
8,8,"words r free, it's how u use that can cost you...",Happy,words free that cost verbal abuse love adult teen
9,9,you might be a libtard if... #libtard #sjw #li...,Sad,might libtard libtard liberal politics


In [111]:
testData['Polarity'] = testData['Polarity'].apply(lambda x: x.capitalize())

testData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,@use the pic says otherwise for young girls co...,Sad,says otherwise young girls confined that kitch...
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,Happy,good night faith ever vaitacacommafiasdv
2,2,@user when you're blocked by a troll because y...,Sad,when blocked troll because promise blacklivesm...
3,3,dinner with sister!!,Happy,dinner with sister
4,4,who else is planning on watching @user tomorrow?,Happy,else planning watching tomorrow


### Label Encoding for Train/Test Data

In [112]:
def labelEncoder(polarity):
    if(polarity == 'Happy'):
        return 1
    return 0

In [113]:
trainData['Polarity'] = trainData['Polarity'].apply(lambda x: labelEncoder(x))

trainData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,time to eat with my best buddy! #lunch,1,time with best buddy lunch
1,1,@user @user if they want reflection money. #ksleg,1,they want reflection money ksleg
2,2,---Good job but I’ will expect a lot more in f...,1,Good will expect more future
3,3,totally dissatisfied with the service###%%@@ n...,0,totally dissatisfied with service never used t...
4,4,loved my work!!!!!,1,loved work
5,5,Worst customer care service......@@$$$angry,0,Worst customer care service angry
6,6,Brilliant effort guys!!!,1,Brilliant effort guys
7,7,@user @user you point one finger @user million...,0,point finger millions pointed right back jewis...
8,8,"words r free, it's how u use that can cost you...",1,words free that cost verbal abuse love adult teen
9,9,you might be a libtard if... #libtard #sjw #li...,0,might libtard libtard liberal politics


In [114]:
testData['Polarity'] = testData['Polarity'].apply(lambda x: labelEncoder(x))

testData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,@use the pic says otherwise for young girls co...,0,says otherwise young girls confined that kitch...
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,1,good night faith ever vaitacacommafiasdv
2,2,@user when you're blocked by a troll because y...,0,when blocked troll because promise blacklivesm...
3,3,dinner with sister!!,1,dinner with sister
4,4,who else is planning on watching @user tomorrow?,1,else planning watching tomorrow


### Tokanizing Tweets of train data

In [115]:
tokenized_trainComment = trainData['Processed_Comment'].apply(lambda x: x.split())

tokenized_trainComment

0                     [time, with, best, buddy, lunch]
1               [they, want, reflection, money, ksleg]
2                   [Good, will, expect, more, future]
3    [totally, dissatisfied, with, service, never, ...
4                                        [loved, work]
5              [Worst, customer, care, service, angry]
6                            [Brilliant, effort, guys]
7    [point, finger, millions, pointed, right, back...
8    [words, free, that, cost, verbal, abuse, love,...
9         [might, libtard, libtard, liberal, politics]
Name: Processed_Comment, dtype: object

#### Removing additional letters such as ed, 's etc.

In [116]:
ps = PorterStemmer()

tokenized_trainComment = tokenized_trainComment.apply(lambda x: [ps.stem(i) for i in x])

tokenized_trainComment

0                     [time, with, best, buddi, lunch]
1                  [they, want, reflect, money, ksleg]
2                    [good, will, expect, more, futur]
3    [total, dissatisfi, with, servic, never, use, ...
4                                         [love, work]
5                 [worst, custom, care, servic, angri]
6                             [brilliant, effort, guy]
7    [point, finger, million, point, right, back, j...
8    [word, free, that, cost, verbal, abus, love, a...
9              [might, libtard, libtard, liber, polit]
Name: Processed_Comment, dtype: object

#### Replacing old Processed comments

In [117]:
for i in range(len(tokenized_trainComment)):
    tokenized_trainComment[i] = ' '.join(tokenized_trainComment[i])

trainData['Processed_Comment'] = tokenized_trainComment

trainData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,time to eat with my best buddy! #lunch,1,time with best buddi lunch
1,1,@user @user if they want reflection money. #ksleg,1,they want reflect money ksleg
2,2,---Good job but I’ will expect a lot more in f...,1,good will expect more futur
3,3,totally dissatisfied with the service###%%@@ n...,0,total dissatisfi with servic never use thi ser...
4,4,loved my work!!!!!,1,love work
5,5,Worst customer care service......@@$$$angry,0,worst custom care servic angri
6,6,Brilliant effort guys!!!,1,brilliant effort guy
7,7,@user @user you point one finger @user million...,0,point finger million point right back jewishsu...
8,8,"words r free, it's how u use that can cost you...",1,word free that cost verbal abus love adult teen
9,9,you might be a libtard if... #libtard #sjw #li...,0,might libtard libtard liber polit


### Tokanizing Tweets of test data

In [119]:
tokenized_testComment = testData["Processed_Comment"].apply(lambda x: x.split())

tokenized_testComment


0    [says, otherwise, young, girls, confined, that...
1       [good, night, faith, ever, vaitacacommafiasdv]
2    [when, blocked, troll, because, promise, black...
3                               [dinner, with, sister]
4                 [else, planning, watching, tomorrow]
Name: Processed_Comment, dtype: object

#### Removing additional letters such as ed, 's etc.

In [124]:
ps = PorterStemmer()

tokenized_testComment = tokenized_testComment.apply(lambda x : [ps.stem(i) for i in x])


tokenized_testComment

0    [say, otherwi, young, girl, confin, that, kitc...
1       [good, night, faith, ever, vaitacacommafiasdv]
2    [when, block, troll, becau, promi, blacklivesm...
3                               [dinner, with, sister]
4                          [el, plan, watch, tomorrow]
Name: Processed_Comment, dtype: object

#### Replacing old Processed comments


In [125]:

for i in range(len(tokenized_testComment)):
    tokenized_testComment[i] = ' '.join(tokenized_testComment[i])
    
testData['Processed_Comment'] = tokenized_testComment
    
testData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment,3
0,0,@use the pic says otherwise for young girls co...,0,says otherwise young girls confined that kitch...,
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,1,good night faith ever vaitacacommafiasdv,
2,2,@user when you're blocked by a troll because y...,0,when blocked troll because promise blacklivesm...,
3,3,dinner with sister!!,1,dinner with sister,
4,4,who else is planning on watching @user tomorrow?,1,else planning watching tomorrow,


### Feature Extraction from Train Data

#### Bag of Words

In [118]:
cv= CountVectorizer()

bag_of_words = cv.fit_transform(trainData['Processed_Comment']).toarray()

print(cv.vocabulary_)

print(cv.get_feature_names())
print('\n')
print(bag_of_words)


{'time': 39, 'with': 45, 'best': 5, 'buddi': 7, 'lunch': 24, 'they': 37, 'want': 43, 'reflect': 32, 'money': 27, 'ksleg': 20, 'good': 17, 'will': 44, 'expect': 13, 'more': 28, 'futur': 16, 'total': 40, 'dissatisfi': 11, 'servic': 34, 'never': 29, 'use': 41, 'thi': 38, 'again': 2, 'love': 23, 'work': 47, 'worst': 48, 'custom': 10, 'care': 8, 'angri': 3, 'brilliant': 6, 'effort': 12, 'guy': 18, 'point': 30, 'finger': 14, 'million': 26, 'right': 33, 'back': 4, 'jewishsupremacist': 19, 'word': 46, 'free': 15, 'that': 36, 'cost': 9, 'verbal': 42, 'abus': 0, 'adult': 1, 'teen': 35, 'might': 25, 'libtard': 22, 'liber': 21, 'polit': 31}
['abus', 'adult', 'again', 'angri', 'back', 'best', 'brilliant', 'buddi', 'care', 'cost', 'custom', 'dissatisfi', 'effort', 'expect', 'finger', 'free', 'futur', 'good', 'guy', 'jewishsupremacist', 'ksleg', 'liber', 'libtard', 'love', 'lunch', 'might', 'million', 'money', 'more', 'never', 'point', 'polit', 'reflect', 'right', 'servic', 'teen', 'that', 'they', 't

### Feature Extraction for Test Data

#### Bag of Words

In [128]:
cv= CountVectorizer()

bag_of_words = cv.fit_transform(testData['Processed_Comment']).toarray()

print(cv.vocabulary_)

print(cv.get_feature_names())
print('\n')
print(bag_of_words)


{'says': 21, 'otherwise': 16, 'young': 32, 'girls': 10, 'confined': 5, 'that': 23, 'kitchen': 12, 'void': 28, 'meaning': 13, 'beyond': 1, 'cheap': 4, 'publicity': 19, 'topoli': 25, 'good': 11, 'night': 14, 'faith': 9, 'ever': 8, 'vaitacacommafiasdv': 27, 'when': 30, 'blocked': 3, 'troll': 26, 'because': 0, 'promise': 18, 'blacklivesmatter': 2, 'nonsensical': 15, 'rants': 20, 'dinner': 6, 'with': 31, 'sister': 22, 'else': 7, 'planning': 17, 'watching': 29, 'tomorrow': 24}
['because', 'beyond', 'blacklivesmatter', 'blocked', 'cheap', 'confined', 'dinner', 'else', 'ever', 'faith', 'girls', 'good', 'kitchen', 'meaning', 'night', 'nonsensical', 'otherwise', 'planning', 'promise', 'publicity', 'rants', 'says', 'sister', 'that', 'tomorrow', 'topoli', 'troll', 'vaitacacommafiasdv', 'void', 'watching', 'when', 'with', 'young']


[[0 1 0 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 1]
 [0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [1 0 1 1 0 0 0 0 0 0 0 0 0 0 