### Importing Libraries

In [44]:
import pandas as pd
import re
import numpy as np
import nltk

from nltk.tokenize import TweetTokenizer
from nltk import PorterStemmer
from nltk import ngrams

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


### Reading Training Dataset



We use `_` as the separator (delimiter) when reading the CSV files.

In [2]:
# Load the training set; the fields in the file are separated by underscores.
# The separator is passed by keyword because pandas 2.0 made every read_csv
# argument after the path keyword-only, so the positional form fails there.
trainData = pd.read_csv("TrainData.csv", sep="_")

trainData

Unnamed: 0,Index,Comment,Polarity
0,0,time to eat with my best buddy! #lunch,Happy
1,1,@user @user if they want reflection money. #ksleg,Happy
2,2,---Good job but I’ will expect a lot more in f...,Happy
3,3,totally dissatisfied with the service###%%@@ n...,Sad
4,4,loved my work!!!!!,Happy
5,5,Worst customer care service......@@$$$angry,Sad
6,6,Brilliant effort guys!!!,Happy
7,7,@user @user you point one finger @user million...,Sad
8,8,"words r free, it's how u use that can cost you...",Happy
9,9,you might be a libtard if... #libtard #sjw #li...,Sad


### Reading Test Dataset

In [3]:
# Load the test set; the fields in the file are separated by underscores.
# The separator is passed by keyword because pandas 2.0 made every read_csv
# argument after the path keyword-only, so the positional form fails there.
testData = pd.read_csv("TestData.csv", sep="_")

testData

Unnamed: 0,Index,Comment,Polarity
0,0,@use the pic says otherwise for young girls co...,Sad
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,Happy
2,2,@user when you're blocked by a troll because y...,Sad
3,3,dinner with sister!!,Happy
4,4,who else is planning on watching @user tomorrow?,happy


# PreProcessing Begins

In [4]:
#Function to remove @user in the data

def remove_pattern(text,pattern):
    """Return ``text`` with every match of the regex ``pattern`` deleted.

    Parameters
    ----------
    text : str
        The string to clean.
    pattern : str
        A regular expression; each non-overlapping match is replaced by "".

    Returns
    -------
    str
        ``text`` without the matched substrings.
    """
    # Substitute with the pattern itself. Re-using each *matched text* as a new
    # regex is wrong in three ways: a short match such as "@user" also strips
    # the prefix of "@username", a bare "@" match strips every "@" and leaves
    # the handle behind, and matches containing regex metacharacters raise.
    return re.sub(pattern, "", text)



In [5]:
#Applying function to both datasets for removing @user

# The pattern is a raw string so that `\w` reaches the regex engine unchanged;
# in a normal string literal it is an invalid escape sequence that newer Python
# versions warn about. Series.apply is used instead of np.vectorize, which
# raises on an empty column unless `otypes` is given.
trainData['Processed_Comment'] = trainData['Comment'].apply(remove_pattern, args=(r"@[\w]*",))

testData['Processed_Comment'] = testData['Comment'].apply(remove_pattern, args=(r"@[\w]*",))


In [6]:
# Removing everything except text i.e letters/words

# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, in which case "[^a-zA-Z]" would
# never match and the comments would be left uncleaned.
trainData['Processed_Comment'] = trainData['Processed_Comment'].str.replace("[^a-zA-Z]", " ", regex=True)

trainData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,time to eat with my best buddy! #lunch,Happy,time to eat with my best buddy lunch
1,1,@user @user if they want reflection money. #ksleg,Happy,if they want reflection money ksleg
2,2,---Good job but I’ will expect a lot more in f...,Happy,Good job but I will expect a lot more in f...
3,3,totally dissatisfied with the service###%%@@ n...,Sad,totally dissatisfied with the service nev...
4,4,loved my work!!!!!,Happy,loved my work
5,5,Worst customer care service......@@$$$angry,Sad,Worst customer care service angry
6,6,Brilliant effort guys!!!,Happy,Brilliant effort guys
7,7,@user @user you point one finger @user million...,Sad,you point one finger millions are pointed r...
8,8,"words r free, it's how u use that can cost you...",Happy,words r free it s how u use that can cost you...
9,9,you might be a libtard if... #libtard #sjw #li...,Sad,you might be a libtard if libtard sjw li...


In [7]:
# Removing everything except text i.e letters/words

# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, in which case "[^a-zA-Z]" would
# never match and the comments would be left uncleaned.
testData['Processed_Comment'] = testData['Processed_Comment'].str.replace("[^a-zA-Z]", " ", regex=True)

testData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,@use the pic says otherwise for young girls co...,Sad,the pic says otherwise for young girls confin...
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,Happy,good night faith ever vaitacacommafiasdv
2,2,@user when you're blocked by a troll because y...,Sad,when you re blocked by a troll because you pr...
3,3,dinner with sister!!,Happy,dinner with sister
4,4,who else is planning on watching @user tomorrow?,happy,who else is planning on watching tomorrow


In [8]:
#Removing Short Words
# Keep only the words longer than three characters and re-join them with
# single spaces.
trainData['Processed_Comment'] = trainData['Processed_Comment'].apply(
    lambda comment: ' '.join(word for word in comment.split() if len(word) > 3)
)

trainData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,time to eat with my best buddy! #lunch,Happy,time with best buddy lunch
1,1,@user @user if they want reflection money. #ksleg,Happy,they want reflection money ksleg
2,2,---Good job but I’ will expect a lot more in f...,Happy,Good will expect more future
3,3,totally dissatisfied with the service###%%@@ n...,Sad,totally dissatisfied with service never used t...
4,4,loved my work!!!!!,Happy,loved work
5,5,Worst customer care service......@@$$$angry,Sad,Worst customer care service angry
6,6,Brilliant effort guys!!!,Happy,Brilliant effort guys
7,7,@user @user you point one finger @user million...,Sad,point finger millions pointed right back jewis...
8,8,"words r free, it's how u use that can cost you...",Happy,words free that cost verbal abuse love adult teen
9,9,you might be a libtard if... #libtard #sjw #li...,Sad,might libtard libtard liberal politics


In [9]:
#Removing Short Words
# Keep only the words longer than three characters and re-join them with
# single spaces.
testData['Processed_Comment'] = testData['Processed_Comment'].apply(
    lambda comment: ' '.join(word for word in comment.split() if len(word) > 3)
)

testData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,@use the pic says otherwise for young girls co...,Sad,says otherwise young girls confined that kitch...
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,Happy,good night faith ever vaitacacommafiasdv
2,2,@user when you're blocked by a troll because y...,Sad,when blocked troll because promise blacklivesm...
3,3,dinner with sister!!,Happy,dinner with sister
4,4,who else is planning on watching @user tomorrow?,happy,else planning watching tomorrow


In [10]:
# Normalise label casing (e.g. "happy" -> "Happy") so that every spelling of a
# label compares equal in the encoding step.
trainData['Polarity'] = trainData['Polarity'].map(lambda label: label.capitalize())

trainData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,time to eat with my best buddy! #lunch,Happy,time with best buddy lunch
1,1,@user @user if they want reflection money. #ksleg,Happy,they want reflection money ksleg
2,2,---Good job but I’ will expect a lot more in f...,Happy,Good will expect more future
3,3,totally dissatisfied with the service###%%@@ n...,Sad,totally dissatisfied with service never used t...
4,4,loved my work!!!!!,Happy,loved work
5,5,Worst customer care service......@@$$$angry,Sad,Worst customer care service angry
6,6,Brilliant effort guys!!!,Happy,Brilliant effort guys
7,7,@user @user you point one finger @user million...,Sad,point finger millions pointed right back jewis...
8,8,"words r free, it's how u use that can cost you...",Happy,words free that cost verbal abuse love adult teen
9,9,you might be a libtard if... #libtard #sjw #li...,Sad,might libtard libtard liberal politics


In [11]:
# Normalise label casing (e.g. "happy" -> "Happy") so that every spelling of a
# label compares equal in the encoding step.
testData['Polarity'] = testData['Polarity'].map(lambda label: label.capitalize())

testData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,@use the pic says otherwise for young girls co...,Sad,says otherwise young girls confined that kitch...
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,Happy,good night faith ever vaitacacommafiasdv
2,2,@user when you're blocked by a troll because y...,Sad,when blocked troll because promise blacklivesm...
3,3,dinner with sister!!,Happy,dinner with sister
4,4,who else is planning on watching @user tomorrow?,Happy,else planning watching tomorrow


### Label Encoding for Train/Test Data

In [12]:
def labelEncoder(polarity):
    """Encode a polarity label as an integer.

    Returns 1 when ``polarity`` equals the string 'Happy' (case-sensitive) and
    0 for every other value.
    """
    return 1 if polarity == 'Happy' else 0

In [13]:
# Replace the text labels with their integer codes (1 = 'Happy', 0 = other).
# NOTE: this overwrites the column, so running the cell a second time would
# encode the already-numeric values and turn every label into 0.
trainData['Polarity'] = trainData['Polarity'].apply(labelEncoder)

trainData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,time to eat with my best buddy! #lunch,1,time with best buddy lunch
1,1,@user @user if they want reflection money. #ksleg,1,they want reflection money ksleg
2,2,---Good job but I’ will expect a lot more in f...,1,Good will expect more future
3,3,totally dissatisfied with the service###%%@@ n...,0,totally dissatisfied with service never used t...
4,4,loved my work!!!!!,1,loved work
5,5,Worst customer care service......@@$$$angry,0,Worst customer care service angry
6,6,Brilliant effort guys!!!,1,Brilliant effort guys
7,7,@user @user you point one finger @user million...,0,point finger millions pointed right back jewis...
8,8,"words r free, it's how u use that can cost you...",1,words free that cost verbal abuse love adult teen
9,9,you might be a libtard if... #libtard #sjw #li...,0,might libtard libtard liberal politics


In [14]:
# Replace the text labels with their integer codes (1 = 'Happy', 0 = other).
# NOTE: this overwrites the column, so running the cell a second time would
# encode the already-numeric values and turn every label into 0.
testData['Polarity'] = testData['Polarity'].apply(labelEncoder)

testData

Unnamed: 0,Index,Comment,Polarity,Processed_Comment
0,0,@use the pic says otherwise for young girls co...,0,says otherwise young girls confined that kitch...
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,1,good night faith ever vaitacacommafiasdv
2,2,@user when you're blocked by a troll because y...,0,when blocked troll because promise blacklivesm...
3,3,dinner with sister!!,1,dinner with sister
4,4,who else is planning on watching @user tomorrow?,1,else planning watching tomorrow


### Tokenizing Comments of Train Data

In [15]:
# Split every cleaned comment on whitespace into a list of word tokens.
tokenized_trainComment = trainData['Processed_Comment'].map(lambda comment: comment.split())

tokenized_trainComment

0                     [time, with, best, buddy, lunch]
1               [they, want, reflection, money, ksleg]
2                   [Good, will, expect, more, future]
3    [totally, dissatisfied, with, service, never, ...
4                                        [loved, work]
5              [Worst, customer, care, service, angry]
6                            [Brilliant, effort, guys]
7    [point, finger, millions, pointed, right, back...
8    [words, free, that, cost, verbal, abuse, love,...
9         [might, libtard, libtard, liberal, politics]
Name: Processed_Comment, dtype: object

### POS Tagging

In [46]:
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
# Uncomment the two downloads above if the tokenizer / tagger resources are
# not yet installed in this environment.
trainDataList = trainData['Processed_Comment'].tolist()

# Tokenise each comment and tag every token with its part of speech;
# taggedList holds one list of (word, tag) pairs per comment.
taggedList = [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in trainDataList]

# Removing repetitions: dict.fromkeys keeps the first occurrence of every
# (word, tag) pair in order, without the quadratic cost of testing membership
# in a growing list.
posList = list(dict.fromkeys(pair for tagged in taggedList for pair in tagged))

posList

[('time', 'NN'),
 ('with', 'IN'),
 ('best', 'JJS'),
 ('buddy', 'NN'),
 ('lunch', 'NN'),
 ('they', 'PRP'),
 ('want', 'VBP'),
 ('reflection', 'NN'),
 ('money', 'NN'),
 ('ksleg', 'NN'),
 ('Good', 'NNP'),
 ('will', 'MD'),
 ('expect', 'VB'),
 ('more', 'JJR'),
 ('future', 'JJ'),
 ('totally', 'RB'),
 ('dissatisfied', 'JJ'),
 ('service', 'NN'),
 ('never', 'RB'),
 ('used', 'VBD'),
 ('this', 'DT'),
 ('again', 'RB'),
 ('loved', 'VBN'),
 ('work', 'NN'),
 ('Worst', 'NNP'),
 ('customer', 'NN'),
 ('care', 'NN'),
 ('angry', 'JJ'),
 ('Brilliant', 'JJ'),
 ('effort', 'NN'),
 ('guys', 'NNS'),
 ('point', 'NN'),
 ('finger', 'NN'),
 ('millions', 'NNS'),
 ('pointed', 'VBD'),
 ('right', 'JJ'),
 ('back', 'RB'),
 ('jewishsupremacist', 'NN'),
 ('words', 'NNS'),
 ('free', 'VBP'),
 ('that', 'IN'),
 ('cost', 'NN'),
 ('verbal', 'JJ'),
 ('abuse', 'NN'),
 ('love', 'NN'),
 ('adult', 'NN'),
 ('teen', 'NN'),
 ('might', 'MD'),
 ('libtard', 'RB'),
 ('libtard', 'VB'),
 ('liberal', 'JJ'),
 ('politics', 'NNS')]

#### Stemming: removing word endings such as -ed, -s, etc.

In [None]:
# Reduce every token of every comment to its Porter stem.
ps = PorterStemmer()

tokenized_trainComment = tokenized_trainComment.apply(
    lambda tokens: [ps.stem(token) for token in tokens]
)

tokenized_trainComment

#### Replacing old Processed comments

In [None]:
# Turn each stemmed token list back into one space-separated string.
# This is done element-wise instead of assigning tokenized_trainComment[i]
# inside a range(len(...)) loop: integer keys on a Series are label look-ups,
# so that loop is only correct while the index is exactly 0..n-1.
tokenized_trainComment = tokenized_trainComment.apply(' '.join)

trainData['Processed_Comment'] = tokenized_trainComment

trainData

### Tokenizing Comments of Test Data

In [38]:
# Split every cleaned comment on whitespace into a list of word tokens.
tokenized_testComment = testData["Processed_Comment"].map(lambda comment: comment.split())

tokenized_testComment

0    [says, otherwise, young, girls, confined, that...
1       [good, night, faith, ever, vaitacacommafiasdv]
2    [when, blocked, troll, because, promise, black...
3                               [dinner, with, sister]
4                 [else, planning, watching, tomorrow]
Name: Processed_Comment, dtype: object

### POS Tagging

In [45]:
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
# Uncomment the two downloads above if the tokenizer / tagger resources are
# not yet installed in this environment.
testDataList = testData['Processed_Comment'].tolist()

# Tokenise each comment and tag every token with its part of speech;
# taggedList holds one list of (word, tag) pairs per comment.
taggedList = [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in testDataList]

# Removing repetitions: dict.fromkeys keeps the first occurrence of every
# (word, tag) pair in order, without the quadratic cost of testing membership
# in a growing list.
posList = list(dict.fromkeys(pair for tagged in taggedList for pair in tagged))

posList

[('says', 'VBZ'),
 ('otherwise', 'RB'),
 ('young', 'JJ'),
 ('girls', 'NNS'),
 ('confined', 'VBD'),
 ('that', 'IN'),
 ('kitchen', 'NN'),
 ('void', 'NN'),
 ('meaning', 'VBG'),
 ('beyond', 'IN'),
 ('cheap', 'JJ'),
 ('publicity', 'NN'),
 ('topoli', 'NN'),
 ('good', 'JJ'),
 ('night', 'NN'),
 ('faith', 'NN'),
 ('ever', 'RB'),
 ('vaitacacommafiasdv', 'VBD'),
 ('when', 'WRB'),
 ('blocked', 'VBN'),
 ('troll', 'NN'),
 ('because', 'IN'),
 ('promise', 'NN'),
 ('blacklivesmatter', 'NN'),
 ('nonsensical', 'JJ'),
 ('rants', 'NNS'),
 ('dinner', 'NN'),
 ('with', 'IN'),
 ('sister', 'NN'),
 ('else', 'RB'),
 ('planning', 'VBG'),
 ('watching', 'VBG'),
 ('tomorrow', 'NN')]

#### Stemming: removing word endings such as -ed, -s, etc.

In [None]:
# Reduce every token of every comment to its Porter stem.
ps = PorterStemmer()

tokenized_testComment = tokenized_testComment.apply(
    lambda tokens: [ps.stem(token) for token in tokens]
)

tokenized_testComment

#### Replacing old Processed comments


In [None]:
# Turn each stemmed token list back into one space-separated string.
# This is done element-wise instead of assigning tokenized_testComment[i]
# inside a range(len(...)) loop: integer keys on a Series are label look-ups,
# so that loop is only correct while the index is exactly 0..n-1.
tokenized_testComment = tokenized_testComment.apply(' '.join)

testData['Processed_Comment'] = tokenized_testComment

testData

## Feature Extraction from Train Data

### Bag of Words

In [None]:
# Learn the vocabulary from the training comments and count how often each
# vocabulary word occurs in each comment (rows = comments, columns = words).
cv= CountVectorizer()

bag_of_words_train = cv.fit_transform(trainData['Processed_Comment']).toarray()

print(cv.vocabulary_)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    print(cv.get_feature_names_out().tolist())
else:
    print(cv.get_feature_names())
print('\n')
print(bag_of_words_train)


### Word Count

In [None]:
# Total number of word occurrences recorded in the training bag-of-words
# matrix. Summing the matrix entries gives that total; accumulating the loop
# index over range(len(...)) only added up the row numbers 0..n-1.
wordCount = int(bag_of_words_train.sum())

wordCount


### TF-IDF

In [None]:
# Fit a TF-IDF model on the training comments and show the resulting weights
# as a table with one row per comment and one column per vocabulary term.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(trainData['Processed_Comment'])

trainData_tfidf = pd.DataFrame(tfidf_matrix.toarray())

display(trainData_tfidf)

### N-Gram

In [59]:
# Collect every run of n consecutive words (here trigrams) from each comment.
trainDataList = trainData['Processed_Comment'].tolist()
grams = list()
n = 3

for sentence in trainDataList:
    grams.extend(ngrams(sentence.split(), n))

grams


[('time', 'with', 'best'),
 ('with', 'best', 'buddy'),
 ('best', 'buddy', 'lunch'),
 ('they', 'want', 'reflection'),
 ('want', 'reflection', 'money'),
 ('reflection', 'money', 'ksleg'),
 ('Good', 'will', 'expect'),
 ('will', 'expect', 'more'),
 ('expect', 'more', 'future'),
 ('totally', 'dissatisfied', 'with'),
 ('dissatisfied', 'with', 'service'),
 ('with', 'service', 'never'),
 ('service', 'never', 'used'),
 ('never', 'used', 'this'),
 ('used', 'this', 'service'),
 ('this', 'service', 'again'),
 ('Worst', 'customer', 'care'),
 ('customer', 'care', 'service'),
 ('care', 'service', 'angry'),
 ('Brilliant', 'effort', 'guys'),
 ('point', 'finger', 'millions'),
 ('finger', 'millions', 'pointed'),
 ('millions', 'pointed', 'right'),
 ('pointed', 'right', 'back'),
 ('right', 'back', 'jewishsupremacist'),
 ('words', 'free', 'that'),
 ('free', 'that', 'cost'),
 ('that', 'cost', 'verbal'),
 ('cost', 'verbal', 'abuse'),
 ('verbal', 'abuse', 'love'),
 ('abuse', 'love', 'adult'),
 ('love', 'adult'

## Feature Extraction for Test Data

### Bag of Words

In [None]:
# Encode the test comments with the vectorizer that was fitted on the training
# comments (`cv` from the train Bag of Words cell). Fitting a fresh
# CountVectorizer here would learn a different vocabulary, so the test columns
# would not line up with the training columns.
bag_of_words_test = cv.transform(testData['Processed_Comment']).toarray()

print(cv.vocabulary_)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    print(cv.get_feature_names_out().tolist())
else:
    print(cv.get_feature_names())
print('\n')
print(bag_of_words_test)

### Word Count

In [None]:
# Total number of word occurrences recorded in the test bag-of-words matrix.
# Summing the matrix entries gives that total; accumulating the loop index
# over range(len(...)) only added up the row numbers 0..n-1.
wordCount = int(bag_of_words_test.sum())

wordCount


### TF-IDF

In [None]:
# Weight the test comments with the TF-IDF model that was fitted on the
# training comments (`tfidf` from the train TF-IDF cell). Re-fitting on the
# test set would learn a different vocabulary and different IDF weights, so
# the columns would not be comparable with the training features.
tfidf_matrix = tfidf.transform(testData['Processed_Comment'])

testData_tfidf = pd.DataFrame(tfidf_matrix.toarray())

display(testData_tfidf)

### N-Gram

In [62]:
# Collect every run of n consecutive words (here trigrams) from each comment.
testDataList = testData['Processed_Comment'].tolist()
grams = list()
n = 3

for sentence in testDataList:
    grams.extend(ngrams(sentence.split(), n))

grams


[('says', 'otherwise', 'young'),
 ('otherwise', 'young', 'girls'),
 ('young', 'girls', 'confined'),
 ('girls', 'confined', 'that'),
 ('confined', 'that', 'kitchen'),
 ('that', 'kitchen', 'void'),
 ('kitchen', 'void', 'meaning'),
 ('void', 'meaning', 'beyond'),
 ('meaning', 'beyond', 'cheap'),
 ('beyond', 'cheap', 'publicity'),
 ('cheap', 'publicity', 'topoli'),
 ('good', 'night', 'faith'),
 ('night', 'faith', 'ever'),
 ('faith', 'ever', 'vaitacacommafiasdv'),
 ('when', 'blocked', 'troll'),
 ('blocked', 'troll', 'because'),
 ('troll', 'because', 'promise'),
 ('because', 'promise', 'blacklivesmatter'),
 ('promise', 'blacklivesmatter', 'nonsensical'),
 ('blacklivesmatter', 'nonsensical', 'rants'),
 ('dinner', 'with', 'sister'),
 ('else', 'planning', 'watching'),
 ('planning', 'watching', 'tomorrow')]