Importing Libraries

In [1]:
import pandas as pd
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

Reading Training Dataset

In [2]:
# The training file is underscore-delimited. `sep` must be passed by keyword:
# positional arguments after the file path are deprecated in pandas 1.x and
# rejected by pandas 2.0+.
trainData = pd.read_csv("TrainData.csv", sep="_")

trainData

Unnamed: 0,Index,Comment,Polarity
0,0,time to eat with my best buddy! #lunch,Happy
1,1,@user @user if they want reflection money. #ksleg,Happy
2,2,---Good job but I’ will expect a lot more in f...,Happy
3,3,totally dissatisfied with the service###%%@@ n...,Sad
4,4,loved my work!!!!!,Happy
5,5,Worst customer care service......@@$$$angry,Sad
6,6,Brilliant effort guys!!!,Happy
7,7,@user @user you point one finger @user million...,Sad
8,8,"words r free, it's how u use that can cost you...",Happy
9,9,you might be a libtard if... #libtard #sjw #li...,Sad


Reading Test Dataset

In [3]:
# The test file is underscore-delimited. `sep` must be passed by keyword:
# positional arguments after the file path are deprecated in pandas 1.x and
# rejected by pandas 2.0+.
testData = pd.read_csv("TestData.csv", sep="_")

testData

Unnamed: 0,Index,Comment,Polarity
0,0,@use the pic says otherwise for young girls co...,Sad
1,1,#good night! ?? #faith ever #vaitacacommafiasdv,Happy
2,2,@user when you're blocked by a troll because y...,Sad
3,3,dinner with sister!!,Happy
4,4,who else is planning on watching @user tomorrow?,happy


Combining Both Train and Test Datasets

In [4]:
# Stack the train and test rows into one frame so the same text cleaning is
# applied to both. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in 2.0; ignore_index renumbers the rows 0..n-1 and
# sort=True orders the columns alphabetically, exactly as the append call did.
data = pd.concat([trainData, testData], ignore_index=True, sort=True)

data

Unnamed: 0,Comment,Index,Polarity
0,time to eat with my best buddy! #lunch,0,Happy
1,@user @user if they want reflection money. #ksleg,1,Happy
2,---Good job but I’ will expect a lot more in f...,2,Happy
3,totally dissatisfied with the service###%%@@ n...,3,Sad
4,loved my work!!!!!,4,Happy
5,Worst customer care service......@@$$$angry,5,Sad
6,Brilliant effort guys!!!,6,Happy
7,@user @user you point one finger @user million...,7,Sad
8,"words r free, it's how u use that can cost you...",8,Happy
9,you might be a libtard if... #libtard #sjw #li...,9,Sad


In [5]:
# Remove Twitter handles (e.g. @user) from the comments

In [6]:
def remove_pattern(text, pattern):
    """Return ``text`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    text : str
        The string to clean.
    pattern : str
        Regular expression describing the substrings to delete
        (for example ``r"@[\w]*"`` for Twitter handles).

    Returns
    -------
    str
        ``text`` with all non-overlapping matches of ``pattern`` deleted.
        Surrounding whitespace is left as-is.
    """
    # Substitute using the pattern itself. Re-using each matched substring as
    # a new regex (the previous approach) breaks when a match contains regex
    # metacharacters such as "(" or "$", and lets a short match like "@user"
    # strip the prefix of a longer one like "@username".
    return re.sub(pattern, "", text)

# Strip every "@handle" from the raw comments and store the result in a new
# column, leaving the original 'Comment' column untouched.
# The pattern is a raw string so that "\w" reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
data['Processed_Comment'] = np.vectorize(remove_pattern)(data['Comment'], r"@[\w]*")

data

Unnamed: 0,Comment,Index,Polarity,Processed_Comment
0,time to eat with my best buddy! #lunch,0,Happy,time to eat with my best buddy! #lunch
1,@user @user if they want reflection money. #ksleg,1,Happy,if they want reflection money. #ksleg
2,---Good job but I’ will expect a lot more in f...,2,Happy,---Good job but I’ will expect a lot more in f...
3,totally dissatisfied with the service###%%@@ n...,3,Sad,totally dissatisfied with the service###%% nev...
4,loved my work!!!!!,4,Happy,loved my work!!!!!
5,Worst customer care service......@@$$$angry,5,Sad,Worst customer care service......$$$angry
6,Brilliant effort guys!!!,6,Happy,Brilliant effort guys!!!
7,@user @user you point one finger @user million...,7,Sad,you point one finger millions are pointed r...
8,"words r free, it's how u use that can cost you...",8,Happy,"words r free, it's how u use that can cost you..."
9,you might be a libtard if... #libtard #sjw #li...,9,Sad,you might be a libtard if... #libtard #sjw #li...


In [8]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# symbols, non-ASCII characters) with a space.
# regex=True is required: the pattern is a character class, and newer pandas
# versions treat the pattern as a literal string when `regex` is not given.

data['Processed_Comment'] = data['Processed_Comment'].str.replace("[^a-zA-Z]", " ", regex=True)

data

  data['Processed_Comment'] = data['Processed_Comment'].str.replace("[^a-zA-Z]", " ")


Unnamed: 0,Comment,Index,Polarity,Processed_Comment
0,time to eat with my best buddy! #lunch,0,Happy,time to eat with my best buddy lunch
1,@user @user if they want reflection money. #ksleg,1,Happy,if they want reflection money ksleg
2,---Good job but I’ will expect a lot more in f...,2,Happy,Good job but I will expect a lot more in f...
3,totally dissatisfied with the service###%%@@ n...,3,Sad,totally dissatisfied with the service nev...
4,loved my work!!!!!,4,Happy,loved my work
5,Worst customer care service......@@$$$angry,5,Sad,Worst customer care service angry
6,Brilliant effort guys!!!,6,Happy,Brilliant effort guys
7,@user @user you point one finger @user million...,7,Sad,you point one finger millions are pointed r...
8,"words r free, it's how u use that can cost you...",8,Happy,words r free it s how u use that can cost you...
9,you might be a libtard if... #libtard #sjw #li...,9,Sad,you might be a libtard if libtard sjw li...


Removing Short Words


In [9]:
# Drop short words: split each comment on whitespace, keep only the words
# longer than three characters, and re-join them with single spaces.
data['Processed_Comment'] = data['Processed_Comment'].apply(
    lambda comment: ' '.join(word for word in comment.split() if len(word) > 3)
)

data

Unnamed: 0,Comment,Index,Polarity,Processed_Comment
0,time to eat with my best buddy! #lunch,0,Happy,time with best buddy lunch
1,@user @user if they want reflection money. #ksleg,1,Happy,they want reflection money ksleg
2,---Good job but I’ will expect a lot more in f...,2,Happy,Good will expect more future
3,totally dissatisfied with the service###%%@@ n...,3,Sad,totally dissatisfied with service never used t...
4,loved my work!!!!!,4,Happy,loved work
5,Worst customer care service......@@$$$angry,5,Sad,Worst customer care service angry
6,Brilliant effort guys!!!,6,Happy,Brilliant effort guys
7,@user @user you point one finger @user million...,7,Sad,point finger millions pointed right back jewis...
8,"words r free, it's how u use that can cost you...",8,Happy,words free that cost verbal abuse love adult teen
9,you might be a libtard if... #libtard #sjw #li...,9,Sad,might libtard libtard liberal politics


In [11]:
# Tokenize: turn each cleaned comment into a list of whitespace-separated words.
tokenized_tweet = data['Processed_Comment'].map(lambda comment: comment.split())
tokenized_tweet

0                     [time, with, best, buddy, lunch]
1               [they, want, reflection, money, ksleg]
2                   [Good, will, expect, more, future]
3    [totally, dissatisfied, with, service, never, ...
4                                        [loved, work]
Name: Processed_Comment, dtype: object