In [2]:
import pandas as pd

In [27]:
dataset = pd.read_csv('train.tsv', delimiter = '\t')

In [28]:
dataset

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [29]:
# Polarity 1 for positive, 0 for negative

def Polarity (row):
    if row['Sentiment'] == 4 or row['Sentiment'] == 3:
        return 'positive'
    elif row['Sentiment'] == 0 or row['Sentiment'] == 1:
        return 'negative'
    elif row['Sentiment'] == 2:
        return 'neutral'

In [33]:
dataset_copy = dataset.copy()

# Polarity for every row
dataset_copy.apply (lambda row: Polarity (row),axis=1)

0         negative
1          neutral
2          neutral
3          neutral
4          neutral
5          neutral
6          neutral
7          neutral
8          neutral
9          neutral
10         neutral
11         neutral
12         neutral
13         neutral
14         neutral
15         neutral
16         neutral
17         neutral
18         neutral
19         neutral
20         neutral
21        positive
22        positive
23         neutral
24         neutral
25         neutral
26         neutral
27         neutral
28         neutral
29         neutral
            ...   
156030     neutral
156031    negative
156032    negative
156033    negative
156034    negative
156035     neutral
156036    negative
156037     neutral
156038     neutral
156039     neutral
156040     neutral
156041     neutral
156042     neutral
156043    positive
156044     neutral
156045     neutral
156046     neutral
156047    negative
156048     neutral
156049     neutral
156050     neutral
156051    ne

In [34]:
# Creating a new column and appending it to the existing dataframe

dataset_copy['Polarity'] = dataset.apply (lambda row: Polarity (row),axis=1)
dataset_copy

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Polarity
0,1,1,A series of escapades demonstrating the adage ...,1,negative
1,2,1,A series of escapades demonstrating the adage ...,2,neutral
2,3,1,A series,2,neutral
3,4,1,A,2,neutral
4,5,1,series,2,neutral
5,6,1,of escapades demonstrating the adage that what...,2,neutral
6,7,1,of,2,neutral
7,8,1,escapades demonstrating the adage that what is...,2,neutral
8,9,1,escapades,2,neutral
9,10,1,demonstrating the adage that what is good for ...,2,neutral


In [44]:
dataset_copy.drop(columns = ["PhraseId", "SentenceId"], inplace = True)

In [45]:
dataset_copy

Unnamed: 0,Phrase,Sentiment,Polarity
0,A series of escapades demonstrating the adage ...,1,negative
1,A series of escapades demonstrating the adage ...,2,neutral
2,A series,2,neutral
3,A,2,neutral
4,series,2,neutral
5,of escapades demonstrating the adage that what...,2,neutral
6,of,2,neutral
7,escapades demonstrating the adage that what is...,2,neutral
8,escapades,2,neutral
9,demonstrating the adage that what is good for ...,2,neutral


In [35]:

# Importing natural language tool kit

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

In [36]:

# Importing word cloud and matplotlib

from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

In [46]:
from sklearn.model_selection import train_test_split

# Splitting the dataset into train and test set
train, test = train_test_split(dataset_copy,test_size = 0.2)

In [57]:

# Separated Postive and negative reviews of the training set

train_pos = train[ train['Polarity'] == 'positive']
train_pos = train_pos['Phrase']

train_neg = train[ train['Polarity'] == 'negative']
train_neg = train_neg['Phrase']

train_neu = train[ train['Polarity'] == 'neutral']
train_neu = train_neu['Phrase']

In [86]:
print(len(train_pos))

print(len(train_neg))

print(len(train_neu))

print(len(test_pos))

print(len(test_neg))

print(len(test_neu))

33736
27486
63626
8397
6859
15956


In [87]:
# def clean_sentences(dataset_copy):
#     reviews = []

#     for sent in tqdm(dataset_copy['Phrase']):
        
#         #remove html content
#         review_text = BeautifulSoup(sent).get_text()
        
#         #remove non-alphabetic characters
#         review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
#         #tokenize the sentences
#         words = word_tokenize(review_text.lower())
    
#         #lemmatize each word to its lemma
#         lemma_words = [lemmatizer.lemmatize(i) for i in words]
    
#         reviews.append(lemma_words)

#     return(reviews)





# Used Stopwords to remove the words of less importance

texts = []
stopwords_set = set(stopwords.words("english"))

for index, row in train.iterrows():
    words_filtered = [e.lower() for e in row.Phrase.split() if len(e) >= 3]
    words_cleaned = [word for word in words_filtered
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')]     
    words_without_stopwords = [word for word in words_cleaned if not word in stopwords_set]
    texts.append((words_without_stopwords, row.Polarity))

test_pos = test[ test['Polarity'] == 'positive']
test_pos = test_pos['Phrase']
test_neg = test[ test['Polarity'] == 'negative']
test_neg = test_neg['Phrase']
test_neu = test[ test['Polarity'] == 'neutral']
test_neu = test_neu['Phrase']

In [89]:
# Extracting word features
def get_words_in_texts(texts):
    all = []
    for (words, Polarity) in texts:
        all.extend(words)
    return all

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    features = wordlist.keys()
    return features
w_features = get_word_features(get_words_in_texts(texts))

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in w_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [90]:
# Training the Naive Bayes classifier
training_set = nltk.classify.apply_features(extract_features,texts)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [91]:
neg_cnt = 0
pos_cnt = 0
neu_cnt = 0
for obj in test_neg: 
    res =  classifier.classify(extract_features(obj.split()))
    if(res == 'negative'): 
        neg_cnt = neg_cnt + 1
for obj in test_pos: 
    res =  classifier.classify(extract_features(obj.split()))
    if(res == 'positive'): 
        pos_cnt = pos_cnt + 1
for obj in test_neu: 
    res =  classifier.classify(extract_features(obj.split()))
    if(res == 'neutral'): 
        neu_cnt = neu_cnt + 1

        
print('[negative]: %s/%s '  % (len(test_neg),neg_cnt))        
print('[positive]: %s/%s '  % (len(test_pos),pos_cnt)) 
print('[neutral]: %s/%s '  % (len(test_neu),neu_cnt))

[negative]: 6859/3589 
[positive]: 8397/4929 
[neutral]: 15956/13330 


In [96]:

# Model Accuracy on how well the model predicted the negative reviews

negative_percent = 3589/6859 * 100
print(negative_percent)

52.32541186761919


In [97]:

# Model Accuracy on how well the model predicted the positive reviews

positive_percent = 4929/8397 * 100
print(positive_percent)

58.69953554841014


In [98]:

# Model Accuracy on how well the model predicted the neutral reviews

neutral_percent = 13330/15956 * 100
print(neutral_percent)

83.5422411631988


In [100]:

# Total accuracy on how well the model predicted both the positive and negative reviews

total = (3589 + 4929 + 13330) / (6859 + 8397 + 15956) * 100
print(total)

69.99871844162502
