In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
#import seaborn as sns
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.classify import NaiveBayesClassifier

In [2]:
data = pd.read_table('SemEval2017-task4-dev.subtask-BD.english.INPUT.txt', index_col=False, header =0, sep='\t', names=['ID','topic','label','tweet'])
print(data.dtypes)
print(data.head(10))

ID        int64
topic    object
label    object
tweet    object
dtype: object
                   ID        topic     label  \
0  675847244747177984  amy schumer  negative   
1  672827854279843840  amy schumer  negative   
2  662755012129529858  amy schumer  negative   
3  671502639671042048  amy schumer  negative   
4  677359143108214784  amy schumer  negative   
5  663714752162607104  amy schumer  negative   
6  671468325617033216  amy schumer  negative   
7  665033491445383168  amy schumer  negative   
8  678882295349190656  amy schumer  negative   
9  672070053509079040  amy schumer  negative   

                                               tweet  
0  @dani_pitter I mean I get the hype around JLaw...  
1  Amy Schumer at the #GQmenoftheyear2015 party i...  
2  Amy Schumer is on Sky Atlantic doing one of th...  
3  Amy Schumer may have brought us Trainwreck, bu...  
4  I just think that sports are stupid &amp;anyon...  
5  If you do like Amy Schumer, I hope you went &a...  
6  Peopl

In [3]:
def remove_pattern(input_txt, pattern):
#    print(pattern, input_txt)
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
  
    return input_txt

In [4]:
#Removing Twitter handle
data['tidy_tweet'] = np.vectorize(remove_pattern)(data['tweet'], "@[\w]*")
#Removing Punctuations, Numbers, and Special Characters
data['tidy_tweet'] = data['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")
#Removing short words of 3 letters and less
data['tidy_tweet'] = data['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
#Tokenizing - TO DO: you can also use NLTK library for this
tokenized_tweet = data['tidy_tweet'].apply(lambda x: x.split())

In [5]:
#Stemming 
from nltk.stem.porter import *
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) 

In [6]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
data['tidy_tweet'] = tokenized_tweet

In [7]:
#Create bag-of-words feature
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# bag-of-words feature matrix
bow = bow_vectorizer.fit_transform(data['tidy_tweet'])
print(bow)

  (0, 558)	1
  (0, 428)	3
  (0, 516)	1
  (0, 475)	1
  (0, 921)	1
  (0, 759)	1
  (1, 759)	1
  (1, 641)	1
  (1, 254)	1
  (1, 680)	1
  (1, 391)	1
  (1, 423)	2
  (2, 759)	1
  (2, 979)	1
  (2, 829)	1
  (2, 780)	1
  (2, 769)	1
  (2, 767)	1
  (3, 475)	1
  (3, 759)	1
  (3, 115)	1
  (3, 974)	1
  (4, 516)	1
  (4, 475)	1
  (4, 759)	1
  :	:
  (10546, 235)	1
  (10546, 356)	1
  (10546, 301)	1
  (10546, 561)	1
  (10546, 534)	1
  (10546, 388)	1
  (10546, 999)	1
  (10547, 423)	1
  (10547, 383)	1
  (10547, 193)	1
  (10547, 183)	1
  (10547, 535)	1
  (10547, 999)	1
  (10548, 892)	1
  (10548, 949)	1
  (10548, 67)	1
  (10548, 386)	1
  (10548, 398)	1
  (10548, 112)	1
  (10548, 999)	1
  (10549, 423)	1
  (10549, 374)	1
  (10549, 609)	1
  (10549, 939)	1
  (10549, 999)	1


In [45]:
#from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

In [9]:
train_bow = bow[:20632,:]
test_bow = bow[20632:,:]

In [42]:
data['n_label'] = data['label'].str.replace("positive", "1")
data['n_label'] = data['n_label'].str.replace("negative", "0")
print(data['n_label'])

0        0
1        0
2        0
3        0
4        0
        ..
10545    1
10546    1
10547    1
10548    1
10549    1
Name: n_label, Length: 10550, dtype: object


In [38]:
# splitting data into training and validation set (70:30)
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, data['n_label'], random_state=42, test_size=0.3)
# Train a Naive Bayes Multinomial classifier
classifier = MultinomialNB()
classifier.fit(xtrain_bow, ytrain)
# making predictions on the testing set 
y_pred = classifier.predict(xvalid_bow) 

In [44]:
from sklearn import metrics 
print("Multinomial Naive Bayes model accuracy(in %):", metrics.accuracy_score(yvalid, y_pred)*100)
print("Multinomial Naive Bayes model precision(in %):", metrics.precision_score(yvalid, y_pred, pos_label='1')*100)
print("Multinomial Naive Bayes model F1 score (in %):", metrics.f1_score(yvalid, y_pred, pos_label='1')*100)

Multinomial Naive Bayes model accuracy(in %): 85.65560821484992
Multinomial Naive Bayes model precision(in %): 89.93125758188435
Multinomial Naive Bayes model F1 score (in %): 90.73847409220727


In [11]:
print(y_pred)

['positive' 'positive' 'positive' ... 'positive' 'positive' 'positive']


In [14]:
from collections import Counter

In [29]:
#Classify and count tweet quantifier method
count = (Counter(y_pred))
#print(count)
#pos counts "positive" labels in the y_pred array
pos = count['positive']
#neg counts "negative" labels in the y_pred array
neg = count['negative']
#print(pos)
cc = pos/(pos + neg)
print(cc)    
print("The number of positives based on CC approach is "+ str(cc) + ".") 

Counter({'positive': 2473, 'negative': 692})
2473
0.7813586097946288
The number of positives based on CC approach is 0.7813586097946288.


In [60]:
#compare yvalid and y_pred to establish the share of true positives and false positives
#calculate confusion matrix
confusion_matrix(yvalid, y_pred)
tn, fp, fn, tp = confusion_matrix(yvalid, y_pred).ravel()
(tn, fp, fn, tp)
#calculate tpr and fpr by using the above elements of the confusion matrix
tpr = tp/(tp + fn)
fpr = fp/(fp + tn)
print(tpr, fpr)

0.915603128859613 0.3383152173913043


In [61]:
#Quantify using Adjusted Count method
ac = (cc - fpr)/(tpr - fpr)
print(str(ac))

0.7674565560821486
