# Imports

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
import preprocessing
import features
from sklearn import tree
from sklearn.metrics import precision_recall_fscore_support

# Files reading

In [2]:
# Every input file is read the same way: tab-separated, with no header row
# (columns get positional integer names until they are renamed later).
tsv_options = dict(sep='\t', header=None)

# Training splits and the 2016 test split.
tw2013tr = pd.read_csv('twitter-2013train-A.tsv', **tsv_options)
tw2015tr = pd.read_csv('twitter-2015train-A.tsv', **tsv_options)
tw2016tr = pd.read_csv('twitter-2016train-A.tsv', **tsv_options)
twAtest = pd.read_csv('twitter-2016test-A.tsv', **tsv_options)

# 2016 development and development-test splits.
twdev = pd.read_csv('twitter-2016dev-A.tsv', **tsv_options)
twdevtest = pd.read_csv('twitter-2016devtest-A.tsv', **tsv_options)


# Train, Test
Datasets concatenation and column name addition

In [3]:
# Stack the three training years into one frame. pd.concat replaces the chained
# DataFrame.append calls: append is deprecated and was removed in pandas 2.0,
# while concat gives the same result (original row labels are kept).
# twdev is not part of the training data (its append was commented out).
all_train = pd.concat([tw2013tr, tw2015tr, tw2016tr])

# The evaluation set is the 2016 test split followed by the 2016 dev-test split.
twtest = pd.concat([twAtest, twdevtest])


In [4]:
# Give the three positional columns readable names on every frame used below.
column_names = ['id', 'label', 'tweet']
for frame in (all_train, twtest, twdevtest, twdev):
    # Each frame gets its own list so the frames do not share one object.
    frame.columns = list(column_names)

# Removal of Unavailable tweets
and reindexing of dataframes

In [5]:
# Rows whose tweet text is exactly this placeholder are dropped from every frame.
unavailable_text = "Not Available"

train_has_text = all_train.tweet != unavailable_text
all_train = all_train[train_has_text]

test_has_text = twtest.tweet != unavailable_text
twtest = twtest[test_has_text]

devtest_has_text = twdevtest.tweet != unavailable_text
twdevtest = twdevtest[devtest_has_text]

dev_has_text = twdev.tweet != unavailable_text
twdev = twdev[dev_has_text]


In [6]:
# Renumber the rows of each frame from 0 after the filtering step; drop=True
# discards the old row labels instead of keeping them as a column.
all_train, twtest = [
    frame.reset_index(drop=True) for frame in (all_train, twtest)
]
twdevtest, twdev = [
    frame.reset_index(drop=True) for frame in (twdevtest, twdev)
]



# Html entities decode to pure text

In [7]:
# Decode HTML entities (e.g. "&amp;" -> "&") in the tweet text.
# The original `from HTMLParser import HTMLParser` only imports on Python 2,
# and HTMLParser.unescape was removed in Python 3.9. html.unescape is the
# supported replacement; the Python 2 path is kept as a fallback so older
# kernels behave exactly as before.
try:
    from html import unescape  # Python 3.4+
except ImportError:
    from HTMLParser import HTMLParser  # Python 2
    unescape = HTMLParser().unescape

all_train['tweet'] = all_train.tweet.apply(unescape)
twtest['tweet'] = twtest.tweet.apply(unescape)



# Stop words removal (decided not to apply it to the data)

In [8]:
#from nltk.corpus import stopwords

#cachedStopWords = stopwords.words("english")
#def removestopwords(text):
#    text = ' '.join([word for word in text.split() if word not in cachedStopWords])
#    return text

#for i in range(all_train.tweet.count()):
#    all_train.iloc[i,all_train.columns.get_loc('tweet')] = removestopwords(all_train.tweet.iloc[i])

#for i in range(twtest.tweet.count()):
#    twtest.iloc[i,twtest.columns.get_loc('tweet')] = removestopwords(twtest.tweet.iloc[i])



# Preprocessing steps for tweets 
and some feature extraction for our other method

In [9]:

def _clean_and_featurize(frame):
    """Preprocess the tweets of `frame` in place and attach their feature vectors.

    The 'tweet' column is replaced by the result of preprocessing.processAll
    for each tweet, and a 'features' column is set to the result of
    features.tweet_features.make_tweet_nparr for each *processed* tweet.

    Whole columns are built in one pass instead of writing cell by cell
    through .iloc, and every row is visited: the previous loops were bounded
    by Series.count(), which counts non-null values rather than rows.
    The temporary '0' placeholder column is gone because the real values
    overwrote it immediately.
    """
    frame['tweet'] = [preprocessing.processAll(text) for text in frame.tweet]
    # Features are computed from the already-processed text, as before.
    frame['features'] = [
        features.tweet_features.make_tweet_nparr(text) for text in frame.tweet
    ]


# Same treatment for the training and the evaluation data.
_clean_and_featurize(all_train)
_clean_and_featurize(twtest)


In [10]:
def _stack_feature_rows(feature_column):
    """Turn a column of per-tweet feature arrays into one DataFrame.

    Each array is reshaped to (-1, len(array)) exactly as before, and all
    rows are stacked in a single np.vstack call. The previous version grew
    the frame with DataFrame.append inside a loop, which copies the whole
    frame on every iteration (quadratic) and relies on an API removed in
    pandas 2.0.

    The result has a 0..n-1 row index; the appended version repeated the
    row labels of each one-tweet frame instead. The classifier below is
    given only the values, so this does not affect the fit.
    """
    rows = [vector.reshape(-1, len(vector)) for vector in feature_column]
    return pd.DataFrame(np.vstack(rows))


# x: feature matrix of the training tweets, y: feature matrix of the test tweets.
x = _stack_feature_rows(all_train.features)
y = _stack_feature_rows(twtest.features)

# Pipeline for Multinomial Naive Bayes Classifier 
with 1,2,3 ngrams, TfIdf and text length as features

In [11]:
def get_text_length(x):
    """Return the length of every item in `x` as a single-column 2-D array.

    `x` is an iterable of sized items (the tweet strings here). The result
    has one row per item and exactly one column, the shape needed to place
    it next to other feature columns.
    """
    lengths = list(map(len, x))
    return np.array(lengths).reshape(-1, 1)
# Branch 1: counts of 1- to 3-word n-grams, re-weighted with TF-IDF.
# A float min_df is a document proportion: n-grams that occur in fewer than
# 0.2% of the tweets are ignored.
text_branch = Pipeline([
    ('vectorizer', CountVectorizer(min_df=0.002, ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
])

# Branch 2: one extra column holding the length of each tweet.
# validate=False passes the raw input to get_text_length without checking it.
length_branch = Pipeline([
    ('count', FunctionTransformer(get_text_length, validate=False)),
])

# Both branches are computed from the same input and joined column-wise.
combined_features = FeatureUnion([
    ('text', text_branch),
    ('length', length_branch),
])

# Full model: combined features feeding a Multinomial Naive Bayes classifier.
pipeline = Pipeline([
    ('features', combined_features),
    ('classifier', MultinomialNB()),
])



# Model fit and prediction

## 1st Method
Multinomial Naive Bayes with 1,2,3 ngrams, TfIdf and text length

In [12]:
# Fit on the training tweets and label the test tweets (fit returns the
# fitted pipeline itself, so the calls can be chained).
predicted = pipeline.fit(all_train.tweet, all_train.label).predict(twtest.tweet)

# Accuracy: fraction of test tweets whose predicted label equals the true label.
is_correct = predicted == twtest.label
np.mean(is_correct)

0.59542206471852666

In [13]:

# Weighted precision, recall and F-score of the pipeline predictions.
precision_recall_fscore_support(
    y_true=twtest.label,
    y_pred=predicted,
    average='weighted',
)

(0.57493103948091095, 0.59542206471852666, 0.55886148929366508, None)

## 2nd Method
Multinomial Naive Bayes with vector of complex features (explained in the readme)

In [14]:
# Multinomial Naive Bayes trained on the hand-built feature matrix x
# (fit returns the fitted estimator itself).
clf = MultinomialNB().fit(x, all_train.label)

# Label the test feature matrix y and measure the fraction of correct labels.
predicted = clf.predict(y)
is_correct = predicted == twtest.label
np.mean(is_correct)

0.57032475454598264

In [15]:
# Weighted precision, recall and F-score of the feature-vector classifier.
precision_recall_fscore_support(
    y_true=twtest.label,
    y_pred=predicted,
    average='weighted',
)

(0.61017881240713601, 0.57032475454598264, 0.50908300927969385, None)

## 3rd Method
Use of the pretrained tool TextBlob (with a mapping of its polarities to our labels)

In [16]:
from textblob import TextBlob

# TextBlob polarity score of every test tweet.
polarities = [TextBlob(text).sentiment.polarity for text in twtest.tweet]

# Map each polarity onto a label: above 0.3 -> positive, below -0.3 -> negative,
# anything in between -> neutral.
# The label list is now built once, outside any loop. Previously `marika=[]`
# was indented into the polarity loop, so it was re-created on every iteration
# and never defined when the test set was empty.
marika = [
    'positive' if score > 0.3 else 'negative' if score < -0.3 else 'neutral'
    for score in polarities
]

# Accuracy: fraction of test tweets whose mapped label equals the true label.
np.mean(marika == twtest.label)
    

0.54999128565619004

In [17]:
# Weighted precision, recall and F-score of the TextBlob-derived labels.
precision_recall_fscore_support(
    y_true=twtest.label,
    y_pred=marika,
    average='weighted',
)

(0.5481127812272778, 0.54999128565619004, 0.51443250960099485, None)

## 4th Method
Use of the pretrained tool Pattern (with a mapping of its polarities to our labels)

In [18]:
from pattern.en import sentiment as sentiment_en

# First element of Pattern's sentiment result for every test tweet; it is
# used as the polarity score below.
polarities = [sentiment_en(text)[0] for text in twtest.tweet]

# Map each polarity onto a label: above 0.3 -> positive, below -0.3 -> negative,
# anything in between -> neutral.
marika = [
    'positive' if score > 0.3 else 'negative' if score < -0.3 else 'neutral'
    for score in polarities
]

# Accuracy: fraction of test tweets whose mapped label equals the true label.
np.mean(marika == twtest.label)


0.54923604252599778

In [19]:
# Weighted precision, recall and F-score of the Pattern-derived labels.
precision_recall_fscore_support(
    y_true=twtest.label,
    y_pred=marika,
    average='weighted',
)

(0.54619414967032565, 0.54923604252599778, 0.51340252319815693, None)