# Sarcasm classification: BernoulliNB

Import all needed libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn import cross_validation
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.base import TransformerMixin
from sklearn.metrics import recall_score, precision_score, confusion_matrix, f1_score, precision_recall_curve,average_precision_score

Set properties for reading dataset.

In [2]:
date_file = "data/tweets_all.csv"
names=['tweetID','tweet', 'target','pos','neu','neg','caps']

In [3]:
df = pd.read_csv(date_file, delimiter='|', encoding="utf-8", quotechar='"', header=None, names=names)

Remove and convert objects to numeric values and drop NA's (only around 10 are being dropped).

In [4]:
df = df.dropna()
df["target"] = df["target"].convert_objects(convert_numeric=True)

df = df.dropna()
df["target"] = df["target"].astype(int)

df.head()

Unnamed: 0,tweetID,tweet,target,pos,neu,neg,caps
0,500370339566985216,Zo fijn die #PostNL beloofd eerst tm vanmorgen...,1,0.091,0.909,0.0,0
1,411900187649794048,Heel fijn Vast op Dordrecht geen treinverkeer ...,1,0.114,0.762,0.124,0
2,175627549802639361,echt geweldig blyk denk dat ik dan maar weer v...,1,0.186,0.814,0.0,0
3,321699315707936769,fijn een product hebben dat niet werkt #kpn,1,0.184,0.816,0.0,0
4,258575794601865217,lekker chatten met een medewerker van tmobile,1,0.327,0.673,0.0,0


In [5]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(df[['tweet','pos','neu','neg','caps']]
, df['target'], test_size=0.25)

Transformer and extractor classes, which are needed to extract data out of the df and transform into the right shape so they are ready to be used by the feature union (FU).

In [6]:
class ColumnExtractor(TransformerMixin):

    def __init__(self, columns=[]):
        self.columns = columns

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def transform(self, X, **transform_params):
        return X[self.columns]

    def fit(self, X, y=None, **fit_params):
        return self

In [7]:
class Reshaper(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.reshape((len(X), 1))

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

The classification is a mix out of the FU and nested pipelines inside a pipeline. Inside the FU features are extracted using the transformer and extractor classes. The tweets are being processed using the CountVectorizer and TfidfTransformer. Afterwards, using SelectKbest with chi2, best features are selected and passed to the classifier (BernoulliNB).

In [8]:
pipe = Pipeline([
    ('features', FeatureUnion
     ([      
        ('pos', Pipeline([
            ('extract', ColumnExtractor('pos')),
            ('to_dense', Reshaper())
        ])),
        ('neu', Pipeline([
            ('extract', ColumnExtractor('neu')),
            ('to_dense', Reshaper())
        ])),
        ('neg', Pipeline([
            ('extract', ColumnExtractor('neg')),
            ('to_dense', Reshaper())
        ])),
        ('tweet', Pipeline([
            ('extract', ColumnExtractor('tweet')),
            ('vect', CountVectorizer(ngram_range=(1, 3), max_df=0.3, min_df=0.0001, max_features=10000)),
            ('tfidf', TfidfTransformer(sublinear_tf= True, use_idf=False)),
            ("kbest", SelectKBest(chi2,k=650))
            ]))      
        ])),
        ('clf', BernoulliNB(alpha=1e-05))])
pipe.fit(X_train, y_train)

Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('pos', Pipeline(steps=[('extract', <__main__.ColumnExtractor object at 0x0000000008C2AAC8>), ('to_dense', <__main__.Reshaper object at 0x0000000008C2AB00>)])), ('neu', Pipeline(steps=[('extract', <__main__.ColumnExtractor object at...r_weights=None)), ('clf', BernoulliNB(alpha=1e-05, binarize=0.0, class_prior=None, fit_prior=True))])

Prediction of test set.

In [14]:
y_pred = pipe.predict(X_test)
y_pred_proba = pipe.predict_proba(X_test)
print(y_pred_proba )
#predict proba

[[  8.49224506e-01   1.50775494e-01]
 [  9.33564321e-01   6.64356795e-02]
 [  9.33564321e-01   6.64356795e-02]
 ..., 
 [  5.81542542e-01   4.18457458e-01]
 [  3.79453652e-01   6.20546348e-01]
 [  9.99861306e-01   1.38693880e-04]]


Evaluation of the classification. Precision and Recall is calculated and their respective F1 score, as well as a confusion matrix

In [10]:
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred, labels=None)
f1 = f1_score(y_test, y_pred)

print("Precision: "+str(precision))
print("Recall: "+str(recall))
print("F1 score: "+str(f1))
print("\nConfusion matrix: ")
print("\n   S    -S")
print(" S TP   FN")
print("-S FP   TN")
print(confusion)

Precision: 0.759002770083
Recall: 0.754820936639
F1 score: 0.756906077348

Confusion matrix: 

   S    -S
 S TP   FN
-S FP   TN
[[293  87]
 [ 89 274]]


In [11]:
precision_curve, recall_curve, thresholds = precision_recall_curve(y_test, y_pred)
precision_plot, recall_plot,_ = precision_recall_curve(y_test.ravel(), y_pred.ravel())
print(precision_curve, recall_curve, thresholds)
average_precision = average_precision_score(y_test.ravel(), y_pred.ravel())
print(average_precision)

plt.clf()
plt.plot(recall_plot, precision_plot, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall example: AUC={0:0.2f}'.format(average_precision))
plt.legend(loc="lower left")
plt.show()

[ 0.48855989  0.75900277  1.        ] [ 1.          0.75482094  0.        ] [0 1]
0.816804181759
