In [1]:
import numpy as np
import pandas as pd
import sklearn
import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from pprint import pprint
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import RFE
from sklearn import metrics

In [2]:
# Properties
py_file = "LinearSCVResults.py"
date_file = "data/dataset-4412-cleaned.csv"
start_time = time.time()
testd_size = 0.35
sarcastic_tweets = []
normal_tweet = []

In [3]:
# Print Properties
print("Start Time  : ",time.strftime("%Y-%m-%d %H:%M:%S"))
print("Python File : ",py_file)
print("Data File   : ",date_file)
print("Test Size   : ",testd_size)

Start Time  :  2015-12-09 15:07:55
Python File :  LinearSCVResults.py
Data File   :  data/dataset-4412-cleaned.csv
Test Size   :  0.35


In [4]:
# Read Data Set
df = pd.read_csv(date_file, delimiter='|', encoding="utf-8", quotechar='"', header=None, names=['tweetID','tweet', 'target'])
df = df.dropna()
df.head()
df["target"] = df["target"].convert_objects(convert_numeric=True)
print(df.isnull().sum())
df = df.dropna()

df["target"] = df["target"].astype(int)
print(df.isnull().sum())
df.head()

tweetID     0
tweet       0
target     10
dtype: int64
tweetID    0
tweet      0
target     0
dtype: int64


Unnamed: 0,tweetID,tweet,target
0,270475999416819712,Geweldig werk Sandra! Deze homemade magneten z...,0
1,340482726610231296,Awel ik snap natuurlijk dat er veel factoren m...,0
2,305797553847406593,Dit weekend druk gehad met het keurig niets ze...,0
3,382202750257602560,Goed om te lezen Kees. Ik wens naast veel reis...,0
4,471366625405698048,"dat moet als het goed is gewoon kunnen, je ben...",0


In [5]:
df['f1'] = pd.Series(np.random.uniform(0,1,len(df)), index=df.index)
df.head()

Unnamed: 0,tweetID,tweet,target,f1
0,270475999416819712,Geweldig werk Sandra! Deze homemade magneten z...,0,0.873436
1,340482726610231296,Awel ik snap natuurlijk dat er veel factoren m...,0,0.820199
2,305797553847406593,Dit weekend druk gehad met het keurig niets ze...,0,0.994451
3,382202750257602560,Goed om te lezen Kees. Ik wens naast veel reis...,0,0.589166
4,471366625405698048,"dat moet als het goed is gewoon kunnen, je ben...",0,0.671962


In [6]:
df.dtypes

tweetID      int64
tweet       object
target       int32
f1         float64
dtype: object

In [17]:
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(df[['tweet','f1']]
, df['target'], test_size=0.25, random_state=0)
# X_train, X_test, y_train, y_test = cross_validation.train_test_split(df['tweet']
# , df['target'], test_size=0.25, random_state=0)

In [19]:
from sklearn.base import TransformerMixin
from sklearn.feature_extraction import DictVectorizer


class ColumnExtractor(TransformerMixin):

    def __init__(self, columns=[]):
        self.columns = columns

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def transform(self, X, **transform_params):
        return X[self.columns]

    def fit(self, X, y=None, **fit_params):
        return self

In [20]:
from sklearn.pipeline import FeatureUnion

transformer = Pipeline([
    ('features', FeatureUnion
     ([      
# F1 extract and process
        ('f1', Pipeline([
            ('extract', ColumnExtractor(['f1'])),
#             ('cntvect', DictVectorizer())                    
        ])),
        ('tweet', Pipeline([
            ('extract', ColumnExtractor(['tweet'])),
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer())
        ]))
]))])
        
pipe = Pipeline([
        ('extract', transformer),
        ('clf', MultinomialNB())
    ])        


pipe.fit(X_train, y_train)

ValueError: blocks[0,:] has incompatible row dimensions

In [14]:
pipe = Pipeline([
    ('features', FeatureUnion
     ([      
# F1 extract and process
        ('f1', Pipeline([
            ('extract', ColumnExtractor(['f1']))
        ]))
# tweet itself: cntvector,etc...                    
        ,('tweet', Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer())
                                
        ]))
                    
                ])),
        ('clf', MultinomialNB())])

In [15]:
pipe.fit(X_train, y_train)

ValueError: blocks[0,:] has incompatible row dimensions

In [None]:
# Classifier

# pipe = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', MultinomialNB())
# ])

parameters = {'vect__ngram_range': ((1, 3), (2, 3), (1, 2)), 
              'vect__max_df': (0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.7),
#               'vect__min_df': (0.1,0.25,0.5,0.75, 1.0),
              'tfidf__use_idf': (True, False),
              'tfidf__sublinear_tf': (True, False),
              'clf__alpha': (0.00001, 0.000001),}

pipe.fit(X_train, y_train)
# y_pred = pipe.predict(X_test)

In [None]:
# Create Train and Test Set
print("\nTweets Train Set : ",len(X_train))
print("Tweets Test  Set : ",len(X_test))

# Grid Search

In [None]:
from sklearn.grid_search import GridSearchCV

grid_search = GridSearchCV(pipe,parameters,n_jobs=2)
grid_search.fit(X_train, y_train)

print(grid_search.best_estimator_)

In [None]:
best_params =grid_search.best_estimator_.get_params(deep=True)
for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_params[param_name]))

In [None]:
y_pred_grid = grid_search.best_estimator_.predict(X_test)

In [None]:
print("Best score: %0.3f" % grid_search.best_score_)

# RFE

In [None]:
# create the RFE model and select 3 attributes
count_vect=pipe.named_steps['vect']
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape
rfe = RFE(pipe.named_steps['clf'] ,n_features_to_select=1000, step=1, verbose=0)
rfe = rfe.fit(X_train_counts, y_train)

In [None]:
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

In [None]:
# for i,j in sorted(zip(rfe.ranking_,rfe.support_)):
#     print (i,j)

In [None]:
print("Tweets Test  Set : ",len(X_test))

In [None]:
from sklearn.metrics import recall_score, precision_score, confusion_matrix, f1_score, precision_recall_curve,average_precision_score

recall = recall_score(y_test, y_pred_grid)
precision = precision_score(y_test, y_pred_grid)
confusion = confusion_matrix(y_test, y_pred_grid, labels=None)
f1 = f1_score(y_test, y_pred_grid)

print("Precision: "+str(precision))
print("Recall: "+str(recall))
print("\nConfusion matrix: ")
print(confusion)
print("\nF1 score: "+str(f1))

In [None]:
import matplotlib.pyplot as plt

precision_curve, recall_curve, thresholds = precision_recall_curve(y_test, y_pred_grid)
precision_plot, recall_plot,_ = precision_recall_curve(y_test.ravel(), y_pred_grid.ravel())
print(precision_curve, recall_curve, thresholds)
average_precision = average_precision_score(y_test.ravel(), y_pred_grid.ravel())
print(average_precision)

# Plot Precision-Recall curve
plt.clf()
plt.plot(recall_plot, precision_plot, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall example: AUC={0:0.2f}'.format(average_precision))
plt.legend(loc="lower left")
plt.show()

In [None]:
# c = 0
# # count = 0
# # count1 = 0
# FP = 0
# TN = 0
# TP = 0
# FN = 0
# precisionarray=[]
# recallarray=[]
# # accuracycount=0
# ifcount=1
# for item, labels in zip(X_test, y_pred):
# # 		accuracycount+=1
#     if np.asarray(y_test[c]) != np.asarray(y_pred[c]):
#         if np.asarray(y_test[c]) > np.asarray(y_pred[c]):
#             FN+=1
#         else:
#             FP+=1
#     else:
#         if np.asarray(y_test[c]) == 1:
#             TP+=1
#         else:
#             TN+=1
#     c+=1
#     ifcount+=1
    
# # precision = (TP/(TP+FP))
# precisionarray.append(precision)
# recall = (TP/(TP+FN))
# recallarray.append(recall)

In [None]:
vectorizer = grid_search.best_estimator_.named_steps['vect'].get_params(deep=True)

In [None]:
vectorizer

In [None]:
# print("\nPrediction : %d wrong from %d" %(count, len(y_test)))
# print("False positive: %d" % (FP))
# print("True negative: %d" % (TN))
# print("False negative: %d" % (FN))
# print("True positive: %d" % (TP))
# accuracy_test1 = (len(X_test)-count)/len(X_test)
# print("Accuracy   :", (accuracy_test1), "\n")
# precision = (TP/(TP+FP))
# print("Precision   :", (precision), "\n")
# recall = (TP/(TP+FN))
# print("Recall   :", (recall), "\n")
# print(precisionarray)
# print(recallarray)

In [None]:
# Labels
target_names = [1,0]

In [None]:
# Test Set
X_test1 = np.array(['Gotta love #ns #sarcasm, wat een klote begin van de dag.',
                   'Gotta love ns sarcasm, wat een klote begin van de dag.',
                   'Gotta love ns, wat een klote begin van de dag.',
                   'Helaas mijn overstap in Eindhoven niet gehaald',
                   'Vertraging maar dan in je voordeel',
                   'Ik ga so meteen naar huis vanuit school',
                   'weer bezig'])

In [None]:
predicted1 = pipe.predict(X_test1)

for item, labels in zip(X_test1, predicted1):
    print(item, '-', target_name[labels])

In [None]:
print("\nTotal Time : %s Seconds" % (time.time() - start_time))