In [1]:
import time
from pprint import pprint

import numpy as np
import pandas as pd
import sklearn
from sklearn import cross_validation
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import RFE
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
# Properties
# Notebook-wide settings and scratch containers, gathered in one place.

# Wall-clock reference taken when this cell runs
start_time = time.time()

# File names
date_file = "data/dataset-4412-cleaned.csv"
py_file = "LinearSCVResults.py"

# Test-set fraction setting
testd_size = 0.35

# Two independent, initially empty lists
sarcastic_tweets, normal_tweet = [], []

In [3]:
# Read Data Set
# The file is pipe-delimited and has no header row, so column names are
# supplied here. Rows with any missing field are discarded immediately.
df = pd.read_csv(
    date_file,
    delimiter='|',
    encoding="utf-8",
    quotechar='"',
    header=None,
    names=['tweetID', 'tweet', 'target'],
).dropna()

# Coerce the label column to numbers; values that cannot be parsed become NaN.
# pd.to_numeric(errors="coerce") is the supported replacement for
# Series.convert_objects(convert_numeric=True), which pandas deprecated and
# later removed.
df["target"] = pd.to_numeric(df["target"], errors="coerce")
print(df.isnull().sum())

# Drop the rows whose label failed to parse, otherwise the integer cast
# below would raise on NaN.
df = df.dropna()
df["target"] = df["target"].astype(int)
print(df.isnull().sum())

tweetID     0
tweet       0
target     10
dtype: int64
tweetID    0
tweet      0
target     0
dtype: int64


In [4]:
# Synthetic numeric feature: one uniform draw in [0, 15) per row.
# The generator is seeded so the column (and every score computed from it)
# is identical each time the notebook is re-run from a fresh kernel.
df['f1'] = pd.Series(np.random.RandomState(0).uniform(0, 15, len(df)), index=df.index)

In [5]:
from sklearn import cross_validation
# Split the two feature columns and the label into train and test parts.
# random_state=0 fixes the shuffle so the split is repeatable.
# NOTE(review): `testd_size = 0.35` is defined in the properties cell, but
# this call hard-codes 0.25 - confirm which value is intended.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    df[['tweet', 'f1']],
    df['target'],
    test_size=0.25,
    random_state=0,
)

In [6]:
from sklearn.base import TransformerMixin
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Stateless transformer that selects column(s) of ``X`` by label.

    Inheriting from ``BaseEstimator`` provides ``get_params``/``set_params``,
    which scikit-learn's cloning and parameter-setting machinery (used by
    ``GridSearchCV`` on nested pipeline steps) relies on.

    Parameters
    ----------
    columns : label or list of labels, optional
        Passed unchanged to ``X[...]`` in ``transform``. When omitted, an
        empty list is used; a fresh list is created per instance so that
        instances never share one mutable default object.
    """

    def __init__(self, columns=None):
        self.columns = [] if columns is None else columns

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return ``self``; all arguments are ignored."""
        return self

    def transform(self, X, **transform_params):
        """Return ``X[self.columns]``; extra keyword arguments are ignored."""
        return X[self.columns]

    def fit_transform(self, X, y=None, **fit_params):
        """Call ``fit`` and then return ``transform(X)``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

In [7]:
class Reshaper(TransformerMixin, BaseEstimator):
    """Stateless transformer that reshapes its input to ``(len(X), 1)``.

    In this notebook it follows ``ColumnExtractor('f1')``, whose output is a
    single column, and turns it into a one-column 2-D array.

    Inheriting from ``BaseEstimator`` provides ``get_params``/``set_params``,
    which scikit-learn's cloning and parameter-setting machinery relies on.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return ``self``; all arguments are ignored."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return the values of ``X`` as an array of shape ``(len(X), 1)``.

        ``y`` and any extra keyword arguments are ignored.
        """
        # Convert to an ndarray first: calling .reshape directly on a pandas
        # Series is deprecated/removed in later pandas releases, while
        # np.asarray works for Series, lists and ndarrays alike.
        return np.asarray(X).reshape((len(X), 1))

    def fit_transform(self, X, y=None, **fit_params):
        """Call ``fit`` and then return ``transform(X)``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

In [8]:
## WORKING!!!!
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
# F1 extract and process: select the 'f1' column, then reshape it to a
# single-column 2-D array.
f1_branch = Pipeline([
    ('extract', ColumnExtractor('f1')),
    ('to_dense', Reshaper()),
])

# Tweet text: select the 'tweet' column, count tokens, then apply TF-IDF.
tweet_branch = Pipeline([
    ('extract', ColumnExtractor('tweet')),
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])

# Both branches are combined by a FeatureUnion and fed to Naive Bayes.
# The step names ('features', 'f1', 'tweet', 'vect', 'tfidf', 'clf') are
# what the grid-search parameter keys refer to, so they must not change.
pipe = Pipeline([
    ('features', FeatureUnion([
        ('f1', f1_branch),
        ('tweet', tweet_branch),
    ])),
    ('clf', MultinomialNB()),
])


In [None]:
# Hyper-parameter grid for the search. Each keyword is a
# `<step>__<sub-step>__<param>` path into `pipe`; each value is the tuple of
# candidates to try.
# (A `vect__min_df` entry with candidates (0.1, 0.25, 0.5, 0.75, 1.0) was
# previously commented out and remains disabled.)
parameters = dict(
    features__tweet__vect__ngram_range=((1, 3), (2, 3), (1, 2)),
    features__tweet__vect__max_df=(0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.7),
    features__tweet__tfidf__use_idf=(True, False),
    features__tweet__tfidf__sublinear_tf=(True, False),
    clf__alpha=(1e-5, 1e-6),
)

In [None]:
from sklearn.grid_search import GridSearchCV

# Search every combination in `parameters` for `pipe`, fitting on the
# training split with two parallel jobs.
grid_search = GridSearchCV(estimator=pipe, param_grid=parameters, n_jobs=2)
grid_search.fit(X_train, y_train)

# Show the winning pipeline configuration
print(grid_search.best_estimator_)

In [None]:
# List, in alphabetical order, the value the best estimator ended up with
# for each parameter that was part of the search grid.
best_params = grid_search.best_estimator_.get_params(deep=True)
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_params[param_name]))

In [None]:
# Predict labels for the held-out split with the best pipeline from the search
best_model = grid_search.best_estimator_
y_pred_grid = best_model.predict(X_test)

In [None]:
# Best score recorded by the grid search, shown to three decimals
best_cv_score = grid_search.best_score_
print("Best score: %0.3f" % best_cv_score)

In [None]:
from sklearn.metrics import recall_score, precision_score, confusion_matrix, f1_score, precision_recall_curve,average_precision_score

# Evaluate the tuned model's predictions on the held-out split.
recall = recall_score(y_test, y_pred_grid)
precision = precision_score(y_test, y_pred_grid)
confusion = confusion_matrix(y_test, y_pred_grid, labels=None)
# Previously this scored `y_pred`, a name that is not defined in any visible
# cell (NameError on a fresh kernel); the predictions are in `y_pred_grid`.
f1 = f1_score(y_test, y_pred_grid)

print("Precision: "+str(precision))
print("Recall: "+str(recall))
print("\nConfusion matrix: ")
print(confusion)
print("\nF1 score: "+str(f1))