In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("paper")
y_labels = ["love", "haha", "wow", "angry", "sad"]

dataset = pd.read_json("data/preprocessed.json")
dataset = dataset.reset_index(drop=True)
dataset.shape

(9072, 29)

## Notes

- We found that a simple classifier could deal with the problem of predicting a reaction. At least 7 out of 10 headlines will be correctly classified.
- I would again like to point out that this predictive model will serve as a benchmark for future works.

## Final Classifier

**This one won't give the vector of probability estimates $P(y|x)$**

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import svm
from sklearn.pipeline import Pipeline

params = {'vect__ngram_range': (1, 2), 'tfidf__use_idf': True,
          'clf__C': 0.29999999999999999, 'vect__min_df': 2, 'vect__max_df': 1.0}

vectorizer = CountVectorizer(ngram_range=params['vect__ngram_range'],
                             min_df=params['vect__min_df'], max_df=params['vect__max_df'])
transformer = TfidfTransformer(use_idf=params['tfidf__use_idf'])
learning_model = svm.LinearSVC(C=params['clf__C'])

Size of the feature set

In [16]:
vectorizer.fit_transform(dataset["preprocessed_stem_stop"]).shape

(9072, 9302)

Pipeline

In [8]:
pipe_clf = Pipeline([('vect', vectorizer),
                     ('tfidf', transformer),
                     ('clf', learning_model)])

pipe_clf.fit(dataset["preprocessed_stem_stop"], dataset["highest_reaction"])

from sklearn.externals import joblib
joblib.dump(pipe_clf, 'output/final_model.pkl') 

['output/final_model.pkl',
 'output/final_model.pkl_01.npy',
 'output/final_model.pkl_02.npy',
 'output/final_model.pkl_03.npy',
 'output/final_model.pkl_04.npy',
 'output/final_model.pkl_05.npy']

**This one can compute the vector of probability estimates $P(y|x)$**

In [17]:
params = {'vect__ngram_range': (1, 2), 'vect__max_df': 1.0,
          'clf__n_iter': 10000, 'clf__loss': 'modified_huber',
          'tfidf__use_idf': True, 'vect__min_df': 1, 'clf__penalty': 'l2', 'clf__alpha': 1e-05}

vectorizer = CountVectorizer(ngram_range=params['vect__ngram_range'],
                             min_df=params['vect__min_df'], max_df=params['vect__max_df'])
transformer = TfidfTransformer(use_idf=params['tfidf__use_idf'])

from sklearn.linear_model import SGDClassifier
learning_model = SGDClassifier(n_iter=params['clf__n_iter'], loss=params['clf__loss'],
                               penalty=params['clf__penalty'], alpha=params['clf__alpha'])

Size of the feature set

In [18]:
vectorizer.fit_transform(dataset["preprocessed_stem_stop"]).shape

(9072, 49237)

Pipeline

In [10]:
pipe_sgd_clf = Pipeline([('vect', vectorizer),
                     ('tfidf', transformer),
                     ('clf', learning_model)])

pipe_sgd_clf.fit(dataset["preprocessed_stem_stop"], dataset["highest_reaction"])
from sklearn.externals import joblib
joblib.dump(pipe_sgd_clf, 'output/final_model_prob.pkl') 

['output/final_model_prob.pkl',
 'output/final_model_prob.pkl_01.npy',
 'output/final_model_prob.pkl_02.npy',
 'output/final_model_prob.pkl_03.npy',
 'output/final_model_prob.pkl_04.npy',
 'output/final_model_prob.pkl_05.npy',
 'output/final_model_prob.pkl_06.npy']