In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Select the "paper" context in the same call as the rc override. A separate
# sns.set_context("paper") issued afterwards rewrites every context key,
# "grid.linewidth" included, which silently discarded the 0.1 requested here;
# sns.set applies `rc` last, so the thin grid lines now take effect.
sns.set(context="paper", style="whitegrid", rc={"grid.linewidth": 0.1})
# Names of the reaction classes.
# NOTE(review): presumably the list order matches the integer labels stored in
# `highest_reaction` - confirm against the preprocessing step.
y_labels = ["love", "haha", "wow", "angry", "sad"]

# Read the preprocessed dataset and replace its index with a fresh 0..n-1 range.
dataset = pd.read_json("data/preprocessed.json").reset_index(drop=True)
dataset.shape

(9072, 29)

## Notes

- We found that a simple classifier could deal with the problem of predicting a reaction. At least 7 out of 10 headlines will be correctly classified.
- Again, note that this predictive model is intended to serve as a benchmark for future work.

## Final Classifier

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import svm
from sklearn.pipeline import Pipeline

# Hyper-parameters of the final model, keyed as "<pipeline step>__<argument>".
params = {
    'vect__ngram_range': (1, 2),
    'tfidf__use_idf': True,
    'clf__C': 0.3,  # same float as the 0.29999999999999999 literal used before
    'vect__min_df': 2,
    'vect__max_df': 1.0,
}

# Token / n-gram counts.
vectorizer = CountVectorizer(
    ngram_range=params['vect__ngram_range'],
    min_df=params['vect__min_df'],
    max_df=params['vect__max_df'],
)
# Re-weights the raw counts (TF-IDF).
transformer = TfidfTransformer(use_idf=params['tfidf__use_idf'])
# LinearSVC's solver draws random numbers while fitting; pin the seed so the
# fitted (and later pickled) model is identical on every Restart & Run All.
learning_model = svm.LinearSVC(C=params['clf__C'], random_state=0)

In [21]:
# Chain n-gram counting, TF-IDF weighting and the linear SVM into a single
# estimator, then train it on every row of `dataset`.
pipe_clf = Pipeline(steps=[
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('clf', learning_model),
])

pipe_clf.fit(
    dataset.loc[:, "preprocessed_stem_stop"],
    dataset.loc[:, "highest_reaction"],
)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

## Test data

Testing with unseen data from August 29th to September 4th.

In [78]:
def slice_dataset(df, start_time, end_time):
    return df[(df['time_created'] >= start_time) & (df['time_created'] <= end_time)]
    
# Rows used for the evaluation in the following cells.
# NOTE(review): the notebook text describes this as unseen data from
# August 29th to September 4th, but the window here is 2016-07-15..2016-08-30
# and is cut from the same `dataset` the pipeline was fitted on - confirm.
dataset_sliced = slice_dataset(dataset, start_time="2016-07-15", end_time="2016-08-30")

In [79]:
# Ground-truth labels for the selected rows.
y_true = dataset_sliced.loc[:, "highest_reaction"]

In [80]:
# Labels the fitted pipeline assigns to the same rows.
y_predicted = pipe_clf.predict(
    dataset_sliced.loc[:, "preprocessed_stem_stop"]
)

In [82]:
# Seed NumPy's global RNG (nothing in this cell draws random numbers;
# presumably this is for sampling the table later - TODO confirm).
np.random.seed(0)

# The `name` column next to the predicted and the true label.
d = {
    "Headline": dataset_sliced["name"],
    "$y_{pred}$": y_predicted,
    "$y_{true}$": y_true,
}

table_test = pd.DataFrame(
    d,
    columns=["Headline", "$y_{pred}$", "$y_{true}$"],
)

In [94]:
# Ad-hoc prediction for a single hand-written headline.
# NOTE(review): the pipeline was fitted on the `preprocessed_stem_stop` column,
# while this raw string is passed in as-is - confirm that skipping the
# preprocessing step is intended.
headlines = ["isis"]
pipe_clf.predict(headlines)

array([0])

## Pickle

In [None]:
# Persist the fitted pipeline so it can be reloaded without retraining.
import os

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency) and
# fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump does not create missing parent directories.
os.makedirs('output', exist_ok=True)
joblib.dump(pipe_clf, 'output/final_model.pkl')