In [1]:
import pandas as pd
import numpy as np
import pickle
import itertools
import functools
import collections
import random

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.utils.fixes import signature
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score


from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from gensim.utils import simple_preprocess

import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
# Labelled frame: supplies the 'label' target and the text columns that
# process() combines into the 'train' column further down.
df_eq = pd.read_pickle('../dataframes/df_eq_label.pkl')

In [3]:
# Second frame (the "clean" one, going by the file name). In the cells
# visible here it is only referenced from commented-out code.
df = pd.read_pickle('../dataframes/df_clean.pkl')

In [4]:
# Text preparation: drop incomplete rows and build the combined 'train' column

def process(df):
    """Build the combined text column used as model input.

    Rows missing any of 'body', 'headline' or 'summary' are dropped, the
    per-row 'categories' collection is joined into one '. '-separated string,
    and a new 'train' column is built as
    '<categories>. <headline>. <summary> <body>'.

    The input frame is left untouched and a processed copy is returned.
    Mutating the argument in place meant that processing the same frame twice
    joined the individual characters of the already-joined 'categories'
    string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'categories', 'headline', 'summary' and 'body' columns.
        'categories' holds an iterable of strings per row, or an
        already-joined string.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` without the incomplete rows, with 'categories' joined
        and the 'train' column added.
    """
    # All three text fields are required. Dropping rows with any NA in the
    # subset is what thresh=3 over a three-column subset expresses.
    out = df.dropna(subset=['body', 'headline', 'summary']).copy()

    # Leave strings alone so that an already-processed frame passes through
    # unchanged instead of having its characters joined.
    out['categories'] = out['categories'].apply(
        lambda x: x if isinstance(x, str) else '. '.join(x))

    out['train'] = [
        t + '. ' + h + '. ' + s + ' ' + b
        for t, h, s, b in zip(out['categories'], out['headline'],
                              out['summary'], out['body'])
    ]

    return out

In [5]:
#df = process(df)
# Add the combined 'train' text column to the labelled frame.
# NOTE: df_eq is rebound to the processed result, so re-running this cell
# feeds already-processed data back into process(); reload df_eq first.
df_eq = process(df_eq)

In [6]:
# Model inputs as plain Python lists: X is the combined text per document,
# y the label of the same row, so X[i] pairs with y[i].
X = list(df_eq['train'])
y = list(df_eq['label'])

In [7]:
# Stratified test/train split

def strat_test_train(X, y, test_size, random_state=42):
    """Split X and y once into stratified train and test parts.

    Parameters
    ----------
    X : sequence
        Samples, indexable by integer position.
    y : sequence
        Labels aligned with `X`; the split is stratified on them.
    test_size : float or int
        Passed through to StratifiedShuffleSplit.
    random_state : int, optional
        Seed for the shuffle. Defaults to 42, the value that used to be
        hard-coded, so existing calls produce the same split.

    Returns
    -------
    tuple of lists
        (X_train, y_train, X_test, y_test) -- note that this is not the
        X_train, X_test, y_train, y_test order of sklearn's train_test_split.
    """
    strat = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                   random_state=random_state)

    # n_splits=1 produces exactly one (train, test) index pair, so take it
    # directly instead of looping over a single iteration.
    train_index, test_index = next(strat.split(X, y))

    def take(seq, indices):
        # Materialise the selected positions as a plain list.
        return [seq[i] for i in indices]

    return (take(X, train_index), take(y, train_index),
            take(X, test_index), take(y, test_index))

In [8]:
# Hold out 20% of the documents as a stratified test set.
# Mind the return order: X_train, y_train, X_test, y_test.
X_train, y_train, X_test, y_test = strat_test_train(X, y, 0.2)

In [9]:
# Read corpus function

def read_corpus(data):
    """Lazily turn raw texts into TaggedDocuments.

    Each text in `data` is tokenised with simple_preprocess and yielded as a
    TaggedDocument whose single tag is the text's position in `data`
    (counting from 0 on every call).
    """
    position = 0
    for text in data:
        tokens = simple_preprocess(text)
        yield TaggedDocument(tokens, tags=[position])
        position += 1

In [10]:
# Tokenise the raw training texts into TaggedDocuments.
# NOTE: X_train is rebound from raw strings to tagged documents, so running
# this cell a second time would pass TaggedDocuments, not strings, to
# read_corpus.
X_train = list(read_corpus(X_train))

In [11]:
# Same for the test texts. Tags start at 0 again, so they overlap with the
# training tags; the later cells only read the .words of each document.
X_test = list(read_corpus(X_test))

In [12]:
#all_data = list(read_corpus(list(df['train'])))

In [13]:
# d2v function

def doc2vec(data, vector_size, min_count, epochs, window):
    """Create a Doc2Vec model and train it on `data`.

    `data` is handed to both build_vocab and train, so it has to be
    re-iterable (e.g. a list of TaggedDocuments), not a one-shot generator.
    The four hyper-parameters are passed to the Doc2Vec constructor under the
    same names. Returns the trained model.
    """
    d2v = Doc2Vec(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
    )

    d2v.build_vocab(data)

    # The document count and epoch count are read back from the model itself.
    n_docs = d2v.corpus_count
    n_epochs = d2v.epochs
    d2v.train(data, total_examples=n_docs, epochs=n_epochs)

    return d2v

In [14]:
# Train doc2vec model (the call below uses the training documents, X_train; run once, then load the saved pickle in the next cell):

# d2v_model = doc2vec(X_train, vector_size=400, min_count=2, epochs=20, window=5)

In [15]:
# with open('../models/d2v_model.pkl', 'wb') as f:
#         pickle.dump(d2v_model, f)

# Load the previously saved Doc2Vec model instead of retraining it.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load model files produced by this project.
with open('../models/d2v_model.pkl', 'rb') as f:
    d2v_model = pickle.load(f)

In [16]:
def embeddings(model, X, steps):
    """Infer one vector per document in `X`.

    `model` must offer infer_vector(words, steps=...); `X` is an indexable,
    sized collection whose items expose a `.words` attribute. Returns a list
    with one inferred vector per item, in the order of `X`.

    NOTE(review): newer gensim releases are believed to name this keyword
    `epochs` instead of `steps` -- confirm before upgrading gensim.
    """
    vectors = []
    for position in range(len(X)):
        tokens = X[position].words
        vectors.append(model.infer_vector(tokens, steps=steps))
    return vectors

In [17]:
# Replace the tagged training documents by their inferred vectors
# (30 inference steps each). X_train changes type again: tagged documents
# in, list of vectors out -- do not re-run without re-running the cells above.
X_train = embeddings(d2v_model, X_train, 30)

In [18]:
# Same conversion for the test documents, using the same model and step count.
X_test = embeddings(d2v_model, X_test, 30)

In [19]:
# Sanity check: there must be exactly one label per training vector.
len(X_train), len(y_train)

(1736, 1736)

In [20]:
# Pin gamma explicitly. Left unset, this scikit-learn version falls back to
# the deprecated implicit default (shown as 'auto_deprecated' in the recorded
# output), and from scikit-learn 0.22 on the default becomes 'scale' -- the
# same notebook would then fit a different model. gamma='auto'
# (1 / n_features) is the value the recorded run actually used.
clf = svm.SVC(gamma='auto')
clf.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [21]:
# Predicted labels for the held-out test vectors.
y_pred = clf.predict(X_test)

In [22]:
# Test-set metrics. The one-letter names are kept because the display cells
# that follow read them.
p = precision_score(y_test, y_pred)
r = recall_score(y_test, y_pred)
a = accuracy_score(y_test, y_pred)

In [23]:
# Accuracy on the test set. Read it together with the confusion matrix
# further down: in the recorded run 385 of the 435 test documents are
# negatives, so predicting "negative" for everything would already score
# about 0.885.
a

0.8896551724137931

In [24]:
# Recall on the test set. In the recorded run only 2 of the 50 positive
# documents are found (0.04): the classifier almost never predicts the
# positive class.
r

0.04

In [25]:
# Precision on the test set. The recorded 1.0 rests on just 2 positive
# predictions, so it says little on its own.
p

1.0

In [26]:
# Confusion matrix of the test predictions (rows: true class, columns:
# predicted class).
c = confusion_matrix(y_test, y_pred)

In [27]:
# Recorded run: 385 true negatives, 0 false positives, 48 false negatives,
# 2 true positives.
c

array([[385,   0],
       [ 48,   2]])