# 0 - Prelim

Load packages and run requisite tasks

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import luigi
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV, LassoLarsCV
from sklearn.metrics import accuracy_score
from nltk import word_tokenize
# Switch the working directory to the notebook's parent directory so that the
# `src` package import and the relative `data/...` paths below resolve.
# PROJECT_DIR is computed only once per kernel session and stored as an
# absolute path: re-running this cell then stays in the same directory instead
# of climbing one more level on every execution.
if 'PROJECT_DIR' not in globals():
    PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(PROJECT_DIR)
from src.data.clean import CleanData

DEBUG: Checking if CleanData() is complete
INFO: Informed scheduler that task   CleanData__99914b932b   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Worker(salt=278942626, workers=1, host=DESKTOP-6UJS098, username=wertu, pid=8132) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 1 tasks of which:
* 1 present dependencies were encountered:
    - 1 CleanData()

Did not run any tasks
This progress looks :) because there were no failed tasks or missing external dependencies

===== Luigi Execution Summary =====



True

# 1 - Introduction

This notebook will train a baseline linear logistic regression model. The features will be created using the bag-of-words technique, so the model input is a document term matrix. The document term matrix will be weighted using term frequency-inverse document frequency (tf-idf) scaling.

The logistic regression will be regularized using the l2 norm. This means that a penalty will be applied to the l2 norm of the coefficients, which shrinks the coefficients towards 0 and towards each other. The use of the l2 penalty in a regression is also known as ridge regression. The optimal regularization penalty will be determined by 10-fold cross-validation. In addition to using cross-validation to determine the regularization strength, it will also be used to select the maximum n-gram length used to build the document term matrix.

## 1.1 Prep the Data

In [None]:
# Run the CleanData luigi task with the local scheduler. This ensures that the
# cleaned train and test sets are available before they are loaded below.
luigi.build([CleanData()], local_scheduler = True)

In [None]:
# Load the train and test sets from the interim data directory.
# NOTE: joblib.load unpickles these files, so only load artifacts produced by
# this project (unpickling untrusted files can execute arbitrary code).
train = joblib.load('data/interim/train.pkl')
test = joblib.load('data/interim/test.pkl')

# 2 - N-gram selection through cross-validation

In [6]:
def validate_dtm(ngram_range):
    """Cross-validate a logistic regression on a tf-idf DTM of the training text.

    Builds a word-level tf-idf document term matrix from ``train.full_text``
    and fits a 10-fold ``LogisticRegressionCV`` against ``train.funny``.

    Parameters
    ----------
    ngram_range : int
        Zero-based offset of the longest n-gram: the vectorizer extracts
        n-grams of length 1 through ``ngram_range + 1``.

    Returns
    -------
    pandas.Series
        Column-wise mean of ``scores_[True]``, the cross-validation scores
        stored for the ``True`` class label.
        NOTE(review): presumably one mean score per candidate regularization
        strength, averaged over the folds -- confirm against the
        scikit-learn documentation for ``scores_``.
    """
    vectorizer = TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, ngram_range + 1),
        max_df=0.8,
        min_df=5,
    )
    # Relies on the notebook-level `train` frame loaded earlier.
    train_matrix = vectorizer.fit_transform(train.full_text)
    print("DTM {} has {} words".format(ngram_range, len(vectorizer.vocabulary_)))

    classifier = LogisticRegressionCV(
        cv=10,
        solver='lbfgs',
        max_iter=1000,
        n_jobs=2,
        verbose=True,
        random_state=10222017,
    )
    classifier.fit(train_matrix, train.funny)

    score_grid = pd.DataFrame(classifier.scores_[True])
    return score_grid.mean()

In [7]:
def ngram_cv(ngram_range):
    """Run ``validate_dtm`` for every n-gram offset below ``ngram_range``.

    Parameters
    ----------
    ngram_range : int
        Number of offsets to evaluate; offsets ``0 .. ngram_range - 1`` are
        passed to ``validate_dtm`` in increasing order.

    Returns
    -------
    dict
        Maps each offset to the value returned by ``validate_dtm`` for it.
        Empty when ``ngram_range`` is zero or negative.
    """
    scores_by_offset = {}
    for offset in range(ngram_range):
        # Announce each run up front; a single fit can take a long time.
        print("Starting DTM {}".format(offset))
        scores_by_offset[offset] = validate_dtm(offset)
    return scores_by_offset

In [8]:
# Cross-validate DTMs for n-gram offsets 0-4 (maximum n-gram lengths 1-5).
# Slow: the recorded runs took from roughly 16 to roughly 120 minutes per DTM.
results = ngram_cv(5)

Starting DTM 0
DTM 0 has 26667 words


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed: 16.5min finished


Starting DTM 1
DTM 1 has 223483 words


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed: 53.1min finished


Starting DTM 2
DTM 2 has 472114 words


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed: 82.5min finished


Starting DTM 3
DTM 3 has 654041 words


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed: 103.5min finished


Starting DTM 4
DTM 4 has 792235 words


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed: 119.6min finished


In [9]:
# Display the mean cross-validation scores, keyed by n-gram offset.
results

{0: 0    0.538029
 1    0.600718
 2    0.611911
 3    0.634725
 4    0.661174
 5    0.664074
 6    0.655812
 7    0.649206
 8    0.645745
 9    0.644843
 dtype: float64, 1: 0    0.513502
 1    0.592812
 2    0.605269
 3    0.633961
 4    0.676118
 5    0.685680
 6    0.666393
 7    0.645414
 8    0.633573
 9    0.628308
 dtype: float64, 2: 0    0.513456
 1    0.584642
 2    0.601120
 3    0.632426
 4    0.677459
 5    0.688051
 6    0.670043
 7    0.652799
 8    0.643864
 9    0.638645
 dtype: float64, 3: 0    0.513451
 1    0.576374
 2    0.598414
 3    0.632630
 4    0.678055
 5    0.687668
 6    0.670624
 7    0.654430
 8    0.645628
 9    0.640719
 dtype: float64, 4: 0    0.513446
 1    0.570482
 2    0.597318
 3    0.633884
 4    0.678157
 5    0.687464
 6    0.670506
 7    0.654772
 8    0.646270
 9    0.640429
 dtype: float64}

In [10]:
from src.features.dtm import CreateDTM

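It seems that a maximum n-gram length of 3 is best: DTM 2 (n-grams of length 1 to 3) reaches the highest mean cross-validated score (about 0.688), and adding longer n-grams (DTM 3 and DTM 4) gives no further improvement.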

In [2]:
from math import log

In [3]:
# Base-2 logarithm of 470000.
# NOTE(review): 470000 presumably rounds the 472114-term vocabulary of DTM 2 -- confirm.
log(470000,2)

18.842301231227086

In [10]:
# Square root of 470000.
470000 ** 0.5

685.5654600401044

In [1]:
# The square root of 470000 (about 685.5) expressed as a fraction of 470000.
(685.5/470000)

0.0014585106382978723

In [2]:
# The base-2 logarithm of 470000 (about 18.8) expressed as a fraction of 470000.
(18.8/470000)

4e-05

In [9]:
# Absolute count corresponding to a fraction of 0.00001 (0.001%) of 470000.
470000 * 0.00001

4.7

In [3]:
# Ratio of the rounded square root (685) to the rounded base-2 logarithm (19).
685 / 19

36.05263157894737