In [1]:
# Adds logging
import logging
logger = logging.getLogger('natural_language_analysis')
hdlr = logging.FileHandler('natural_language_analysis.log')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
logger.setLevel(logging.DEBUG)

def cell_log(number, name=None):
    if name == None:
        logger.info("Cell no: {}".format(number))
    else:
        logger.info("Cell name: {} number {}".format(name, number))

import numpy as np
import pandas as pd

data_imported = pd.read_csv("data/kickstart_join_result.csv", sep="\t", error_bad_lines=False, encoding="utf-8" )
data_imported.head(10)
cell_log(0, "imports")

b'Skipping line 25531: expected 18 fields, saw 19\n'
b'Skipping line 231615: expected 18 fields, saw 19\n'


In [2]:
cell_log(1)
def remove_those_nan(string):
    if isinstance(string, float):
        if np.isnan(string):
            print("That's a nan")
        return False
    else:
        return True

#print(data_imported[data_imported["blurb"].apply(is_not_ascii)])
data_imported_nonan = data_imported.copy()
data_imported_nonan.loc[:, "blurb"] = data_imported_nonan.blurb.dropna(how="any")
data_imported_nonan = data_imported_nonan[data_imported_nonan.blurb.apply(remove_those_nan)]
print(data_imported_nonan.shape[0])
print(data_imported.shape[0])

That's a nan
That's a nan
That's a nan
248408
248411


## Goals for this notebook
<ul>
    <li>Use a naive bayes algorithm to predict the successfulness based on the blurb</li>
    <ul>
        <li>If this works I will attempt to extract the most sensitive words for this</li>
        <li>I would also give it a try to see if I can run an algorithm on the categories</li>
    </ul>
    <li>I would also like to give a try on what categories each belongs to</li>
</ul>

In [3]:
cell_log(2, "imports for nlp")
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report

# Bring in the stemmer and grab the stopwords
import nltk
from nltk.stem.snowball import EnglishStemmer
from nltk import word_tokenize
nltk.download("stopwords")
nltk.download('punkt')

import re

# Split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Encode y
Encoder = LabelEncoder()
y_encode = Encoder.fit_transform(data_imported_nonan.state)
X_train, X_test, y_train, y_test = train_test_split(data_imported_nonan.blurb, 
                                                    y_encode, 
                                                    test_size=0.2, 
                                                    random_state=42)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
cell_log(3, "tokenizer")
stemmer = EnglishStemmer(ignore_stopwords=True)
def stem_tokenizer(text):
    words = word_tokenize(text)
    to_return = []
    for word in words:
        to_return.append(stemmer.stem(word))
    logger.debug("Vocabulary: {}".format(' '.join(to_return)))
    return to_return

vect = HashingVectorizer(decode_error='ignore', n_features=2**25, tokenizer=stem_tokenizer, alternate_sign=False)

In [None]:
cell_log(4)
# Obtain the vectorized data
X_train_vect = vect.transform(X_train)


In [6]:
cell_log(5, "First bayes")
bayes_clf = MultinomialNB()
param_range = [0.001, 0.003, 0.01, 0.03, 0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 1, 2, 3, 4, 10, 100, 1.5, 3.5, 2.5]
param_grid = [{'alpha': param_range}]

# Searches for best parameters
logger.info("In we go")
gs = GridSearchCV(estimator=bayes_clf,
                  param_grid=param_grid,
                  scoring='roc_auc',
                  cv=10,
                  n_jobs=-1)
gs.fit(X_train_vect, y_train)
logger.info(gs.best_score_)
logger.info(gs.best_params_)

In [7]:
cell_log(6, "First bayes classification")
X_test_vect = vect.transform(X_test)
y_pred = gs.best_estimator_.predict(X_test_vect)
logger.info(classification_report(y_test, y_pred))

# So Naive Bayes Failed....
This isn't horrible, the fact that it failed can be due to two factors:
1. The premise that $P(A \cap B ) = P(A)P(B)$ is untrue, this is most likely
2. The hasing vectorizer, (which upon collision makes the numbers negative) I had to stop, this leads to the hashing vectorizer not giving the right count for each word.
I can try to fix this by using TF-IDF, I doubt it will however.

In [None]:
cell_log(7)
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
cell_log(8)
vect = TfidfVectorizer(tokenizer=stem_tokenizer, min_df=100, ngram_range=(1, 2))
X_train_vect = vect.fit_transform(X_train)

logger.info("Word tokenizer results: {}\n Word tokenizer count {}".format(vect.vocabulary_, X_train_vect))

In [11]:
cell_log(9, "MulinomialNB: alpha train")
param_range = [0.001, 0.003, 0.01, 0.03, 0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 1, 2, 3, 4, 10, 100, 1.5, 3.5, 2.5]
param_grid = [{'alpha': param_range}]
# Searches for best parameters
logger.info("In we go")
bayes_clf = MultinomialNB()
gs = GridSearchCV(estimator=bayes_clf,
                  param_grid=param_grid,
                  scoring='roc_auc',
                  cv=10,
                  n_jobs=-1)
gs.fit(X_train_vect, y_train)
logger.info(gs.best_score_)
logger.info(gs.best_params_)
X_test_vect = vect.transform(X_test)
y_pred = gs.best_estimator_.predict(X_test_vect)
logger.info(classification_report(y_test, y_pred))

# That didn't go great. On to better algorithms!
I will give logistic regression a chance, it is usually a pretty good algorithm in general. I could also try SVC. For selecting algorithms, it is usually better to try with a k-folds approach.

In [None]:
cell_log(10)

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [None]:
cell_log(11, "Generate new classifiers")
forest = RandomForestClassifier(n_jobs=-1)
svc = SVC()
log_reg = LogisticRegression(n_jobs=-1)

In [None]:
cell_log(12, "Setup scoring")
# I will stick with the tf-idf
def fit_return_best_model(estimator, param_grid, n_jobs=1):
    gs = GridSearchCV(estimator=estimator, 
                      param_grid=param_grid,
                      n_jobs=n_jobs,
                      return_train_score = True,
                      cv=3)
    gs.fit(X_train_vect, y_train)
    estimator = gs.best_estimator_
    logger.info(gs.best_params_)
    return gs

In [None]:
cell_log(13, "Params")
forest_pg = {"n_estimators":[10, 100, 300], "max_depth": [10, 100, 150, None]}
svc_pg = {"C": [0.1, 1.0, 10.0, 100.0], "max_iter":[300], "gamma": [0.001, 0.01, 0.1, 1.0, 10.0], "kernel":["rbf"]}
log_pg = {"C": [0.1, 1.0, 10.0, 100.0], "max_iter": [100, 300]}

In [12]:
cell_log(14, "Forest training")
gs = GridSearchCV(estimator=forest, 
                  param_grid=forest_pg,
                  n_jobs=-1,
                  return_train_score=True,
                  cv=3)
gs.fit(X_train_vect, y_train)
estimator = gs.best_estimator_
logger.info(gs.best_params_)
logger.info(gs.cv_results_)
gs.cv_results_

{'mean_fit_time': array([7.97413031e-01, 6.30892936e+00, 3.15896173e+01, 6.87134093e+01,
        5.13363895e+02, 1.26000599e+03, 1.04533684e+02, 7.33956755e+02,
        1.45842115e+03, 1.59863864e+02, 9.39351279e+02, 1.28711966e+03]),
 'std_fit_time': array([  0.25798916,   0.83448468,   0.5537277 ,   0.44119494,
          1.78828608,  12.56800008,   2.74674405,   4.06843871,
         10.7118102 ,   0.72567371,  19.34078841, 102.09317185]),
 'mean_score_time': array([0.30627767, 1.99882452, 4.80826688, 1.92491484, 5.93891319,
        7.42606115, 2.23810109, 6.6667645 , 5.95167271, 2.64628943,
        6.64320135, 3.16917833]),
 'std_score_time': array([0.02940879, 0.19256987, 0.21828859, 0.07657123, 0.38232042,
        0.23759083, 0.1432155 , 0.18600764, 0.41217302, 0.05358285,
        0.40640357, 1.10644071]),
 'param_max_depth': masked_array(data=[10, 10, 10, 100, 100, 100, 150, 150, 150, None, None,
                    None],
              mask=[False, False, False, False, False, Fal

In [13]:
cell_log(15, "SVC")
gs_svc = fit_return_best_model(svc, svc_pg, -1)
logger.info(gs_svc.cv_results_)
gs_svc.cv_results_





{'mean_fit_time': array([20.87523071, 21.1697851 , 20.24706546, 21.19583861, 19.66412592,
        20.63612151, 20.17576877, 19.34408275, 18.10261766, 18.17180912,
        20.1820941 , 19.82349141, 19.18231058, 16.95996038, 17.47174017,
        18.92060057, 20.03993336, 17.29694867, 17.13173135, 16.55204344]),
 'std_fit_time': array([2.03600352, 2.08406011, 2.58865191, 2.07477121, 1.61075814,
        1.53934527, 1.3614882 , 2.36957229, 1.50939254, 0.63023183,
        1.09978926, 0.4276369 , 0.56362476, 2.35314076, 0.2115574 ,
        0.78977952, 1.12394386, 1.96730759, 1.96828418, 0.62362278]),
 'mean_score_time': array([8.84462969, 8.85943921, 8.51179187, 8.85028982, 7.98412665,
        8.19784896, 7.94473767, 7.52394978, 7.14724469, 7.40476068,
        8.58941205, 7.8695004 , 7.65068849, 6.76215172, 7.01587081,
        8.12741764, 7.33620834, 6.46826212, 5.65333581, 6.8031679 ]),
 'std_score_time': array([0.99186768, 1.11887186, 1.36358507, 1.25919129, 0.99486159,
        0.51658757, 

In [14]:
cell_log(16, "Logistic regression")
gs_log= fit_return_best_model(log_reg, log_pg, -1)
logger.info(gs_log.cv_results_)

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


In [15]:
logger.info("grid search SVC cv results")
logger.info(gs_svc.cv_results_)
logger.info("grid search logistic cv results")
logger.info(gs_log.cv_results_)