In [24]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import pickle


In [2]:
df_val = pd.read_csv('../tmp/prepared_data/prepared_val.csv')
df_train = pd.read_csv('../tmp/prepared_data/prepared_train.csv')

In [3]:
X_train = list(df_train.description.astype(str))
X_val = list(df_val.description.astype(str))
y_train = df_train.is_bad
y_val = df_val.is_bad

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state = 42)

In [15]:
tfv = TfidfVectorizer(strip_accents='unicode', analyzer='char_wb', ngram_range=(1, 5), use_idf=1, smooth_idf=1, sublinear_tf=1)

In [16]:
%%time
tfv.fit(X_train)

Wall time: 30min 25s


TfidfVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 5), norm='l2', preprocessor=None,
                smooth_idf=1, stop_words=None, strip_accents='unicode',
                sublinear_tf=1, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=1, vocabulary=None)

In [17]:
%%time
X_train_tfv = tfv.transform(X_train)
X_test_tfv = tfv.transform(X_test)
X_val_tfv = tfv.transform(X_val)

Wall time: 36min 27s


In [18]:
clf = LogisticRegression(C=23.0, n_jobs=-1)

In [33]:
pickle.dump(X_train_tfv, open('X_train_tfv.pkl', 'wb'))

In [34]:
pickle.dump(X_test_tfv, open('X_test_tfv.pkl', 'wb'))
pickle.dump(X_val_tfv, open('X_val_tfv.pkl', 'wb'))

In [19]:
clf.fit(X_train_tfv, y_train)

LogisticRegression(C=23.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
y_pred_test = clf.predict(X_test_tfv)
metrics.roc_auc_score(y_test, y_pred_test)

0.9143463278489553

In [21]:
y_pred_val = clf.predict(X_val_tfv)
metrics.roc_auc_score(y_val, y_pred_val)

0.8645906399883783

In [22]:
y_pred_train = clf.predict(X_train_tfv)
metrics.roc_auc_score(y_train, y_pred_train)

0.9288085159282982

In [25]:
pickle.dump(clf, open('TFIDF_char_wb-1-5_LogReg_C-23.pkl', 'wb'))