In [None]:
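# This notebook grid-searches the regularization parameter C of a logistic
# regression classifier for the `toxic` label, using 10-fold stratified
# cross-validation scored by negative log loss.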

In [1]:
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Load the train and test splits from CSV files in the working directory.
train, test = (read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Name of the column holding the raw comment text.
COMMENT = 'comment_text'

# Replace missing comments with a placeholder so the vectorizer never sees NaN.
# The result is assigned back to the column on purpose: calling
# `frame[COMMENT].fillna(..., inplace=True)` mutates a temporary Series, which
# triggers a chained-assignment warning on recent pandas and leaves the frame
# unchanged once Copy-on-Write is enabled.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

In [4]:
import re
import string

# Matches any single punctuation character (ASCII punctuation plus a handful of
# typographic symbols) and captures it so it can be padded with spaces.
# `string.punctuation` is passed through `re.escape` because it contains
# characters that are special inside a character class (`\`, `]`, `^`, `-`).
# Interpolated raw, its `\]` is read as an escaped bracket, so the backslash
# itself is silently left out of the set.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split text into whitespace-delimited tokens, with punctuation as its own token.

    Every punctuation character matched by ``re_tok`` is surrounded with spaces
    and the result is split on whitespace.

    Args:
        s: The text to tokenize (a ``str``).

    Returns:
        A list of token strings; an empty list for empty or whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [5]:
# TF-IDF features over unigrams and bigrams produced by the custom tokenizer.
# The three flags are passed as real booleans: scikit-learn's parameter
# validation deprecated integer stand-ins (1/0) for boolean parameters and
# newer releases reject them outright.
vec = TfidfVectorizer(
    ngram_range=(1, 2),       # unigrams and bigrams
    tokenizer=tokenize,
    min_df=3,                 # drop terms seen in fewer than 3 documents
    max_df=0.9,               # drop terms seen in more than 90% of documents
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,        # use 1 + log(tf) instead of raw term frequency
)

In [6]:
# Fit the vocabulary and IDF weights on the training comments only, then
# project the test comments onto that same fitted vocabulary.
train_comments = train[COMMENT]
trn_term_doc = vec.fit_transform(train_comments)

test_comments = test[COMMENT]
test_term_doc = vec.transform(test_comments)

In [8]:
# toxic
# Feature matrix for the `toxic` task. This is an alias for (not a copy of)
# the TF-IDF matrix computed from the training comments.
toxic_x = trn_term_doc

In [12]:
# Target labels: the column at position 2 of the training frame (presumably
# the `toxic` column, going by the notebook's "# toxic" marker -- confirm
# against the CSV header). The column is selected with .iloc before converting
# to an array; `train.values[:, 2]` would first materialize the whole frame,
# comment text included, as a single array just to slice one column out of it.
toxic_y = train.iloc[:, 2].values

In [13]:
# Encode the raw labels as consecutive integer class ids.
toxic_label_encoder = LabelEncoder()
label_encoded_y = toxic_label_encoder.fit_transform(toxic_y)

In [14]:
# Plain logistic regression is the estimator being tuned; the earlier
# XGBClassifier experiment stays disabled.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

# Candidate values for C (inverse regularization strength): integers 1..19.
C = range(1, 20)
param_grid = {"C": C}

In [65]:
# 10-fold stratified cross-validation; shuffling with a fixed seed keeps the
# fold assignment reproducible between runs.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# Exhaustive search over `param_grid`, scored by negative log loss (higher is
# better) and run on all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    n_jobs=-1,
    cv=kfold,
)

In [66]:
# Run the search. `fit` returns the fitted search object itself, so
# `grid_result` refers to the same object as `grid_search`.
grid_result = grid_search.fit(X=toxic_x, y=label_encoded_y)

In [67]:
# Report the best mean cross-validated score and the parameters that produced it.
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: -0.102855 using {'C': 10}


In [68]:
# Print one line per candidate: mean CV score, its standard deviation across
# folds, and the parameter setting that was evaluated.
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[column]
    for column in ('mean_test_score', 'std_test_score', 'params')
)
for candidate in zip(means, stds, params):
    print("%f (%f) with: %r" % candidate)

-0.123175 (0.001636) with: {'C': 1}
-0.112540 (0.001810) with: {'C': 2}
-0.108126 (0.001950) with: {'C': 3}
-0.105824 (0.002062) with: {'C': 4}
-0.104486 (0.002157) with: {'C': 5}
-0.103702 (0.002239) with: {'C': 6}
-0.103236 (0.002314) with: {'C': 7}
-0.102983 (0.002379) with: {'C': 8}
-0.102870 (0.002439) with: {'C': 9}
-0.102855 (0.002494) with: {'C': 10}
-0.102912 (0.002545) with: {'C': 11}
-0.103018 (0.002592) with: {'C': 12}
-0.103161 (0.002638) with: {'C': 13}
-0.103332 (0.002680) with: {'C': 14}
-0.103523 (0.002718) with: {'C': 15}
-0.103728 (0.002756) with: {'C': 16}
-0.103948 (0.002789) with: {'C': 17}
-0.104173 (0.002823) with: {'C': 18}
-0.104411 (0.002858) with: {'C': 19}
