In [2]:
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
import lightgbm as lgb
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import sys

In [3]:
train = pd.read_csv('../data/train.csv')

In [5]:
X_columns = list()
for i, j in zip(train.columns, train.dtypes):
    if j in ['int64', 'float64'] and i not in ['target', 'id']:
        X_columns.append(i)

X_columns[:5]

['severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']

In [28]:
X_train = train[X_columns]
y_train = np.where(train['target'] >= 0.5, 1, 0)

X_train.head()

Unnamed: 0,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,...,parent_id,article_id,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,0.0,0.0,0.0,0.0,0.0,,,,,,...,,2006,0,0,0,0,0,0.0,0,4
1,0.0,0.0,0.0,0.0,0.0,,,,,,...,,2006,0,0,0,0,0,0.0,0,4
2,0.0,0.0,0.0,0.0,0.0,,,,,,...,,2006,0,0,0,0,0,0.0,0,4
3,0.0,0.0,0.0,0.0,0.0,,,,,,...,,2006,0,0,0,0,0,0.0,0,4
4,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,...,,2006,0,0,0,1,0,0.0,4,47


In [29]:
y_train[:5]

array([0, 0, 0, 0, 1])

In [30]:
k = 3

train_ids = X_train.index
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
skf.get_n_splits(train_ids, y_train)

3

In [31]:
lgb_model = lgb.LGBMClassifier(metric='auc')

ft_importances = np.zeros(X_train.shape[1])

counter = 1
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train[train_index], y_train[test_index]

    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val)],
                  verbose=100,
                  early_stopping_rounds=100)

    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()

#     print('Guardamos el modelo')
#     joblib.dump(lgb_model, 'saved_models/{}_{}.pkl'.format(model_name, counter))

    ft_importances += lgb_model.feature_importances_

    counter += 1
    
    
imp = pd.DataFrame({'feature': X_train.columns, 'importance': ft_importances/k})
df_imp_sort = imp.sort_values('importance', ascending=False)

df_imp_sort.head(20)

Fold 1

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.997977
Did not meet early stopping. Best iteration is:
[80]	valid_0's auc: 0.99798
Fold 2

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.997855
Did not meet early stopping. Best iteration is:
[77]	valid_0's auc: 0.998025
Fold 3

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.998012
Did not meet early stopping. Best iteration is:
[83]	valid_0's auc: 0.998012


Unnamed: 0,feature,importance
3,insult,456.0
2,identity_attack,388.0
4,threat,342.666667
37,sexual_explicit,314.0
1,obscene,307.666667
39,toxicity_annotator_count,219.0
30,parent_id,65.666667
31,article_id,48.0
0,severe_toxicity,41.666667
35,likes,27.666667


In [21]:
df_imp_sort.tail(10)

Unnamed: 0,feature,importance
6,atheist,4.666667
15,intellectual_or_learning_disability,4.666667
24,other_sexual_orientation,3.333333
7,bisexual,3.333333
9,buddhist,2.666667
12,heterosexual,2.333333
21,other_gender,1.666667
20,other_disability,1.666667
13,hindu,1.333333
25,physical_disability,1.0


In [32]:
test = pd.read_csv('../data/test.csv')

In [33]:
test.head()

Unnamed: 0,id,comment_text
0,7000000,Jeff Sessions is another one of Trump's Orwell...
1,7000001,I actually inspected the infrastructure on Gra...
2,7000002,No it won't . That's just wishful thinking on ...
3,7000003,Instead of wringing our hands and nibbling the...
4,7000004,how many of you commenters have garbage piled ...
