In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# Shell out to `ls` to show which data files are available under ../input.
# check_output returns bytes, so the result is decoded before printing.
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

sample_submission.csv
test.csv
train.csv



In [2]:
# Load the training set. Later cells read its 'comment_text' column and the
# per-target label columns (which are compared against 0 and 1 when subsampling).
train = pd.read_csv('../input/train.csv')

In [3]:
def subsample_data(train, target_column, neg_ratio=2, random_state=None):
    """Build a shuffled subset of ``train`` for a single target column.

    Every row whose ``target_column`` equals 1 is kept. Of the rows whose
    ``target_column`` equals 0, only the first ``neg_ratio * n_positives``
    (in the frame's existing order) are kept. The two groups are concatenated,
    shuffled, and re-indexed from 0.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a 'comment_text' column and ``target_column``.
    target_column : str
        Name of the label column to subsample on.
    neg_ratio : int, optional
        How many negative rows to keep per positive row (default 2).
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` for the shuffle. The default
        ``None`` leaves the shuffle unseeded; pass an int to make repeated
        runs (and therefore the downstream train/test split) reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``['comment_text', target_column]`` with a fresh RangeIndex.
    """
    columns_roi = ['comment_text', target_column]

    labels = train[target_column]
    positives = train.loc[labels == 1, columns_roi]
    pos_size = positives.shape[0]
    print('pos_size:', pos_size)

    # Positional slice: take the leading negatives, not a random draw.
    neg_size = int(neg_ratio * pos_size)
    negatives = train.loc[labels == 0, columns_roi].iloc[:neg_size]

    dataset = pd.concat([positives, negatives])
    # frac=1 shuffles all rows; drop the old index so it does not leak row order.
    return dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [9]:
def build_svm(x_traincv, y_train):
    """Fit and return a linear-kernel ``SVC`` on the given training data.

    Parameters
    ----------
    x_traincv : feature matrix passed straight to ``SVC.fit``.
    y_train : labels passed straight to ``SVC.fit``.

    Returns
    -------
    The fitted ``SVC`` instance. Progress messages are printed before and
    after fitting.
    """
    from sklearn.svm import SVC

    svm_model = SVC(kernel='linear')

    print('Model fit start')
    svm_model.fit(x_traincv, y_train)
    print('Model fit complete')

    return svm_model


def build_lr(x_traincv, y_train):
    """Fit and return an L1-penalised logistic regression classifier.

    Parameters
    ----------
    x_traincv : feature matrix passed straight to ``LogisticRegression.fit``.
    y_train : labels passed straight to ``LogisticRegression.fit``.

    Returns
    -------
    The fitted ``LogisticRegression`` instance. Progress messages are printed
    before and after fitting.
    """
    from sklearn.linear_model import LogisticRegression

    # The solver is pinned explicitly: newer scikit-learn defaults to 'lbfgs',
    # which does not support the L1 penalty. 'liblinear' is the solver that
    # the original run used (see the estimator repr in the notebook output).
    clf_lr = LogisticRegression(penalty='l1', solver='liblinear', verbose=1)
    print('Model fit start')
    clf_lr.fit(x_traincv, y_train)
    print('Model fit complete')
    return clf_lr


def prediction_and_metrics(clf_svc, x_testcv, y_test):
    """Predict on the held-out features and print evaluation metrics.

    Parameters
    ----------
    clf_svc : fitted classifier exposing ``predict`` (despite the name, any
        such model works; ``pipeline`` passes a logistic regression).
    x_testcv : feature matrix to predict on.
    y_test : true labels; converted to a NumPy array before scoring.

    Prints the confusion matrix followed by the
    ``precision_recall_fscore_support`` result. Returns nothing.
    """
    from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

    y_pred = clf_svc.predict(x_testcv)
    y_true = np.array(y_test)

    print('\nConfusion matrix:\n', confusion_matrix(y_true, y_pred))
    print('\nprecision_recall_fscore_support:\n',
          precision_recall_fscore_support(y_true, y_pred))


def find_most_important_features(cv, model, n=50):
    """Print the ``n`` most negative and ``n`` most positive coefficients.

    Only meaningful for models that expose a per-feature ``coef_`` (linear
    models / linear kernels); the first row of ``coef_`` is used.

    Parameters
    ----------
    cv : fitted vectoriser exposing ``get_feature_names_out()`` or the legacy
        ``get_feature_names()``.
    model : fitted estimator with a ``coef_`` attribute.
    n : int, optional
        Number of features to list from each end of the ranking (default 50).

    Each printed line pairs the i-th most negative coefficient and its feature
    with the i-th most positive coefficient and its feature.
    """
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    coefs = model.coef_[0]
    # A sparse coefficient row would not iterate element-wise; densify it.
    if hasattr(coefs, 'toarray'):
        coefs = coefs.toarray().ravel()

    # Ascending sort: most negative first, most positive last.
    coefs_with_fns = sorted(zip(coefs, feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])

    print('\nMost informative features\n')
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print(coef_1, fn_1, '   ', coef_2, fn_2)

def vectorise(x_train, x_test):
    """Fit a TF-IDF vectoriser on the training text and transform both splits.

    The vectoriser uses word 1- to 3-grams, drops terms seen in fewer than 3
    documents or in more than 70% of documents, lower-cases input, and applies
    sublinear term-frequency scaling.

    Parameters
    ----------
    x_train : iterable of training documents (used for fitting).
    x_test : iterable of test documents (transformed only, never fitted on).

    Returns
    -------
    tuple
        ``(cv, x_traincv, x_testcv)``: the fitted vectoriser and the two
        transformed feature matrices.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    # tfidf vectoriser; sublinear_tf is passed as a real bool because newer
    # scikit-learn validates the parameter type and rejects the integer 1.
    cv = TfidfVectorizer(min_df=3, ngram_range=(1, 3), max_df=0.7,
                         sublinear_tf=True, lowercase=True)

    # feature generation
    x_traincv = cv.fit_transform(x_train)
    # Column count of the fitted matrix is the vocabulary size; this avoids
    # get_feature_names(), which no longer exists in newer scikit-learn.
    print('#features:', x_traincv.shape[1])

    x_testcv = cv.transform(x_test)
    return cv, x_traincv, x_testcv

    


def pipeline(train, target_column='insult'):
    """Train and evaluate a logistic-regression text classifier for one target.

    Steps: subsample ``train`` for ``target_column``, split 80/20 into
    train/test, TF-IDF vectorise the comment text, fit the logistic regression,
    print test metrics, and print the most informative features.

    Parameters
    ----------
    train : pandas.DataFrame
        Full training frame; must contain 'comment_text' and ``target_column``.
    target_column : str, optional
        Label column to model (default 'insult').

    Returns
    -------
    tuple
        ``(cv, clf)``: the fitted vectoriser and the fitted classifier.
    """
    # sklearn.cross_validation was removed from scikit-learn; the function now
    # lives in sklearn.model_selection. Fall back only for very old installs.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    # data needs to be sub sampled here..
    dataset = subsample_data(train, target_column)

    # test train split (fixed random_state; the subsample shuffle upstream is
    # what still varies between runs)
    df_x = dataset["comment_text"]
    df_y = dataset[target_column]
    x_train, x_test, y_train, y_test = train_test_split(
        df_x, df_y, test_size=0.2, random_state=4)

    # The vectoriser is fitted on the training split only.
    cv, x_traincv, x_testcv = vectorise(x_train, x_test)

    clf = build_lr(x_traincv, y_train)
    prediction_and_metrics(clf, x_testcv, y_test)

    find_most_important_features(cv, model=clf)
    return cv, clf

In [10]:
# Trial run on a single target. The returned (vectoriser, model) tuple is not
# assigned, so the notebook echoes its repr after the printed metrics.
pipeline(train, target_column='insult')

pos_size: 4765
#features: 55662
Model fit start
[LibLinear]Model fit complete

Confusion matrix:
 [[1853   60]
 [ 172  774]]

precision_recall_fscore_support:
 (array([ 0.91506173,  0.92805755]), array([ 0.96863565,  0.81818182]), array([ 0.94108685,  0.86966292]), array([1913,  946]))

Most informative features

-9.05117567754 talk     26.8958810262 idiot
-7.00201413029 the     24.9695578311 fuck
-6.79547011414 thank you     21.8726308047 fucking
-5.23915413215 could     21.5762213494 bitch
-4.92636132616 please     21.2592102168 asshole
-4.74055806756 there     20.484091681 stupid
-4.6494764054 to     19.0059033172 ass
-4.35654123901 may     15.9232996142 cunt
-3.9197902382 for     15.4164276029 you
-3.75467503326 as     15.276919276 moron
-3.69422151508 article     14.4011858804 shit
-3.68505742499 wp     13.3884485642 dick
-3.5691003892 not     13.1794694521 idiots
-3.51929883118 thanks     13.0457464147 suck
-3.0693275372 welcome     12.9916970903 faggot
-2.97949151716 if     12.8

(TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.7, max_features=None, min_df=3,
         ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
         stop_words=None, strip_accents=None, sublinear_tf=1,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
         vocabulary=None),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=1, warm_start=False))

In [11]:
# Label columns to model; one vectoriser + classifier pair is trained per entry.
my_targets = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']

# sklearn.externals.joblib was removed from newer scikit-learn releases; use
# the standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

for target in my_targets:
    print('\nTarget:', target)

    cv, model = pipeline(train, target_column=target)

    # Artifacts are written to the current working directory.
    cv_name = 'cv_jan2_' + target + '.pkl'
    model_name = 'clf_lr_jan2_' + target + '.pkl'
    print('Saving:', cv_name, model_name)

    joblib.dump(model, model_name)
    joblib.dump(cv, cv_name)

    print('==================\n')


Target: toxic
pos_size: 9237
#features: 108085
Model fit start
[LibLinear]Model fit complete

Confusion matrix:
 [[3510  142]
 [ 445 1446]]

precision_recall_fscore_support:
 (array([ 0.8874842 ,  0.91057935]), array([ 0.9611172 ,  0.76467478]), array([ 0.92283423,  0.83127335]), array([3652, 1891]))

Most informative features

-8.9775624768 thank you     48.6054685006 fuck
-7.9389860667 thanks     40.3307070214 fucking
-7.91052585192 talk     33.3733207919 shit
-7.85264067331 please     27.0930046668 stupid
-6.55771050249 the     25.750941718 idiot
-5.92524295423 may     24.5386966149 bullshit
-5.64004323839 utc     23.7854593036 ass
-5.50883688756 to     22.3005101449 asshole
-5.26222037244 for     21.6083020584 crap
-5.25533957422 article     19.6785430061 bitch
-5.16349512494 welcome     18.7737015996 suck
-4.83299403288 wp     17.735882746 dick
-4.79485289582 at     17.3884587901 hell
-4.69246642106 source     16.9930414246 pathetic
-4.55151333736 sources     16.372260103 idiots




Target: insult
pos_size: 4765
#features: 55798
Model fit start
[LibLinear]Model fit complete

Confusion matrix:
 [[1891   68]
 [ 161  739]]

precision_recall_fscore_support:
 (array([ 0.92153996,  0.9157373 ]), array([ 0.96528841,  0.82111111]), array([ 0.94290701,  0.86584651]), array([1959,  900]))

Most informative features

-10.6347302505 talk     27.5707381305 idiot
-6.85749269842 the     25.6100398262 fuck
-6.01186962921 there     24.473494262 fucking
-5.72567274711 thank you     21.510492141 bitch
-5.61507477531 as     20.6917873287 stupid
-5.39074575331 article     20.1752461898 asshole
-5.0928386626 to     18.5687766482 ass
-4.61919016553 thanks     14.489517203 idiots
-4.43347470707 could     14.4427404045 moron
-3.99877454685 please     14.2594739741 suck
-3.5917145286 not     14.1742084568 you
-3.40941843008 if you     13.4805590304 shit
-3.11088772349 see     13.2989480906 faggot
-3.06765103621 also     13.0829509334 cunt
-2.84640093532 been     12.4544731039 dick
-2.827