In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from string import punctuation
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc

In [2]:
# Profanity lexicon: the first line of the file is skipped, and the first
# column of the remaining rows is collected into a set.
profane = set(
    pd.read_csv('../data/download/profane.txt', skiprows=1, header=None)[0].values
)
# Presumably the names of the target label columns -- confirm against train.csv.
models = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

def tags(x):
    """Part-of-speech tag a single comment.

    The text is normalised first: newlines become spaces, everything is
    lower-cased and all characters in ``string.punctuation`` are removed.
    The remaining text is tokenised with ``nltk.word_tokenize`` and tagged
    with ``nltk.pos_tag``.

    Parameters
    ----------
    x : str
        Raw comment text. Any non-string value (for example the float NaN
        pandas uses for a missing comment, or None) is treated as an empty
        comment instead of raising ``AttributeError``.

    Returns
    -------
    list of (token, tag) tuples
        Empty when there is nothing to tag.
    """
    # Series.map passes missing values straight through to the function, so
    # guard here rather than relying on every caller to fill them first.
    if not isinstance(x, str):
        return []

    table = str.maketrans('', '', punctuation)
    cleaned = x.replace('\n', ' ').lower().translate(table)

    # Nothing but whitespace left: tokenising would give no tokens, so skip
    # the tagger entirely.
    if not cleaned.strip():
        return []

    return nltk.pos_tag(nltk.word_tokenize(cleaned))

In [3]:
# Load the training set.
train_data = pd.read_csv('../data/download/train.csv')
# .copy() gives the feature frame its own data: feature columns are added to
# it later, and writing into a slice of train_data triggers pandas'
# SettingWithCopyWarning.
train_feats = train_data[['id']].copy()
# Every column from position 2 onward is kept as the label frame.
train_labels = train_data.iloc[:, 2:]
# From here on train_data is only the comment text Series.
train_data = train_data['comment_text']
# POS-tag every comment once; the tag-based features reuse this result.
train_tags = train_data.map(tags)

In [4]:
# Load the test set.
test_data = pd.read_csv('../data/download/test.csv')
# .copy() gives the feature frame its own data: feature columns are added to
# it later, and writing into a slice of test_data triggers pandas'
# SettingWithCopyWarning.
test_feats = test_data[['id']].copy()
# From here on test_data is only the comment text Series.
test_data = test_data['comment_text']
# Missing comments are replaced by the literal string 'nan' before tagging,
# the same substitution the text-level test features use, so that a NaN
# never reaches the string operations inside tags().
test_tags = test_data.fillna('nan').map(tags)

In [5]:
def count_profane(string, words=None):
    """Ratio of matched lexicon entries to whitespace-separated words.

    An entry counts once if it occurs anywhere in the lower-cased text as a
    substring, so this counts distinct lexicon entries found, not the number
    of occurrences, and an entry can match inside a longer word.

    Parameters
    ----------
    string : str
        Raw comment text.
    words : iterable of str, optional
        Lexicon to match against. Defaults to the notebook-level ``profane``
        set, which keeps ``Series.map(count_profane)`` working unchanged.

    Returns
    -------
    float
        Matched entries divided by the word count; the denominator is at
        least 1, so empty text gives 0.0.
    """
    lexicon = profane if words is None else words
    # Lower-case once instead of once per lexicon entry.
    lowered = string.lower()
    hits = sum(1 for word in lexicon if word in lowered)
    return hits / max(len(string.split()), 1)

def count_capital(string):
    """Return the fraction of characters in ``string`` that are uppercase.

    The denominator is the character count, floored at 1 so that an empty
    string yields 0.0 instead of dividing by zero.
    """
    n_upper = sum(1 for ch in string if ch.isupper())
    return n_upper / max(len(string), 1)

def count_punct(string):
    """Return how many characters of ``string`` are in ``string.punctuation``.

    Newlines are stripped before counting. Unlike the other ``count_*``
    helpers this returns a raw count, not a ratio.
    """
    without_newlines = string.replace('\n', '')
    return sum(1 for ch in without_newlines if ch in punctuation)

def _tag_ratio(text, marker):
    """Fraction of tagged tokens whose POS tag contains ``marker``.

    ``text`` is a sequence of (token, tag) pairs. Matching is by substring,
    so a marker also matches every longer tag that contains it. The
    denominator is floored at 1, so an empty sequence gives 0.
    """
    return np.sum([1 if marker in pair[1] else 0 for pair in text]) / max(len(text), 1)


def count_pronoun(text):
    """Share of tokens whose tag contains 'PRP'."""
    return _tag_ratio(text, 'PRP')


def count_verb(text):
    """Share of tokens whose tag contains 'VB'."""
    return _tag_ratio(text, 'VB')


def count_adj(text):
    """Share of tokens whose tag contains 'JJ'."""
    return _tag_ratio(text, 'JJ')


def count_det(text):
    """Share of tokens whose tag contains 'DT'."""
    return _tag_ratio(text, 'DT')

In [6]:
# Text-level features are computed on the raw comment text. Missing comments
# are replaced with the literal string 'nan' first, exactly as is done for
# the test set, so both splits go through identical preprocessing.
train_text = train_data.fillna('nan')
for feat_name, feat_fn in [('profane', count_profane),
                           ('capital', count_capital),
                           ('punct', count_punct)]:
    train_feats[feat_name] = train_text.map(feat_fn)

# Part-of-speech ratios are computed on the pre-tagged comments.
for feat_name, feat_fn in [('pronoun', count_pronoun),
                           ('verb', count_verb),
                           ('adject', count_adj),
                           ('deter', count_det)]:
    train_feats[feat_name] = train_tags.map(feat_fn)

In [7]:
# Text-level features: each extractor runs over the raw comment text, with
# missing comments replaced by the literal string 'nan'.
for feat_name, feat_fn in [('profane', count_profane),
                           ('capital', count_capital),
                           ('punct', count_punct)]:
    test_feats[feat_name] = test_data.fillna('nan').map(feat_fn)

# Part-of-speech ratios: each extractor runs over the pre-tagged comments.
for feat_name, feat_fn in [('pronoun', count_pronoun),
                           ('verb', count_verb),
                           ('adject', count_adj),
                           ('deter', count_det)]:
    test_feats[feat_name] = test_tags.map(feat_fn)

In [8]:
# Persist both feature tables as CSV, without the index column.
for frame, csv_path in ((train_feats, '../data/download/train_feats.csv'),
                        (test_feats, '../data/download/test_feats.csv')):
    frame.to_csv(csv_path, index=False)

### Scale features (standardize using training-set statistics)

In [9]:
# Reload both feature tables from the CSV files on disk.
train_feats, test_feats = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/download/train_feats.csv',
                     '../data/download/test_feats.csv')
)

In [10]:
scaler = StandardScaler()
# Every column after the leading `id` column is a numeric feature.
feature_cols = train_feats.columns[1:]
# Assign by column label instead of `.iloc[:, 1:] = ...`: label assignment
# replaces the columns, so the integer `punct` column can become float
# without pandas' incompatible-dtype setitem warning. Using the training
# column list for both frames also guarantees the same feature order.
# The scaler is fitted on the training features only; the test features are
# transformed with those training statistics.
train_feats[feature_cols] = scaler.fit_transform(train_feats[feature_cols])
test_feats[feature_cols] = scaler.transform(test_feats[feature_cols])

In [11]:
# Preview the first ten rows of the scaled training features.
train_feats.head(10)

Unnamed: 0,id,profane,capital,punct,pronoun,verb,adject,deter
0,0000997932d777bf,-0.385824,0.137572,-0.160586,-0.381282,-0.478912,0.016625,-0.010667
1,000103f0d9cfb60f,-0.100636,0.213346,-0.106015,-0.178688,-0.313195,-0.481361,-0.591071
2,000113f07ec002fd,-0.381384,-0.371132,-0.269727,0.420521,-0.143533,-0.644511,-0.377151
3,0001b41b1c6bb37e,-0.501336,-0.365558,0.139553,-0.24085,-0.005935,-0.135604,0.279002
4,0001d958c54c6e35,0.04449,-0.234513,-0.297013,2.650712,0.344574,-1.337898,-0.283904
5,00025465d4725e87,-0.572293,-0.390336,-0.324298,0.349286,-2.450943,-1.337898,-0.046547
6,0002bcb3da6cb337,6.44361,8.501863,-0.43344,2.967158,-0.936705,-1.337898,-1.589367
7,00031b1e95af7921,-0.572293,-0.181389,-0.324298,1.321639,1.183228,-1.337898,-0.740816
8,00037261f536c51d,-0.282479,-0.396304,0.084983,0.439403,-0.261683,-0.636157,0.250864
9,00040093b2687caa,0.095888,-0.248293,-0.43344,-1.14664,-1.441451,-0.124471,2.653388


In [12]:
# Preview the first ten rows of the scaled test features.
test_feats.head(10)

Unnamed: 0,id,profane,capital,punct,pronoun,verb,adject,deter
0,00001cee341fdb12,1.209524,-0.438651,-0.106015,0.681715,0.409285,0.886718,-1.353658
1,0000247867823ef7,0.095888,0.951968,-0.269727,0.498879,1.183228,1.574327,0.107735
2,00013b17ad220c46,-0.572293,0.241842,-0.215157,-1.14664,-0.431959,-1.337898,-1.589367
3,00017563c3f7919a,-0.572293,-0.345875,-0.269727,0.152454,-0.219434,-0.571523,1.090268
4,00017695ad8997eb,-0.572293,-0.293331,-0.378869,-1.14664,-0.720385,0.742262,0.835064
5,0001ea8717f6de06,-0.071157,-0.331644,-0.378869,0.910259,-0.179586,-0.427828,-1.589367
6,00024115d4cbde0f,-0.295804,-0.250042,-0.269727,0.028731,1.442812,-0.817858,-0.983259
7,000247e83dcc1211,2.100432,-0.21944,-0.378869,-1.14664,-0.431959,3.51581,1.239136
8,00025358d4737918,-0.251566,0.257628,0.22141,-0.64291,-0.596774,0.445097,0.488717
9,00026d1092fe71cc,-0.572293,-0.21944,-0.106015,1.451548,0.41814,-0.188336,-0.249549


In [13]:
# Summary statistics of the scaled training features, rounded to 4 decimals
# and transposed so that each feature is one row.
train_feats.describe().round(4).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
profane,159571.0,0.0,1.0,-0.5723,-0.4135,-0.2237,0.0445,23.4822
capital,159571.0,-0.0,1.0,-0.5561,-0.333,-0.2141,-0.0301,10.196
punct,159571.0,-0.0,1.0,-0.4334,-0.3243,-0.2152,0.0304,134.4109
pronoun,159571.0,-0.0,1.0,-1.1466,-0.7353,-0.134,0.4989,8.7265
verb,159571.0,0.0,1.0,-2.4509,-0.5041,-0.0282,0.5157,9.663
adject,159571.0,-0.0,1.0,-1.3379,-0.5847,-0.0826,0.4378,13.2232
deter,159571.0,0.0,1.0,-1.5894,-0.6103,0.0269,0.5989,9.7246


In [14]:
# Summary statistics of the test features after applying the scaler fitted on
# the training set, rounded to 4 decimals and transposed (one row per feature).
test_feats.describe().round(4).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
profane,153164.0,0.2862,1.4747,-0.5723,-0.3814,-0.1503,0.3186,31.5004
capital,153164.0,0.063,1.2433,-0.5561,-0.3528,-0.2296,-0.0348,10.2155
punct,153164.0,0.0618,1.3605,-0.4334,-0.297,-0.1606,0.085,135.0385
pronoun,153164.0,-0.0797,1.0651,-1.1466,-1.1466,-0.2572,0.4458,9.8235
verb,153164.0,-0.0953,1.1109,-2.4509,-0.6198,-0.0559,0.4858,9.663
adject,153164.0,0.0725,1.1523,-1.3379,-0.6098,-0.0436,0.541,13.2232
deter,153164.0,-0.0034,1.1186,-1.5894,-0.7547,0.038,0.6598,15.3817


In [15]:
# Write the scaled feature tables back to disk, without the index column.
# NOTE(review): these are the same paths the unscaled features were written
# to earlier in the notebook, so the unscaled files are overwritten here.
for frame, csv_path in ((train_feats, '../data/download/train_feats.csv'),
                        (test_feats, '../data/download/test_feats.csv')):
    frame.to_csv(csv_path, index=False)