In [1]:
import pandas as pd
import numpy as np
import pandas_profiling as ppf
import datetime
import string
from collections import Counter

In [55]:
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split,GridSearchCV,cross_val_score,KFold
from sklearn.metrics import log_loss
import xgboost as xgb

In [3]:
stopwords_set = set(stopwords.words('english'))
s = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')

def text_process(comment_column):
    filtered_rows = []
    for rows in comment_column:
        only_words = tokenizer.tokenize(rows)
        no_stopwords = [word for word in only_words if word.lower() not in stopwords_set]
        #stems = [s.stem(word) for word in no_stopwords]
        filtered_rows.append(' '.join(no_stopwords))
    return filtered_rows

def find_badwords(comment_column):
    filtered_rows = []
    for rows in comment_column:
        only_words = tokenizer.tokenize(rows)
        bad = [word for word in only_words if word.lower() in badwords_set]
        total_bad,=np.shape(bad)
        filtered_rows.append(total_bad)
    return filtered_rows

def get_tags(df):
    tags=[]
    alltags=[]
    rows = df.shape[0]
    for r in range(df.index.min(),df.index.max()+1):
        for f in df.columns:
            if df.loc[r,f] == 1:
                tags.append(str(f))
        alltags.append(tags)
        tags=[]
    return alltags

In [4]:
df_train_initial = pd.read_csv('train.csv.zip')
df_test_initial = pd.read_csv('test.csv.zip')
df_sub = pd.read_csv('sample_submission.csv.zip')
df_features=pd.concat([df_train_initial,df_test_initial]).reset_index(drop=True)
df_features['source'] = ''
df_features.loc[0:len(df_train_initial),['source']] = 'train'
df_features.loc[len(df_train_initial):,['source']] = 'test'

initialcols = list(df_train_initial.columns[df_train_initial.dtypes == 'int64'])

punct_set = set(string.punctuation)

badwords = pd.read_csv('badwords.txt',header=None)
badwords.rename(columns={0:'badwords'},inplace=True)
badwords['badwords'] = badwords['badwords'].str.lower()
badwords_set = set(badwords['badwords'].str.replace('*',''))

hatewords = pd.read_csv('hatewords.txt',header=None)
hatewords.rename(columns={0:'hatewords'},inplace=True)
hatewords['hatewords'] = hatewords['hatewords'].str.lower()
hatewords_set = set(hatewords['hatewords'].str.replace('*',''))

No nulls in data set

Indicate whether a row is 'clean' and how many flags it has

In [5]:
df_features['clean'] = df_features.loc[:,initialcols].sum(axis=1).apply(lambda x: 1 if x==0 else 0)
df_features['total_flags'] = df_features.loc[:,initialcols].sum(axis=1)
df_features['comment_length'] = df_features.comment_text.str.len()

In [6]:
word_count = []
comments_processed = text_process(df_features.comment_text)
word_count_proc = []
comment_len_proc = []
words_all_caps = []
punct_count = []
badword_count = []
badword_allcaps = []
most_common_word_count = []
mcw_is_badword = []
hateword_count = []

for row in df_features.comment_text:
    word_count.append(len(row.split()))
    punct_count.append(sum([1 for x in row if x in punct_set]))
    badword_count.append(sum([1 for x in row.lower().split() if x in badwords_set]))
    hateword_count.append(sum([1 for x in row.lower().split() if x in hatewords_set]))
    
for row in comments_processed:
    word_count_proc.append(len(row.split()))
    comment_len_proc.append(len(row))
    words_all_caps.append(np.sum([x.isupper() for x in row.split() if len(x)>1]))
    badword_allcaps.append(np.sum([x.isupper() for x in row.split() if (len(x)>1)&(x.lower() in badwords_set)]))
    q=Counter(tokenizer.tokenize(row))
    if len(q)>0:
        most_common_word_count.append(q.most_common(1)[0][1])
        mcw_is_badword.append(q.most_common(1)[0][0].lower() in badwords_set)
    else:
        most_common_word_count.append(0)
        mcw_is_badword.append(0)    

df_features['comment_word_all_caps_processed'] = words_all_caps
df_features['comment_word_count_processed'] = word_count_proc
df_features['comment_length_processed'] = comment_len_proc
df_features['comment_word_count'] = word_count
df_features['comment_punct_count'] = punct_count
df_features['comment_badword_count'] = badword_count
df_features['comment_badword_all_caps_count'] = badword_allcaps
df_features['comment_most_common_word_count'] = most_common_word_count
df_features['comment_mcw_is_badword'] = mcw_is_badword
df_features['comment_mcw_is_badword'] = df_features.comment_mcw_is_badword.astype(bool)
df_features['comment_hateword_count'] = hateword_count

In [7]:
df_features['mean_word_size_processed'] = df_features.comment_word_count_processed / df_features.comment_length_processed
df_features['mean_word_size'] = df_features.comment_word_count / df_features.comment_length
df_features['punct_pct'] = df_features.comment_punct_count / df_features.comment_length * 100
df_features['badword_pct'] = df_features.comment_badword_count / df_features.comment_word_count_processed * 100
df_features['hateword_pct'] = df_features.comment_hateword_count / df_features.comment_word_count_processed * 100
df_features['all_caps_pct'] = df_features.comment_word_all_caps_processed / df_features.comment_word_count_processed * 100
df_features['all_caps_pct_badwords'] = df_features.comment_badword_all_caps_count / df_features.comment_word_all_caps_processed * 100
df_features['mcw_pct'] = df_features.comment_most_common_word_count/df_features.comment_word_count_processed*100

In [8]:
df_train = df_features[df_features.source == 'train'].copy()
y = df_train_initial[initialcols]
df_train.drop(initialcols,inplace=True,axis=1)
df_train.drop(['comment_text','id','source'],inplace=True,axis=1)
df_train.fillna(0,inplace=True)
df_test = df_features[df_features.source == 'test'].copy()
df_test.fillna(0,inplace=True)
df_test.drop(initialcols,inplace=True,axis=1)
df_test.drop(['comment_text','id','source'],inplace=True,axis=1)

In [9]:
# param_test1 = {
#     'learning_rate': [i/10.0 for i in range(0,5)],
#     'gamma': [i/10.0 for i in range(0,5)],
#     'max_depth':range(3,10,2),
#     'min_child_weight':range(1,6,2)
# }
# {'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3}
# gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(n_estimators=100, subsample=0.8, colsample_bytree=0.8, 
#                                                   objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
#                                                   param_grid = param_test1, scoring='neg_log_loss', cv=5)
# gsearch1.fit(X_train,y_train)
# gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [17]:
pars= {'n_estimators':100, 'subsample':0.8, 'colsample_bytree':0.8,  
                  'objective': 'binary:logistic', 'nthread':4, 'scale_pos_weight':1, 'seed':27, 
                  'gamma':0.1, 'learning_rate':0.1, 'max_depth':3, 'min_child_weight': 3}
kfold = 5

model_preds = {}
model = {}
model_final = {}

In [27]:
def make_test_model(flag,pars,kfold,df_train,y):
    X_train, X_test, y_train, y_test = train_test_split(df_train, y[flag].values, test_size=0.33, random_state=42)

    model_preds[flag] = pd.DataFrame(y_test,columns=['true'],index=X_test.index)
    model_preds[flag][flag] = 0

    model[flag] = xgb.XGBClassifier()
    model[flag].set_params(**pars)
    
    skf = StratifiedKFold(n_splits=kfold, random_state=42)

    for train_index, test_index in skf.split(X_train, y_train):
        X_train_fold, X_valid = X_train.iloc[train_index,:], X_train.iloc[test_index,:]
        y_train_fold, y_valid = y_train[train_index], y_train[test_index]
        model[flag].fit(X_train_fold,y_train_fold)
        preds = model[flag].predict_proba(X_test)[:,1]
        model_preds[flag][flag] += preds/kfold

    print(flag,' model predictions:\n',log_loss(model_preds[flag]['true'],model_preds[flag][flag]))

def make_model(initialcols,pars,df_train,y,df_test):

    for flag in initialcols:
        model_final[flag] = xgb.XGBClassifier()
        model_final[flag].set_params(**pars)

        model_final[flag].fit(df_train,y[flag])
        preds = model_final[flag].predict_proba(df_test)[:,1]
        df_sub[flag] = preds

In [56]:
scaler = MinMaxScaler()
df_train_scaled = scaler.fit_transform(df_train)
df_train_scaled = pd.DataFrame(df_train_scaled,index=df_train.index,columns=df_train.columns)
scaler = MinMaxScaler()
df_test_scaled = scaler.fit_transform(df_test)
df_test_scaled = pd.DataFrame(df_test_scaled,index=df_test.index,columns=df_test.columns)

In [57]:
make_test_model('toxic',pars,kfold,df_train_scaled,y)
make_test_model('severe_toxic',pars,kfold,df_train_scaled,y)
make_test_model('threat',pars,kfold,df_train_scaled,y)
make_test_model('obscene',pars,kfold,df_train_scaled,y)
make_test_model('insult',pars,kfold,df_train_scaled,y)
make_test_model('identity_hate',pars,kfold,df_train_scaled,y)

toxic  model predictions:
 0.0196839104204
severe_toxic  model predictions:
 0.0133197676631
threat  model predictions:
 0.00935788848831
obscene  model predictions:
 0.0238796323996
insult  model predictions:
 0.0270975334964
identity_hate  model predictions:
 0.0170791516161


In [58]:
make_model(initialcols,pars,df_train_scaled,y,df_test_scaled)
df_sub.to_csv('df_sub_'+datetime.datetime.now().strftime('%Y%m%d%I%M')+'.csv',index=False)

In [65]:
q= df_sub[initialcols].apply(lambda x: np.around(x,decimals=6).value_counts())

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0.000124,,1.0,,,,
0.000129,,16439.0,,,,
0.000131,,2.0,,,,
0.000132,,255.0,,,,
0.000133,,1.0,,,,
0.000134,,190.0,,,,
0.000135,,743.0,,,,
0.000137,,317.0,,,,
0.000138,,4.0,,,,
0.000139,,133.0,,,,


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train, y['toxic'].values, test_size=0.33, random_state=42)

toxic_model_preds = pd.DataFrame(y_test,columns=['true'],index=X_test.index)
toxic_model_preds['toxic'] = 0

xgb.XGBClassifier(n_estimators=100, subsample=0.8, colsample_bytree=0.8,  
                  objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27, 
                  gamma=0.1, learning_rate=0.1, max_depth= 3, min_child_weight= 3)

kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=42)

for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_valid = X_train.iloc[train_index,:], X_train.iloc[test_index,:]
    y_train_fold, y_valid = y_train[train_index], y_train[test_index]
    toxic_model.fit(X_train_fold,y_train_fold)
    preds = toxic_model.predict_proba(X_test)[:,1]
    toxic_model_preds['toxic'] += preds/kfold

print('toxic_model_preds:\n',log_loss(toxic_model_preds['true'],toxic_model_preds['toxic']))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train, y['severe_toxic'].values, test_size=0.33, random_state=42)

severe_toxic_model = xgb.XGBClassifier()
severe_toxic_model.fit(X_train,y_train)
severe_toxic_model_preds = pd.DataFrame([y_test,severe_toxic_model.predict_proba(X_test)[:,1]]).T.set_index(X_test.index)
severe_toxic_model_preds.columns=['true','severe_toxic']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train, y['threat'].values, test_size=0.33, random_state=42)

threat_model = xgb.XGBClassifier()
threat_model.fit(X_train,y_train)
threat_model_preds = pd.DataFrame([y_test,threat_model.predict_proba(X_test)[:,1]]).T.set_index(X_test.index)
threat_model_preds.columns = ['true','threat']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train, y['obscene'].values, test_size=0.33, random_state=42)

obscene_model = xgb.XGBClassifier()
obscene_model.fit(X_train,y_train)
obscene_model_preds = pd.DataFrame([y_test,obscene_model.predict_proba(X_test)[:,1]]).T.set_index(X_test.index)
obscene_model_preds.columns=['true','obscene']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train, y['insult'].values, test_size=0.33, random_state=42)

insult_model = xgb.XGBClassifier()
insult_model.fit(X_train,y_train)
insult_model_preds = pd.DataFrame([y_test,insult_model.predict_proba(X_test)[:,1]]).T.set_index(X_test.index)
insult_model_preds = ['true','insult']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train, y['identity_hate'].values, test_size=0.33, random_state=42)

identity_hate_model = xgb.XGBClassifier()
identity_hate_model.fit(X_train,y_train)
identity_hate_model_preds = pd.DataFrame([y_test,identity_hate_model.predict_proba(X_test)[:,1]]).T.set_index(X_test.index)
identity_hate_model_preds.columns=['true','identity_hate']


In [None]:
print('toxic_model_preds:\n',log_loss(toxic_model_preds['true'],toxic_model_preds['toxic']))
print('severe_toxic_model_preds:\n',log_loss(severe_toxic_model_preds['true'],severe_toxic_model_preds['severe_toxic']))
print('threat_model_preds:\n',log_loss(threat_model_preds['true'],threat_model_preds['threat']))
print('obscene_model_preds:\n',log_loss(obscene_model_preds['true'],obscene_model_preds['obscene']))
print('insult_model_preds:\n',log_loss(insult_model_preds['true'],insult_model_preds['insult']))
print('identity_hate_model_preds:\n',log_loss(identity_hate_model_preds['true'],identity_hate_model_preds['identity_hate']))

In [None]:
a = text_process(df_train_initial.comment_text[0:50000])
b = text_process(df_train_initial.comment_text[50001:100000])
aa = get_tags(df_train_initial[initialcols][0:50000])
bb = get_tags(df_train_initial[initialcols][50001:100000])

In [None]:
# Train and test data. 
train_data = a
train_labels = aa

test_data = b
test_labels = bb

# Representation of the data using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000, sublinear_tf=1,
            strip_accents='unicode', analyzer='word',ngram_range=(1,1))
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

# Change the representation of our data as a list of bit lists 
mlb = MultiLabelBinarizer()
binary_train_labels = mlb.fit_transform(train_labels)
binary_test_labels = mlb.transform(test_labels)

print('Binary train labels:\n', binary_train_labels)

# One classifer built per category using a one vs the rest approach
classifier = OneVsRestClassifier(GradientBoostingClassifier())
classifier.fit(vectorised_train_data, binary_train_labels)

#Predict
predictions = classifier.predict_proba(vectorised_test_data)


print('Predictions:\n', predictions)

In [None]:
# precision = precision_score(binary_test_labels, predictions)
# recall = recall_score(binary_test_labels, predictions, average='micro')
# f1 = f1_score(binary_test_labels, predictions, average='micro')
# print("Micro-average quality numbers")
# print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision,
#                                                                      recall,
#                                                                      f1))

precision = precision_score(binary_test_labels, predictions, average='macro')
recall = recall_score(binary_test_labels, predictions, average='macro')
f1 = f1_score(binary_test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision,
                                                                     recall,
                                                                     f1))

In [None]:
# Train and test data. 
train_data = a
train_labels = aa

test_data = b
test_labels = bb

# Representation of the data using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000, sublinear_tf=1,
            strip_accents='unicode', analyzer='word',ngram_range=(1,1))
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

# Change the representation of our data as a list of bit lists 
mlb = MultiLabelBinarizer()
binary_train_labels = mlb.fit_transform(train_labels)
binary_test_labels = mlb.transform(test_labels)

print('Binary train labels:\n', binary_train_labels)

# One classifer built per category using a one vs the rest approach
classifier = OneVsRestClassifier(LinearSVC())
classifier.fit(vectorised_train_data, binary_train_labels)

#Predict
predictions = classifier.predict(vectorised_test_data)


print('Predictions:\n', predictions)
print()

print('Predictions inverse:\n', mlb.inverse_transform(predictions))

In [None]:
precision = precision_score(binary_test_labels, predictions, average='micro')
recall = recall_score(binary_test_labels, predictions, average='micro')
f1 = f1_score(binary_test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision,
                                                                     recall,
                                                                     f1))

precision = precision_score(binary_test_labels, predictions, average='macro')
recall = recall_score(binary_test_labels, predictions, average='macro')
f1 = f1_score(binary_test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision,
                                                                     recall,
                                                                     f1))

In [None]:
pd.DataFrame(predictions,columns=mlb.classes_+'pred').merge(df_train[50001:100000].reset_index(),
                                                            left_index=True,right_index=True)\
                                                     .sort_values(by='total_flags',ascending=False)