# 4. Clasificación por Deep Learning

###### En este notebook, se procederá al uso de algoritmos esclusivamente de Deep Learning, mediante los datos que han sido previamente limpiados por "Clean Words". En este encontraremos en primer lugar, la utilización de Cross Validation entre los datos de entrenamiento, y posteriormente se realizará una prediccion sobre los datos de Test. Para poder ser evaluados por Kaggle.  

In [1]:
import pandas as pd
import numpy as np
import re
import time
import ast
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

from gensim import corpora, models, similarities

# Linear Models 
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB

from tqdm import tqdm

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Flatten, LSTM, Input, RNN
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import EarlyStopping



Using TensorFlow backend.


In [2]:
nameTrainCSV = 'trainWithListOfCleanWords'
nameTestCSV = 'testWithListOfCleanWords'

train = pd.read_csv('../data/processed/' + nameTrainCSV + '.csv', encoding='utf-8')
train['BagOfWords'] = dict
train.head(10)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,listOfCleanWords,cleanWordsAsText,BagOfWords
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"['explanation', 'edit', 'make', 'username', 'h...",explanation edit make username hardcore metall...,<class 'dict'>
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"['aww', 'match', 'background', 'colour', 'seem...",aww match background colour seemingly stick th...,<class 'dict'>
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"['hey', 'man', 'really', 'try', 'edit', 'war',...",hey man really try edit war guy constantly rem...,<class 'dict'>
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"['make', 'real', 'suggestions', 'improvement',...",make real suggestions improvement wonder secti...,<class 'dict'>
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"['sir', 'hero', 'chance', 'remember', 'page']",sir hero chance remember page,<class 'dict'>
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0,"['congratulations', 'well', 'use', 'tool', 'we...",congratulations well use tool well talk,<class 'dict'>
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,"['cocksucker', 'piss', 'around', 'work']",cocksucker piss around work,<class 'dict'>
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,"['vandalism', 'matt', 'shirvington', 'article'...",vandalism matt shirvington article revert plea...,<class 'dict'>
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,"['sorry', 'word', 'nonsense', 'offensive', 'an...",sorry word nonsense offensive anyway intend wr...,<class 'dict'>
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,"['alignment', 'subject', 'contrary', 'dulithgow']",alignment subject contrary dulithgow,<class 'dict'>


In [3]:
start = time.time()
for x in tqdm(range(len(train))):
    train.set_value(col='listOfCleanWords',
                index=x,
                value=ast.literal_eval(train["listOfCleanWords"][x]))
    train.set_value(col='cleanWordsAsText',
                index=x,
                value=str(train["cleanWordsAsText"][x]))
    train.set_value(col='BagOfWords',
                index=x,
                value=Counter(train["listOfCleanWords"][x]))
end = time.time()
print(end - start)

100%|██████████| 159571/159571 [00:19<00:00, 8243.06it/s]


19.36321449279785


In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,listOfCleanWords,cleanWordsAsText,BagOfWords
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[explanation, edit, make, username, hardcore, ...",explanation edit make username hardcore metall...,"{'explanation': 1, 'edit': 1, 'make': 1, 'user..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[aww, match, background, colour, seemingly, st...",aww match background colour seemingly stick th...,"{'aww': 1, 'match': 1, 'background': 1, 'colou..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[hey, man, really, try, edit, war, guy, consta...",hey man really try edit war guy constantly rem...,"{'hey': 1, 'man': 1, 'really': 1, 'try': 1, 'e..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"[make, real, suggestions, improvement, wonder,...",make real suggestions improvement wonder secti...,"{'make': 1, 'real': 1, 'suggestions': 1, 'impr..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[sir, hero, chance, remember, page]",sir hero chance remember page,"{'sir': 1, 'hero': 1, 'chance': 1, 'remember':..."


In [5]:
# MULTICLASS PREDICTION

In [6]:
classLabel = {
    0: "neutral",
    1: "toxic",
    2 : "severe_toxic",
    3 : "obscene",
    4 : "threat",
    5 : "insult",
    6 : "identity_hate" 
}


In [7]:
y = np.empty((len(train['cleanWordsAsText']),),dtype=object)
allTextToxicTrain = dict()
for idx in classLabel:
    if classLabel[idx] != "neutral":
        T = np.where(train[classLabel[idx]] == 1)[0]
        allTextToxicTrain[idx] = T
        for i in T:
            if y[i] is None:
                y[i] = [idx]                
            else:
                y[i].append(idx)
indxsOfNeutralTexts = np.where(y == None) 
y[indxsOfNeutralTexts] = [[0]]
indxsOfNeutralTexts = indxsOfNeutralTexts[0]

allTextsNoToxicTrain = [str(train['cleanWordsAsText'][x]) for x in indxsOfNeutralTexts]

idxList = []
for i in allTextToxicTrain.keys():
    #allTextToxicTrain[i] = [str(train['cleanWordsAsText'][j]) for j in allTextToxicTrain[i]]
    idxList = np.unique(np.append(idxList, allTextToxicTrain[i]))
allTextToxicTrain = [str(train['cleanWordsAsText'][j]) for j in idxList]

In [8]:
# Test clasification
test = pd.read_csv('../data/processed/' + nameTestCSV + '.csv', encoding='utf-8')
test.head()


Unnamed: 0,id,comment_text,listOfCleanWords,cleanWordsAsText
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,"['yo', 'bitch', 'ja', 'rule', 'succesful', 'ev...",yo bitch ja rule succesful ever whats hat sad ...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,"['rfc', 'title', 'fine', 'imo']",rfc title fine imo
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...","['source', 'zawe', 'ashton', 'lapland']",source zawe ashton lapland
3,00017563c3f7919a,":If you have a look back at the source, the in...","['look', 'back', 'source', 'information', 'upd...",look back source information update correct fo...
4,00017695ad8997eb,I don't anonymously edit articles at all.,"['anonymously', 'edit', 'article']",anonymously edit article


> Se inicializan las variables de X_train y X_test + Y_train completos a partir de los textos ya limpios, ademas de obtener todos los textos en forma de lista.

In [9]:
allTrainText = [txt if txt is not np.nan else '' for txt in train['cleanWordsAsText']]
allTestText = [txt if txt is not np.nan else '' for txt in test['cleanWordsAsText']]
X_train = allTrainText
X_test = allTestText
yBinary = MultiLabelBinarizer().fit_transform(y)
y_train = yBinary


In [10]:
columns = ["idExp","numFeatures", "algorithm", "Nfolds", "accuaracy", "logloss", "fmeasure"]
dfTestResults = pd.DataFrame(columns=columns)

> Cantidad features a utilizar 

In [11]:
maxFeatures = 100000

## Word Embedings
> En este apartado se diseñan 3 formas de representación de las palabras de los textos, utilizando directamente los textos ya limpiados previamente. 

### Mediante Tokenización de las palabras

In [12]:
# Tokenize

tokenizer = Tokenizer(num_words=maxFeatures)
tokenizer.fit_on_texts(list(allTrainText))
X_train_tokenized_seq = tokenizer.texts_to_sequences(allTrainText)
X_test_tokenized_seq = tokenizer.texts_to_sequences(allTestText)


In [13]:
len(tokenizer.word_index)

156860

In [14]:
X_train_seq = pad_sequences(X_train_tokenized_seq)
X_test_seq = pad_sequences(X_test_tokenized_seq, maxlen=len(X_train_seq[0]))

## Selección de features a utilizar

In [15]:
X_train = X_train_seq
X_test = X_test_seq

## Creación del modelo CNN

In [16]:
# MODEL CNN
numClases = 7

#training params
batch_size = 512 
num_epochs = 8 

#model parameters
num_filters = 128 
weight_decay = 1e-4
outputDim = 100

modelCNN = Sequential()
modelCNN.add(Embedding(input_dim=len(X_train), output_dim=outputDim))
modelCNN.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
modelCNN.add(MaxPooling1D(2))
modelCNN.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
modelCNN.add(GlobalMaxPooling1D())
modelCNN.add(Dropout(0.5))
modelCNN.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
modelCNN.add(Dense(numClases, activation='sigmoid'))  #multi-label (k-hot encoding)

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
modelCNN.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
modelCNN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         15957100  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         89728     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
__________

### Cross Validation CNN

In [17]:
# CV experiments
idExp = 0
numFeatures = maxFeatures

# Cross validation
Nfolds = 2
kf = KFold(n_splits=Nfolds, random_state=True)
kf.get_n_splits(X_train)

name = "CNN"
meanAcc = 0.0
meanLogLoss = 0.0
meanFmeasure = 0.0

batch_size = 64 
num_epochs = 3 

In [18]:
for train_index, test_index in tqdm(kf.split(X_train)):
    X_train_cv = X_train[train_index]
    X_test_cv = X_train[test_index]
    y_train_cv, y_test_cv = yBinary[train_index], yBinary[test_index]

    cnnmModelHist = modelCNN.fit(X_train_cv, y_train_cv, batch_size=batch_size, epochs=num_epochs, validation_split=0.1, shuffle=True, verbose=2)
    predicted = modelCNN.predict(X_test_cv)

    acc = accuracy_score(y_test_cv, predicted.round())
    fmeausre = f1_score(y_test_cv, predicted.round(), labels=[0,1,2,3,4,5,6], average=None)
    logloss = log_loss(y_pred=predicted.round(), y_true=y_test_cv)
    meanAcc += acc
    meanLogLoss += logloss
    meanFmeasure += fmeausre
meanAcc = meanAcc / Nfolds
meanLogLoss = meanLogLoss / Nfolds
meanFmeasure = meanFmeasure / Nfolds
dfTestResults.loc[idExp] = [idExp,maxFeatures,name,Nfolds,meanAcc,meanLogLoss,meanFmeasure]
print(str(idExp))
idExp += 1

In [None]:
#dfTestResults.to_excel('../reports/reportsCNN'+ str(maxFeatures) + '.xls', index=False)

### Predicción sobre test

> En este caso hacemos fitting mediante Tokenización, debido a que para clasificar textos a partir de CNN es la que mejor resultados da. 

In [None]:
#model training
cnnmModelHist = modelCNN.fit(X_train, y_train, batch_size=batch_size, epochs=min(num_epochs, 3), validation_split=0.1, shuffle=True, verbose=2)

Train on 143613 samples, validate on 15958 samples
Epoch 1/3
 - 2197s - loss: 0.1525 - acc: 0.9544 - val_loss: 0.0704 - val_acc: 0.9759
Epoch 2/3
 - 2184s - loss: 0.0611 - acc: 0.9788 - val_loss: 0.0605 - val_acc: 0.9787
Epoch 3/3


In [None]:
predicted = modelCNN.predict(X_test)

> Se almacenan los datos predecidos en formato CSV para poder hacer el submision de test, y poder evaluar el modelo.

In [None]:
columns = ["id","toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
dfTestPredicted = pd.DataFrame(columns=columns)
for x in tqdm(range(len(test))):
    dfTestPredicted.loc[x] = [test['id'][x], predicted[x][1], predicted[x][2], predicted[x][3], predicted[x][4], predicted[x][5], predicted[x][6]]
dfTestPredicted.to_csv('../reports/testPred/predTestCNN_Seq_'+ str(maxFeatures) +'.csv',encoding='utf-8', index=False)

## Creación del modelo LSTM

In [None]:
inp = Input(shape=(len(X_train_seq[0]), ))
embed_size = 128
x = Embedding(maxFeatures, embed_size)(inp)
x = LSTM(90, return_sequences=True,name='lstm_layer')(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(60, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(7, activation="sigmoid")(x)
modelLSTM = Model(inputs=inp, outputs=x)
modelLSTM.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])


### Cross Validation LSTM

In [None]:
dfTestResults = pd.DataFrame(columns=columns)

In [None]:
# CV experiments
idExp = 0
X_train = X_train_seq
X_test = X_test_seq
numFeatures = maxFeatures

# Cross validation
Nfolds = 3
kf = KFold(n_splits=Nfolds, random_state=True)
kf.get_n_splits(X_train)

name = "LSTM"
meanAcc = 0.0
meanLogLoss = 0.0
meanFmeasure = 0.0

batch_size = 64 
num_epochs = 3 

In [None]:
for train_index, test_index in tqdm(kf.split(X_train)):
    X_train_cv = X_train[train_index]
    X_test_cv = X_train[test_index]
    y_train_cv, y_test_cv = yBinary[train_index], yBinary[test_index]

    cnnmModelHist = modelLSTM.fit(X_train_cv, y_train_cv, batch_size=batch_size, epochs=num_epochs, validation_split=0.1, shuffle=True, verbose=2)
    predicted = modelLSTM.predict(X_test_cv)

    acc = accuracy_score(y_test_cv, predicted.round())
    fmeausre = f1_score(y_test_cv, predicted.round(), labels=[0,1,2,3,4,5,6], average=None)
    logloss = log_loss(y_pred=predicted.round(), y_true=y_test_cv)
    meanAcc += acc
    meanLogLoss += logloss
    meanFmeasure += fmeausre
meanAcc = meanAcc / Nfolds
meanLogLoss = meanLogLoss / Nfolds
meanFmeasure = meanFmeasure / Nfolds
dfTestResults.loc[idExp] = [idExp,maxFeatures,name,Nfolds,meanAcc,meanLogLoss,meanFmeasure]
print(str(idExp))
idExp += 1

In [None]:
dfTestResults.to_excel('../reports/reportsCNN'+ str(maxFeatures) + '.xls', index=False)

### Predicción sobre test

En este caso hacemos fitting mediante Tokenización, debido a que para clasificar textos a partir de LSTM es la que mejor resultados da.

In [None]:
modelLSTM.fit(X_train_seq, y_train, batch_size=batch_size, epochs=min(num_epochs,2), validation_split=0.1);

> Se almacenan los datos predecidos en formato CSV para poder hacer el submision de test, y poder evaluar el modelo.

In [None]:
predicted = modelLSTM.predict(X_test_seq, batch_size=1024, verbose=1)
columns = ["id","toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
dfTestPredicted = pd.DataFrame(columns=columns)
for x in tqdm(range(len(test))):
    dfTestPredicted.loc[x] = [test['id'][x], predicted[x][1], predicted[x][2], predicted[x][3], predicted[x][4], predicted[x][5], predicted[x][6]]
dfTestPredicted.to_csv('../reports/testPred/predTestLSTM_Seq_'+ str(maxFeatures) +'.csv',encoding='utf-8', index=False)