# 4. Clasificación por Deep Learning

In [None]:
import pandas as pd
import numpy as np
import re
import time
import ast
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

from gensim import corpora, models, similarities

# Linear Models 
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB

from tqdm import tqdm

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Flatten, LSTM, Input, RNN
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import EarlyStopping



In [None]:
# Base names (without extension) of the preprocessed train / test CSV files.
nameTrainCSV = 'trainWithListOfCleanWords'
nameTestCSV = 'testWithListOfCleanWords'

# Read the preprocessed training set from the processed-data folder.
trainCsvPath = '../data/processed/' + nameTrainCSV + '.csv'
train = pd.read_csv(trainCsvPath, encoding='utf-8')

# Create the 'BagOfWords' column, using the built-in `dict` type as its
# initial value; the real per-row values are assigned in a later cell.
train['BagOfWords'] = dict
train.head(10)


In [None]:
# Turn the stringified word lists read from the CSV into real Python lists and
# build a per-row bag of words (token -> count).
#
# Whole-column assignment replaces the row-by-row `DataFrame.set_value` calls:
# that method was deprecated in pandas 0.21 and removed in pandas 1.0.
start = time.time()

# Only string cells are parsed, so re-running this cell on an already-parsed
# column is a no-op instead of an error.
train['listOfCleanWords'] = [
    ast.literal_eval(words) if isinstance(words, str) else words
    for words in train['listOfCleanWords']
]
train['BagOfWords'] = [Counter(words) for words in train['listOfCleanWords']]

end = time.time()
print(end - start)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# MULTICLASS PREDICTION

In [None]:
# Class id -> label name. Id 0 is the "neutral" class; ids 1-6 are the
# toxicity labels, whose names are also used as column names of `train`.
classLabel = dict(enumerate([
    "neutral",
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]))


In [None]:
# Build `y`: for every training row, the list of class ids that apply to it
# (multi-label target). Rows without any toxic label get the neutral class [0].
numTrainRows = len(train['cleanWordsAsText'])
y = np.empty((numTrainRows,), dtype=object)

# class id -> positions of the rows flagged with that class.
toxicRowsByClass = dict()
for idx, label in classLabel.items():
    if label == "neutral":
        continue
    # Positions of the rows whose indicator column for this class equals 1.
    toxicRows = np.where(train[label] == 1)[0]
    toxicRowsByClass[idx] = toxicRows
    for i in toxicRows:
        if y[i] is None:
            y[i] = [idx]
        else:
            y[i].append(idx)

# Rows that received no label are neutral. Each row is given its own fresh
# list explicitly, rather than relying on how NumPy broadcasts a nested list
# into an object array.
indxsOfNeutralTexts = np.array(
    [i for i, labels in enumerate(y) if labels is None], dtype=int)
for i in indxsOfNeutralTexts:
    y[i] = [0]

# `np.where` yields positions, hence positional (`iloc`) access below.
cleanTexts = train['cleanWordsAsText']
allTextsNoToxicTrain = [str(cleanTexts.iloc[i]) for i in indxsOfNeutralTexts]

# Sorted, de-duplicated positions of rows with at least one toxic label.
# Concatenating the integer index arrays keeps an integer dtype (appending to
# an empty Python list would promote the indices to float64).
if toxicRowsByClass:
    idxList = np.unique(np.concatenate(list(toxicRowsByClass.values())))
else:
    idxList = np.array([], dtype=int)
allTextToxicTrain = [str(cleanTexts.iloc[j]) for j in idxList]

In [None]:
# Read the preprocessed test set that will be classified.
testCsvPath = '../data/processed/' + nameTestCSV + '.csv'
test = pd.read_csv(testCsvPath, encoding='utf-8')
test.head()


> Se inicializan las variables X_train, X_test e y_train completas a partir de los textos ya limpios, además de obtener todos los textos en forma de lista.

In [None]:
# Clean texts as plain Python lists; missing texts become empty strings.
# `fillna` replaces every missing value, whereas an identity check against
# `np.nan` only matches that one singleton object.
allTrainText = train['cleanWordsAsText'].fillna('').tolist()
allTestText = test['cleanWordsAsText'].fillna('').tolist()
X_train = allTrainText
X_test = allTestText

# Binarize the per-row label lists into a k-hot matrix. The class order is
# pinned to the sorted class ids so that column i always corresponds to class
# id i, even if some class has no training example.
yBinary = MultiLabelBinarizer(classes=sorted(classLabel)).fit_transform(y)
y_train = yBinary


In [None]:
# Empty results table: one column per recorded experiment attribute.
columns = [
    "idExp",
    "numFeatures",
    "algorithm",
    "Nfolds",
    "accuaracy",
    "logloss",
    "fmeasure",
]
dfTestResults = pd.DataFrame(columns=columns)

> Cantidad de features a utilizar

In [None]:
# Vocabulary size cap shared by the notebook: passed as `max_features` to the
# TF-IDF vectorizer, as `num_words` to the Keras tokenizer, and used as the
# input dimension of the LSTM embedding layer.
maxFeatures = 100000

## Word Embeddings
> En este apartado se diseñan 3 formas de representación de las palabras de los textos, utilizando directamente los textos ya limpiados previamente. 

### Mediante vector TF-IDF

In [None]:
# Fit the TF-IDF vectorizer (1- to 6-grams, at most `maxFeatures` terms) on
# the training texts.
tfidV = TfidfVectorizer(ngram_range=(1,6), max_features=maxFeatures)
X_train_tfid = tfidV.fit_transform(allTrainText)

# Size of the learned vocabulary (`vocabulary_` maps term -> column index).
numFeatures = len(tfidV.vocabulary_)
print("NFeatures = " + str(numFeatures))

# Transform the test texts with the vectorizer fitted on train. Building a
# second vectorizer and fitting it on the test set would (a) fall back to the
# default unigram-only `ngram_range`, leaving every n-gram column at zero, and
# (b) recompute the IDF weights from test data, so train and test features
# would not be comparable.
tfidVTest = tfidV  # name kept for any later cell that refers to it
X_test_tfid = tfidVTest.transform(X_test)

### Mediante BOW

In [None]:
# Bag-of-words counts over the same vocabulary (and column order) as the
# TF-IDF matrix.
# `ngram_range` must match the one used to learn the vocabulary: with the
# default (1, 1) only unigrams are extracted and every n-gram column stays 0.
bowFeatures = CountVectorizer(vocabulary=tfidV.vocabulary_,
                              ngram_range=tfidV.ngram_range)
X_train_bow = bowFeatures.fit_transform(allTrainText)

# The vocabulary is fixed, so the test set only needs to be transformed.
X_test_bow = bowFeatures.transform(allTestText)

### Mediante Tokenización de las palabras

In [None]:
# Fit a Keras tokenizer (configured with num_words=maxFeatures) on the
# training texts, then encode both text sets as sequences of word indices.
tokenizer = Tokenizer(num_words=maxFeatures)
tokenizer.fit_on_texts(allTrainText)

X_train_tokenized_seq, X_test_tokenized_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (allTrainText, allTestText)
)


In [None]:
# Number of entries in the tokenizer's word index after fitting.
len(tokenizer.word_index)

In [None]:
# Pad the training sequences (no explicit maxlen), then force the test
# sequences to the same width as the padded training rows.
X_train_seq = pad_sequences(X_train_tokenized_seq)
trainSeqLen = len(X_train_seq[0])
X_test_seq = pad_sequences(X_test_tokenized_seq, maxlen=trainSeqLen)

## Creación del modelo CNN

In [None]:
# CNN text classifier: embedding -> two 1D convolutions -> global max pooling
# -> dense head with one sigmoid output per class (multi-label).
numClases = 7  # neutral + 6 toxicity labels

# Training parameters
batch_size = 512
num_epochs = 8

# Model parameters
num_filters = 128
weight_decay = 1e-4
outputDim = 100  # embedding vector size

model = Sequential()
# `input_dim` is the vocabulary size, i.e. the tokenizer's `num_words` cap
# (maxFeatures) -- not the number of training documents. Using
# len(X_train) only worked by accident when the corpus has more documents
# than the vocabulary cap.
model.add(Embedding(input_dim=maxFeatures, output_dim=outputDim))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(numClases, activation='sigmoid'))  # multi-label (k-hot encoding)

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# Binary cross-entropy treats every class output as an independent yes/no.
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

> En este caso hacemos fitting mediante tokenización, debido a que para clasificar textos con una CNN es la que mejores resultados da.

In [None]:
#model training
# Train the CNN on the padded token sequences, holding out 10% of the
# training rows for validation; the returned history is kept.
cnnmModelHist = model.fit(
    X_train_seq,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    shuffle=True,
    verbose=2,
)

In [None]:
# Run the trained CNN on the padded test sequences; `predicted` holds one row
# of per-class outputs for each test text.
predicted = model.predict(X_test_seq)

> Se almacenan los datos predichos en formato CSV para poder hacer la submission de test y poder evaluar el modelo.

In [None]:
# Write the CNN predictions as a submission CSV: the test id followed by the
# six toxicity scores. Column 0 of `predicted` (the neutral class) is dropped.
columns = ["id","toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Build the frame in one vectorized step; appending one row at a time with
# `.loc[x] = ...` re-allocates the frame on every iteration and is extremely
# slow on a large test set.
dfTestPredicted = pd.DataFrame(predicted[:, 1:7], columns=columns[1:])
dfTestPredicted.insert(0, "id", test['id'].values)
dfTestPredicted.to_csv('../reports/testPred/predTestCNN_Seq_'+ str(maxFeatures) +'.csv',encoding='utf-8', index=False)

## Creación del modelo LSTM

In [None]:
# LSTM text classifier (functional API): embedding -> LSTM returning the full
# sequence -> global max pooling -> dense head with 7 sigmoid outputs.
embed_size = 128
inp = Input(shape=(len(X_train_seq[0]), ))

embedded = Embedding(maxFeatures, embed_size)(inp)
lstmSeq = LSTM(90, return_sequences=True, name='lstm_layer')(embedded)
pooled = GlobalMaxPool1D()(lstmSeq)
pooled = Dropout(0.1)(pooled)
hidden = Dense(60, activation="relu")(pooled)
hidden = Dropout(0.1)(hidden)
classScores = Dense(7, activation="sigmoid")(hidden)

modelLSTM = Model(inputs=inp, outputs=classScores)
modelLSTM.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


> En este caso hacemos fitting mediante tokenización, debido a que para clasificar textos con una LSTM es la que mejores resultados da.

In [None]:
# Train the LSTM for at most 2 epochs on the padded token sequences, holding
# out 10% of the training rows for validation.
lstmEpochs = min(num_epochs,2)
modelLSTM.fit(
    X_train_seq,
    y_train,
    batch_size=batch_size,
    epochs=lstmEpochs,
    validation_split=0.1,
);

> Se almacenan los datos predichos en formato CSV para poder hacer la submission de test y poder evaluar el modelo.

In [None]:
# Predict with the LSTM and write the submission CSV: the test id followed by
# the six toxicity scores. Column 0 of `predicted` (the neutral class) is
# dropped.
predicted = modelLSTM.predict(X_test_seq, batch_size=1024, verbose=1)
columns = ["id","toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Build the frame in one vectorized step; appending one row at a time with
# `.loc[x] = ...` re-allocates the frame on every iteration and is extremely
# slow on a large test set.
dfTestPredicted = pd.DataFrame(predicted[:, 1:7], columns=columns[1:])
dfTestPredicted.insert(0, "id", test['id'].values)
dfTestPredicted.to_csv('../reports/testPred/predTestLSTM_Seq_'+ str(maxFeatures) +'.csv',encoding='utf-8', index=False)