# 3. Clasificación por machine learning

###### En este notebook se utilizan exclusivamente algoritmos de machine learning sobre los datos previamente limpiados por "Clean Words". En primer lugar se realiza una validación cruzada (Cross Validation) sobre los datos de entrenamiento y, posteriormente, se realiza una predicción sobre los datos de test para que pueda ser evaluada por Kaggle.

In [None]:
import pandas as pd
import numpy as np
import re
import time
import ast
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

from gensim import corpora, models, similarities

# Linear Models 
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from tqdm import tqdm


In [None]:
# Base names (without extension) of the pre-cleaned CSV files.
nameTrainCSV, nameTestCSV = 'trainWithListOfCleanWords', 'testWithListOfCleanWords'

# Load the pre-cleaned training set.
trainCSVPath = '../data/processed/' + nameTrainCSV + '.csv'
train = pd.read_csv(trainCSVPath, encoding='utf-8')
# Placeholder object column: every row holds the `dict` type itself until the
# column is filled with real values.
train['BagOfWords'] = dict
train.head(10)


In [None]:
# Parse the serialized columns of `train` in place and print the elapsed time.
# Column-wise operations replace the per-row DataFrame.set_value calls:
# set_value was deprecated in pandas 0.21 and removed in pandas 1.0.
start = time.time()
# The word lists are stored in the CSV as Python list literals.
train['listOfCleanWords'] = train['listOfCleanWords'].apply(ast.literal_eval)
# Force plain strings. Missing comments become '' rather than the literal
# token 'nan' that str(NaN) would produce.
train['cleanWordsAsText'] = train['cleanWordsAsText'].fillna('').astype(str)
# Per-comment word frequencies.
train['BagOfWords'] = train['listOfCleanWords'].apply(Counter)
end = time.time()
print(end - start)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# MULTICLASS PREDICTION

In [None]:
# Mapping from numeric class id to label name: id 0 is "neutral", ids 1-6 are
# the toxicity labels.
classLabel = dict(enumerate([
    "neutral",
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]))


In [None]:
# Build the multi-label target `y` (one list of class ids per comment) and
# split the training comments into toxic and neutral texts.
toxicIds = [idx for idx in classLabel if classLabel[idx] != "neutral"]
# Boolean matrix: one row per comment, one column per toxicity label.
toxicFlags = (train[[classLabel[idx] for idx in toxicIds]] == 1).values

# Every entry gets its own list, assigned element by element. Assigning
# `[[0]]` through a fancy index lets NumPy broadcast the nested list, which can
# leave the scalar 0 (not the list [0]) in the neutral rows.
y = np.empty((len(train),), dtype=object)
for row, rowFlags in enumerate(toxicFlags):
    rowLabels = [idx for idx, flagged in zip(toxicIds, rowFlags) if flagged]
    y[row] = rowLabels if rowLabels else [0]

hasToxicLabel = toxicFlags.any(axis=1)
# Positions of the comments without any toxicity label.
indxsOfNeutralTexts = np.where(~hasToxicLabel)[0]
# Sorted positions of the comments carrying at least one toxicity label
# (integer dtype, unlike the float array np.append builds from an empty list).
idxList = np.where(hasToxicLabel)[0]

# Positional access on the raw values, consistent with the np.where positions.
cleanTexts = train['cleanWordsAsText'].values
allTextsNoToxicTrain = [str(txt) for txt in cleanTexts[indxsOfNeutralTexts]]
allTextToxicTrain = [str(txt) for txt in cleanTexts[idxList]]

In [None]:
# Load the pre-cleaned test set.
testCSVPath = '../data/processed/' + nameTestCSV + '.csv'
test = pd.read_csv(testCSVPath, encoding='utf-8')
test.head()

> Se inicializan las variables X_train, X_test e y_train completas a partir de los textos ya limpios, además de obtener todos los textos en forma de lista.

In [None]:
# Raw text inputs and binary label matrix for the full train / test sets.
# Missing comments are detected with pd.isnull: `txt is not np.nan` is an
# identity check, so a NaN object other than the np.nan singleton would slip
# through and reach the vectorizers as a float.
allTrainText = ['' if pd.isnull(txt) else txt for txt in train['cleanWordsAsText']]
allTestText = ['' if pd.isnull(txt) else txt for txt in test['cleanWordsAsText']]
X_train = allTrainText
X_test = allTestText
# One indicator column per class id found in `y`.
yBinary = MultiLabelBinarizer().fit_transform(y)
y_train = yBinary

In [None]:
# Empty table that receives one row of fold-averaged metrics per experiment.
columns = [
    "idExp",
    "numFeatures",
    "algorithm",
    "Nfolds",
    "accuaracy",
    "logloss",
    "fmeasure",
]
dfTestResults = pd.DataFrame(columns=columns)

## Clasificadores Machine Learning

> Inicializamos los clasificadores con los parámetros que utilizará cada uno de ellos. Los clasificadores a utilizar son:
> - Regresión Logística (LR)
> - Máquinas de soporte vectorial con un kernel lineal (Linear SVC)
> - Naive Bayes (NB)
> - Stochastic Gradient Descent (SGD)
> - Árbol de decisión (DT)


In [None]:
# Classifiers: every base model is wrapped in a one-vs-rest scheme.
# NOTE(review): scikit-learn >= 1.1 renames SGDClassifier's loss="log" to
# "log_loss" and later releases drop the old spelling — confirm the target
# version before changing it.
clfLR, clfLSVC, clfNB, clfSGD, clfDT = [
    OneVsRestClassifier(baseModel)
    for baseModel in (
        LogisticRegression(C=5),              # Logistic regression
        LinearSVC(C=5),                       # Linear SVC
        MultinomialNB(),                      # Naive Bayes
        SGDClassifier(loss="log"),            # SGD with logistic loss
        DecisionTreeClassifier(max_depth=8),  # Decision tree
    )
]





## Word Embeddings
> En este apartado se diseñan 2 formas de representación de las palabras de los textos, utilizando directamente los textos ya limpiados previamente.

### Mediante vector TF-IDF

In [None]:
# Vectorize all texts with TF-IDF (word 1- to 6-grams, vocabulary capped at maxF).
maxF = 158627
tdifV = TfidfVectorizer(ngram_range=(1,6), max_features=maxF)
X_train_tdif = tdifV.fit_transform(allTrainText)

# vocabulary_ maps every kept term to its column, so its size is the feature
# count. It avoids get_feature_names(), which newer scikit-learn releases no
# longer provide.
numFeaturesTFID = len(tdifV.vocabulary_)
print("NFeatures = " + str(numFeaturesTFID))
# Project the test texts with the vectorizer fitted on train. A second
# TfidfVectorizer fitted on the test set learns different IDF weights and,
# with its default ngram_range=(1, 1), never fills the n-gram columns.
tdifVTest = tdifV
X_test_tdif = tdifV.transform(allTestText)

### Mediante BOW

In [None]:
# Bag-of-words counts over the same vocabulary (and column order) as TF-IDF.
# ngram_range has to match the TF-IDF vectorizer: with the default (1, 1) only
# unigrams are extracted and every n-gram column of the vocabulary stays zero.
bowFeatures = CountVectorizer(vocabulary=tdifV.vocabulary_,
                              ngram_range=tdifV.ngram_range)
X_train_bow = bowFeatures.fit_transform(allTrainText)
# The vocabulary is fixed, so the test set is only transformed, never fitted.
X_test_bow = bowFeatures.transform(allTestText)


## Cross Validation
> En este apartado se indican los parámetros a tener en cuenta en los experimentos de train que hacen uso de CV.

In [None]:
# Cross validation: plain (unshuffled) K-fold split.
# random_state only has an effect together with shuffle=True, and recent
# scikit-learn releases raise a ValueError when it is set without shuffling.
# It is therefore left out; the folds are the same contiguous blocks as before.
Nfolds = 5
kf = KFold(n_splits=Nfolds)
kf.get_n_splits(X_train_tdif)

> Para llevar un seguimiento de los experimentos realizados, se dispone de un id de experimento que numera cada experimento; los resultados se guardarán posteriormente en formato Excel.
> X_train podrá asignarse a cualquiera de los 2 word embeddings creados previamente, ya sea TF-IDF o BOW. Además, se deberá asignar a numFeatures el número de características del word embedding utilizado.

In [None]:
# Cross-validation setup: experiment counter, feature matrices to use
# (TF-IDF here) and their feature count.
idExp = 0
X_train, X_test = X_train_tdif, X_test_tdif
numFeatures = numFeaturesTFID

In [None]:
# Linear SVC: K-fold cross-validation. The fold-averaged metrics are appended
# to dfTestResults under the current experiment id.
name = "Linear SVC"
foldAcc, foldLogLoss, foldFmeasure = [], [], []

for train_index, test_index in kf.split(X_train):
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    y_train_cv, y_test_cv = yBinary[train_index], yBinary[test_index]

    clfLSVC.fit(X_train_cv, y_train_cv)
    predicted = clfLSVC.predict(X_test_cv)
    foldAcc.append(accuracy_score(y_test_cv, predicted))
    # average=None keeps one F1 value per label column.
    foldFmeasure.append(f1_score(y_test_cv, predicted, labels=[0,1,2,3,4,5,6], average=None))
    # NOTE(review): LinearSVC exposes no predict_proba, so this log loss is
    # computed on hard 0/1 predictions and is only a rough indicator.
    foldLogLoss.append(log_loss(y_pred=predicted, y_true=y_test_cv))

meanAcc = np.mean(foldAcc)
meanLogLoss = np.mean(foldLogLoss)
meanFmeasure = np.mean(foldFmeasure, axis=0)
# Record the feature count configured for this experiment (numFeatures) rather
# than always querying the TF-IDF vectorizer through get_feature_names().
dfTestResults.loc[idExp] = [idExp,numFeatures,name,Nfolds,meanAcc,meanLogLoss,meanFmeasure]
print(str(idExp))
idExp += 1

In [None]:
# Logistic regression: K-fold cross-validation. The fold-averaged metrics are
# appended to dfTestResults under the current experiment id.
name = "Logistic Regresion"
foldAcc, foldLogLoss, foldFmeasure = [], [], []

for train_index, test_index in kf.split(X_train):
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    y_train_cv, y_test_cv = yBinary[train_index], yBinary[test_index]

    clfLR.fit(X_train_cv, y_train_cv)
    predicted = clfLR.predict(X_test_cv)
    foldAcc.append(accuracy_score(y_test_cv, predicted))
    # average=None keeps one F1 value per label column.
    foldFmeasure.append(f1_score(y_test_cv, predicted, labels=[0,1,2,3,4,5,6], average=None))
    # Log loss is defined on probabilities, so it is fed the predict_proba
    # output instead of the thresholded 0/1 predictions.
    foldLogLoss.append(log_loss(y_pred=clfLR.predict_proba(X_test_cv), y_true=y_test_cv))

meanAcc = np.mean(foldAcc)
meanLogLoss = np.mean(foldLogLoss)
meanFmeasure = np.mean(foldFmeasure, axis=0)
# Record the feature count configured for this experiment (numFeatures) rather
# than always querying the TF-IDF vectorizer through get_feature_names().
dfTestResults.loc[idExp] = [idExp,numFeatures,name,Nfolds,meanAcc,meanLogLoss,meanFmeasure]
print(str(idExp))
idExp += 1

In [None]:
# SGD: K-fold cross-validation. The fold-averaged metrics are appended to
# dfTestResults under the current experiment id.
name = "SGD"
foldAcc, foldLogLoss, foldFmeasure = [], [], []

for train_index, test_index in kf.split(X_train):
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    y_train_cv, y_test_cv = yBinary[train_index], yBinary[test_index]

    clfSGD.fit(X_train_cv, y_train_cv)
    predicted = clfSGD.predict(X_test_cv)
    foldAcc.append(accuracy_score(y_test_cv, predicted))
    # average=None keeps one F1 value per label column.
    foldFmeasure.append(f1_score(y_test_cv, predicted, labels=[0,1,2,3,4,5,6], average=None))
    # Log loss is defined on probabilities, so it is fed the predict_proba
    # output instead of the thresholded 0/1 predictions.
    foldLogLoss.append(log_loss(y_pred=clfSGD.predict_proba(X_test_cv), y_true=y_test_cv))

meanAcc = np.mean(foldAcc)
meanLogLoss = np.mean(foldLogLoss)
meanFmeasure = np.mean(foldFmeasure, axis=0)
# Record the feature count configured for this experiment (numFeatures) rather
# than always querying the TF-IDF vectorizer through get_feature_names().
dfTestResults.loc[idExp] = [idExp,numFeatures,name,Nfolds,meanAcc,meanLogLoss,meanFmeasure]
print(str(idExp))
idExp += 1

In [None]:
# Naive Bayes: K-fold cross-validation. The fold-averaged metrics are appended
# to dfTestResults under the current experiment id.
name = "NB"
foldAcc, foldLogLoss, foldFmeasure = [], [], []

for train_index, test_index in kf.split(X_train):
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    y_train_cv, y_test_cv = yBinary[train_index], yBinary[test_index]

    clfNB.fit(X_train_cv, y_train_cv)
    predicted = clfNB.predict(X_test_cv)
    foldAcc.append(accuracy_score(y_test_cv, predicted))
    # average=None keeps one F1 value per label column.
    foldFmeasure.append(f1_score(y_test_cv, predicted, labels=[0,1,2,3,4,5,6], average=None))
    # Log loss is defined on probabilities, so it is fed the predict_proba
    # output instead of the thresholded 0/1 predictions.
    foldLogLoss.append(log_loss(y_pred=clfNB.predict_proba(X_test_cv), y_true=y_test_cv))

meanAcc = np.mean(foldAcc)
meanLogLoss = np.mean(foldLogLoss)
meanFmeasure = np.mean(foldFmeasure, axis=0)
# Record the feature count configured for this experiment (numFeatures) rather
# than always querying the TF-IDF vectorizer through get_feature_names().
dfTestResults.loc[idExp] = [idExp,numFeatures,name,Nfolds,meanAcc,meanLogLoss,meanFmeasure]
print(str(idExp))
idExp += 1

In [None]:
# Decision tree: K-fold cross-validation. The fold-averaged metrics are
# appended to dfTestResults under the current experiment id.
name = "Decision Tree"
foldAcc, foldLogLoss, foldFmeasure = [], [], []

for train_index, test_index in kf.split(X_train):
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    y_train_cv, y_test_cv = yBinary[train_index], yBinary[test_index]

    clfDT.fit(X_train_cv, y_train_cv)
    predicted = clfDT.predict(X_test_cv)
    foldAcc.append(accuracy_score(y_test_cv, predicted))
    # average=None keeps one F1 value per label column.
    foldFmeasure.append(f1_score(y_test_cv, predicted, labels=[0,1,2,3,4,5,6], average=None))
    # Log loss is defined on probabilities, so it is fed the predict_proba
    # output instead of the thresholded 0/1 predictions.
    foldLogLoss.append(log_loss(y_pred=clfDT.predict_proba(X_test_cv), y_true=y_test_cv))

meanAcc = np.mean(foldAcc)
meanLogLoss = np.mean(foldLogLoss)
meanFmeasure = np.mean(foldFmeasure, axis=0)
# Record the feature count configured for this experiment (numFeatures) rather
# than always querying the TF-IDF vectorizer through get_feature_names().
dfTestResults.loc[idExp] = [idExp,numFeatures,name,Nfolds,meanAcc,meanLogLoss,meanFmeasure]
print(str(idExp))
idExp += 1

In [None]:
# Persist the CV results table; the file name carries the max-features value.
reportPath = '../reports/reports' + str(maxF) + '.xls'
dfTestResults.to_excel(reportPath, index=False)

## Creación de submissions de TEST

In [None]:
# Build the submission file from the test predictions
def getCSVSubmision(prediction, name, ids=None, outDir='../reports/testPred/'):
    """Write a submission CSV from a matrix of per-class predictions.

    Column 0 of `prediction` is dropped; columns 1-6 are written as toxic,
    severe_toxic, obscene, threat, insult and identity_hate, next to the row id.

    Args:
        prediction: 2-D array-like with at least 7 columns, one row per
            submitted comment.
        name: base name (without extension) of the CSV file to create.
        ids: optional sequence of row identifiers; defaults to the `id` column
            of the module-level `test` DataFrame.
        outDir: directory the CSV file is written to.

    Raises:
        ValueError: if `prediction` is not 2-D with at least 7 columns, or if
            its number of rows differs from the number of ids.
    """
    import os

    columns = ["id","toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    if ids is None:
        ids = test['id']
    ids = np.asarray(ids)
    scores = np.asarray(prediction)
    if scores.ndim != 2 or scores.shape[1] < len(columns):
        raise ValueError("prediction must be 2-D with at least %d columns" % len(columns))
    if scores.shape[0] != len(ids):
        raise ValueError("prediction has %d rows but %d ids were given"
                         % (scores.shape[0], len(ids)))

    # Built in one shot: enlarging the frame row by row with .loc re-allocates
    # it on every insertion, which is quadratic in the number of rows.
    dfTestPredicted = pd.DataFrame(scores[:, 1:len(columns)], columns=columns[1:])
    dfTestPredicted.insert(0, columns[0], ids)

    dfTestPredicted.to_csv(os.path.join(outDir, str(name) + '.csv'), encoding='utf-8', index=False)


### Creación de pruebas para submission de test mediante TF-IDF


In [None]:
# Select the TF-IDF matrices (and their feature count) for the submissions.
X_train, X_test = X_train_tdif, X_test_tdif
numFeatures = numFeaturesTFID

In [None]:
# Fit every classifier on the full TF-IDF training matrix, in the same order
# as the individual calls: LSVC, LR, SGD, NB, DT.
for clf in (clfLSVC, clfLR, clfSGD, clfNB, clfDT):
    clf.fit(X_train, y_train)


In [None]:
# Suffix appended to the submission file names: the feature count as text.
addInfo = "{}".format(numFeatures)

In [None]:
# Linear SVC submission: built from hard class predictions (predict).
lsvcPrediction = clfLSVC.predict(X_test)
getCSVSubmision(prediction=lsvcPrediction, name="LSVC_TFID" + addInfo)

In [None]:
# Logistic regression submission: built from class probabilities.
lrPrediction = clfLR.predict_proba(X_test)
getCSVSubmision(prediction=lrPrediction, name="LR_TFID" + addInfo)

In [None]:
# SGD submission: built from class probabilities.
sgdPrediction = clfSGD.predict_proba(X_test)
getCSVSubmision(prediction=sgdPrediction, name="SGD_TFID" + addInfo)

In [None]:
# Naive Bayes submission: built from class probabilities.
nbPrediction = clfNB.predict_proba(X_test)
getCSVSubmision(prediction=nbPrediction, name="NB_TFID" + addInfo)

In [None]:
# Decision tree submission: built from class probabilities.
dtPrediction = clfDT.predict_proba(X_test)
getCSVSubmision(prediction=dtPrediction, name="DT_TFID" + addInfo)

### Creación de pruebas para submission de test mediante TF (BOW)

In [None]:
# Select the bag-of-words count matrices for the submissions.
X_train, X_test = X_train_bow, X_test_bow

In [None]:
# Fit all classifiers on the term-count (BOW) matrix.
# The TF-IDF re-vectorization that used to run here was removed: X_test holds
# the sparse BOW matrix at this point, not raw text, so a text vectorizer
# cannot be fitted on it. Its result was also unused by the visible cells that
# follow.
print("NFeatures = " + str(X_train.shape[1]))

for clf in (clfLSVC, clfLR, clfSGD, clfNB, clfDT):
    clf.fit(X_train, y_train)


In [None]:
# Linear SVC submission on term counts: built from hard class predictions.
lsvcPrediction = clfLSVC.predict(X_test)
getCSVSubmision(prediction=lsvcPrediction, name="LSVC_TD" + addInfo)

In [None]:
# Logistic regression submission on term counts: built from class probabilities.
lrPrediction = clfLR.predict_proba(X_test)
getCSVSubmision(prediction=lrPrediction, name="LR_TD" + addInfo)

In [None]:
# SGD submission on term counts: built from class probabilities.
sgdPrediction = clfSGD.predict_proba(X_test)
getCSVSubmision(prediction=sgdPrediction, name="SGD_TD" + addInfo)

In [None]:
# Naive Bayes submission on term counts: built from class probabilities.
nbPrediction = clfNB.predict_proba(X_test)
getCSVSubmision(prediction=nbPrediction, name="NB_TD" + addInfo)

In [None]:
# Decision tree submission on term counts: built from class probabilities.
dtPrediction = clfDT.predict_proba(X_test)
getCSVSubmision(prediction=dtPrediction, name="DT_TD" + addInfo)