In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory, one full path per line.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration
---

## Importando o DataSet

In [None]:
# Load the competition's training CSV into a DataFrame.
df = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/train.csv")

In [None]:
# Preview the first 20 rows.
df.head(20)

In [None]:
# Preview the last rows (pandas' default row count).
df.tail()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

## Verificando se há dados nulos ou vazios

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## Verificar quantas e quais línguas estão presentes no DF

In [None]:
# Report how many distinct languages the dataset contains, followed by the list itself.
languages = list(df.language.unique())
print(
    f"number of languages: {len(languages)}\n\n"
    f"{languages}")


## Relacionando a quantidade de cada língua dentro do DF

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count the rows of each language and draw the counts as a bar chart.
language_counts = df["language"].value_counts()

fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x=language_counts.index, y=language_counts, ax=ax)
plt.show()

In [None]:
# Share of each language in the dataset, drawn as a pie chart with percentage labels.
language_names, language_freqs = np.unique(df.language, return_counts=True)
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(language_freqs, labels=language_names, autopct='%.1f%%')
plt.show()

### OBS:
Assim, pode-se observar que o dataset é majoritariamente composto por premissas e hipóteses na língua inglesa, enquanto o restante das frases é igualmente distribuído entre as outras 14 línguas.

## Relacionando a quantidade de labels em cada língua

In [None]:
# Select matplotlib as the pandas plotting backend, then plot the overall label distribution.
pd.set_option("plotting.backend", "matplotlib")
df["label"].hist()
plt.show()

In [None]:
import seaborn as sns
# One count plot of the labels per language code, wrapped at five panels per row.
sns.catplot(data=df, x="label", col="lang_abv", col_wrap=5, kind="count")
plt.show()

### OBS:
As labels, no total, são levemente desbalanceadas. Esse desbalanço está presente em todos os idiomas.

# Pre-Processamento
---

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the two text columns, the language code and the target.
df = df[["premise", "hypothesis", "lang_abv", "label"]]
X = df.iloc[:, :-1]  # every column except the trailing "label"
y = df.label

# A fixed random_state makes the 70/30 split -- and every score computed from
# it further down -- reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Bag of Words:

* ### Separando premissas e hipóteses por idioma:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

def preprossecing1(train, test, train_label, test_label, lan):
    """Build PCA-reduced TF-IDF features for one language, premise and hypothesis kept separate.

    Premises and hypotheses each get their own ``TfidfVectorizer``, fitted on
    the training rows only. The two TF-IDF matrices are placed side by side
    (premise columns first), passed through ``Normalizer`` and reduced with
    ``PCA(0.9)``. The normaliser/PCA pipeline is fitted on the training rows
    and only applied to the test rows.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames; the ``premise``, ``hypothesis`` and ``lang_abv``
        columns are read.
    train_label, test_label : pandas.Series
        Targets, selected with ``.loc`` using the index labels of the rows
        kept from ``train`` / ``test``.
    lan
        Language code; only rows whose ``lang_abv`` equals it are used.

    Returns
    -------
    tuple
        ``(train_features, test_features, train_labels, test_labels)``. The
        features are dense NumPy arrays, the labels are the filtered Series.
    """
    vec_p = TfidfVectorizer()
    vec_h = TfidfVectorizer()
    # Author's note: normalising the data is recommended before applying PCA.
    pipe = make_pipeline(Normalizer(copy=False), PCA(0.9))

    # Language filter, computed once and reused for both features and labels.
    train_lan = train[train.lang_abv == lan]
    test_lan = test[test.lang_abv == lan]
    train_label_final = train_label.loc[train_lan.index]
    test_label_final = test_label.loc[test_lan.index]

    # Premises
    train_p = vec_p.fit_transform(train_lan.premise)
    test_p = vec_p.transform(test_lan.premise)

    # Hypotheses
    train_h = vec_h.fit_transform(train_lan.hypothesis)
    test_h = vec_h.transform(test_lan.hypothesis)

    # Stack [premise | hypothesis] columns directly as dense arrays. This avoids
    # a round trip through a column-labelled sparse DataFrame, and with it the
    # dependency on TfidfVectorizer.get_feature_names(), which no longer exists
    # in scikit-learn >= 1.2.
    train_final = np.hstack([train_p.toarray(), train_h.toarray()])
    test_final = np.hstack([test_p.toarray(), test_h.toarray()])

    train_final = pipe.fit_transform(train_final)
    test_final = pipe.transform(test_final)

    return train_final, test_final, train_label_final, test_label_final

* ### Agrupando premissa e hipótese por idioma

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

def preprossecing2(train, test, train_label, test_label, lan):
    """Build PCA-reduced TF-IDF features for one language from premise + hypothesis joined.

    Each premise is concatenated with its hypothesis (separated by a single
    space) and the resulting documents are vectorised with one
    ``TfidfVectorizer`` fitted on the training rows only. The TF-IDF matrix is
    then passed through ``Normalizer`` and reduced with ``PCA(0.9)``; that
    pipeline is fitted on the training rows and only applied to the test rows.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames; the ``premise``, ``hypothesis`` and ``lang_abv``
        columns are read.
    train_label, test_label : pandas.Series
        Targets, selected with ``.loc`` using the index labels of the rows
        kept from ``train`` / ``test``.
    lan
        Language code; only rows whose ``lang_abv`` equals it are used.

    Returns
    -------
    tuple
        ``(train_features, test_features, train_labels, test_labels)``. The
        features are dense NumPy arrays, the labels are the filtered Series.
    """
    vec = TfidfVectorizer()
    # Author's note: normalising the data is recommended before applying PCA.
    pipe = make_pipeline(Normalizer(copy=False), PCA(0.9))

    # Language filter, computed once and reused for both features and labels.
    train_lan = train[train.lang_abv == lan]
    test_lan = test[test.lang_abv == lan]
    train_label_final = train_label.loc[train_lan.index]
    test_label_final = test_label.loc[test_lan.index]

    # Join every premise with its hypothesis into a single document.
    train_text = [p + " " + h for p, h in zip(train_lan.premise, train_lan.hypothesis)]
    test_text = [p + " " + h for p, h in zip(test_lan.premise, test_lan.hypothesis)]

    # Densify the TF-IDF matrices directly. This avoids a round trip through a
    # column-labelled sparse DataFrame, and with it the dependency on
    # TfidfVectorizer.get_feature_names(), which no longer exists in
    # scikit-learn >= 1.2.
    train_final = vec.fit_transform(train_text).toarray()
    test_final = vec.transform(test_text).toarray()

    train_final = pipe.fit_transform(train_final)
    test_final = pipe.transform(test_final)

    return train_final, test_final, train_label_final, test_label_final

# Modelo de Machine Learning e Validação:
---

## Random Forest Classifier:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def predictRFC(X_train, X_test, y_train, y_test, inteiro):
    """Fit and plot one-vs-rest random forests for every language.

    For each language code found in the training features, the rows are
    vectorised with ``preprossecing1`` when ``inteiro == 0`` and with
    ``preprossecing2`` otherwise. A binary forest is then fitted for each
    label (0, 1, 2) and its row-normalised confusion matrix is drawn, titled
    with the accuracy and V-measure obtained on the test rows.

    Caveat on the argument order: the notebook calls this function
    positionally as ``(train features, train labels, test features,
    test labels, inteiro)``. The parameter names are kept for compatibility,
    but with that call ``X_test`` holds the training labels and ``y_train``
    holds the test features.

    Returns nothing; one figure per language is shown.
    """
    # Give the positional arguments names that match what they really hold.
    train_features, train_labels = X_train, X_test
    test_features, test_labels = y_train, y_test

    preprocess = preprossecing1 if inteiro == 0 else preprossecing2

    for lan in train_features.lang_abv.unique():
        # Fixed seed so the figures are reproducible between runs.
        rfc = RandomForestClassifier(n_estimators=1000, max_depth=2, random_state=42)

        train, test, train_label, test_label = preprocess(
            train_features, test_features, train_labels, test_labels, lan
        )

        fig, axs = plt.subplots(1, 3, figsize=(15, 6))
        fig.suptitle(f"Language: {lan}")
        for n in [0, 1, 2]:
            # One-vs-rest target: does the sample carry label n?
            target_train = train_label == n
            target_test = test_label == n

            rfc.fit(train, target_train)
            pred = rfc.predict(test)

            accuracy = metrics.accuracy_score(target_test, pred)
            v = metrics.v_measure_score(target_test, pred)

            # Explicit labels keep the matrix 2x2 even when only one class is
            # present, so the annotation loop below can always index cm[i, j].
            cm = confusion_matrix(target_test, pred, labels=[False, True], normalize="true")
            axs[n].imshow(cm)
            axs[n].set_xticks(np.arange(2))
            axs[n].set_yticks(np.arange(2))
            axs[n].set_xticklabels(["False", "True"])
            axs[n].set_yticklabels(["False", "True"])
            axs[n].set_title(f"Label {n}\naccuracy: {accuracy:.3f}\nv_score: {v:.3f}")
            # Write each cell's value on top of the heat map.
            for i in range(2):
                for j in range(2):
                    axs[n].text(j, i, cm[i, j],
                                ha="center", va="center", color="k")

        plt.show()

In [None]:
# separado
# predictRFC(X_train, y_train, X_test, y_test, 0)

In [None]:
# Grouped variant: a non-zero last argument makes predictRFC vectorise with preprossecing2.
predictRFC(X_train, y_train, X_test, y_test, 1)

## XGBoost:

In [None]:
import xgboost as xgb

def predictXGB(X_train, X_test, y_train, y_test, inteiro):
    """Fit and plot one-vs-rest XGBoost classifiers for every language.

    For each language code found in the training features, the rows are
    vectorised with ``preprossecing1`` when ``inteiro == 0`` and with
    ``preprossecing2`` otherwise. A binary classifier is then fitted for each
    label (0, 1, 2) and its row-normalised confusion matrix is drawn, titled
    with the accuracy and V-measure obtained on the test rows.

    Caveat on the argument order: the notebook calls this function
    positionally as ``(train features, train labels, test features,
    test labels, inteiro)``. The parameter names are kept for compatibility,
    but with that call ``X_test`` holds the training labels and ``y_train``
    holds the test features.

    Returns nothing; one figure per language is shown.
    """
    # Give the positional arguments names that match what they really hold.
    train_features, train_labels = X_train, X_test
    test_features, test_labels = y_train, y_test

    preprocess = preprossecing1 if inteiro == 0 else preprossecing2

    for lan in train_features.lang_abv.unique():
        # The keyword must be spelled ``max_depth``; under any other spelling
        # the depth limit of 2 is not applied to the trees.
        clf = xgb.XGBClassifier(n_estimators=500, max_depth=2, scale_pos_weight=10)

        train, test, train_label, test_label = preprocess(
            train_features, test_features, train_labels, test_labels, lan
        )

        fig, axs = plt.subplots(1, 3, figsize=(15, 8))
        fig.suptitle(f"Language: {lan}")
        for n in [0, 1, 2]:
            # One-vs-rest target: does the sample carry label n?
            target_train = train_label == n
            target_test = test_label == n

            clf.fit(train, target_train)
            pred = clf.predict(test)

            accuracy = metrics.accuracy_score(target_test, pred)
            v = metrics.v_measure_score(target_test, pred)

            # Explicit labels keep the matrix 2x2 even when only one class is
            # present, so the annotation loop below can always index cm[i, j].
            cm = confusion_matrix(target_test, pred, labels=[False, True], normalize="true")
            axs[n].imshow(cm)
            axs[n].set_xticks(np.arange(2))
            axs[n].set_yticks(np.arange(2))
            axs[n].set_xticklabels(["False", "True"])
            axs[n].set_yticklabels(["False", "True"])
            axs[n].set_title(f"Label {n}\naccuracy: {accuracy:.3f}\nv_score: {v:.3f}")
            # Write each cell's value on top of the heat map.
            for i in range(2):
                for j in range(2):
                    axs[n].text(j, i, cm[i, j],
                                ha="center", va="center", color="k")

        plt.show()

In [None]:
# separado
# predictXGB(X_train, y_train, X_test, y_test, 0)

In [None]:
# Grouped variant: a non-zero last argument makes predictXGB vectorise with preprossecing2.
predictXGB(X_train, y_train, X_test, y_test, 1)

# Conclusão
---

## Modelos
Apesar dos bons resultados de acurácia do RandomForest, o resultado não é confiável, uma vez que o modelo gera a mesma resposta independentemente da label ou do idioma. Já com o método XGBoost, obteve-se um resultado mais consistente, com variação das predições em função tanto da label quanto do idioma.

## Abordagem
A abordagem utilizada foi o 'bag of words', na qual se considera apenas a frequência das palavras presentes, transformando textos em vetores. Entretanto, abordagens com 'word embeddings' aparentemente seriam melhores, uma vez que elas consideram o valor dos textos, permitindo estabelecer comparações entre eles, o que, neste caso, seria uma ótima alternativa.