### Imports

In [93]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, recall_score, precision_score, f1_score,
                             confusion_matrix, roc_auc_score, roc_curve, RocCurveDisplay)

from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import copy as copy
import unidecode

import nltk
from nltk import tokenize
from string import punctuation

### 1. Carregar os dados

In [94]:
# Read the raw dataset from disk.
df = pd.read_csv('text.csv')

# Keep only the rows whose label is 0 or 1, on a fresh copy with a clean 0..n-1 index.
is_binary_label = df['label'].eq(0) | df['label'].eq(1)
df = (
    df[is_binary_label]
    .copy()
    .reset_index(drop=True)
)

# Discard the first column of the file (presumably a saved row index — confirm against the CSV).
df = df.drop(columns=df.columns[0])

df.head()

Unnamed: 0,text,label
0,ive enjoyed being able to slouch about relax a...,0.0
1,i dont know i feel so lost,0.0
2,i was beginning to feel quite disheartened,0.0
3,i fear that they won t ever feel that deliciou...,1.0
4,i can still lose the weight without feeling de...,0.0


### Pré-Processamento

In [95]:
def df_lower(df_column):
    """Return a lower-cased copy of a pandas Series of strings.

    Uses the vectorized ``.str`` accessor instead of a per-element
    ``apply`` with a lambda. For string values the result is the same;
    missing values are passed through as missing instead of raising
    ``AttributeError``.

    Parameters
    ----------
    df_column : pandas.Series
        Series holding text values.

    Returns
    -------
    pandas.Series
        Series with the same index as ``df_column`` and every string
        converted to lower case.
    """
    return df_column.str.lower()

def remove_accents(df_column):
    """Transliterate every text in ``df_column`` with ``unidecode.unidecode``.

    Parameters
    ----------
    df_column : iterable of str
        Texts to process (e.g. a pandas Series).

    Returns
    -------
    list of str
        One transliterated string per input element, in input order.
        Note that a plain list is returned, not a Series.
    """
    ascii_texts = []
    for raw_text in df_column:
        ascii_texts.append(unidecode.unidecode(raw_text))
    return ascii_texts

def remove_stopword(df, column_name):
    """Drop English stopwords and single punctuation characters from a text column.

    Each text is split with NLTK's ``WordPunctTokenizer``; tokens that are
    in NLTK's English stopword list or that equal one character of
    ``string.punctuation`` are discarded, and the remaining tokens are
    re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column.
    column_name : str
        Name of the column in ``df`` holding the texts.

    Returns
    -------
    list of str
        One cleaned string per row of ``df[column_name]``, in row order.
    """
    # A set gives constant-time membership tests; with a list every token
    # lookup was a linear scan over the whole stopword list.
    blocked_tokens = set(nltk.corpus.stopwords.words("english"))
    # Iterating the punctuation string adds each character as its own entry.
    blocked_tokens.update(punctuation)

    tokenizer = tokenize.WordPunctTokenizer()

    frase_processada = []
    for review in df[column_name]:
        palavras_mantidas = [
            palavra
            for palavra in tokenizer.tokenize(review)
            if palavra not in blocked_tokens
        ]
        frase_processada.append(' '.join(palavras_mantidas))

    return frase_processada

# Run the cleaning steps in order; each one overwrites the `text` column
# with the output of the previous step.
df['text'] = df_lower(df['text'])
df['text'] = remove_accents(df['text'])
df['text'] = remove_stopword(df, 'text')
df.head()

Unnamed: 0,text,label
0,ive enjoyed able slouch relax unwind frankly n...,0.0
1,dont know feel lost,0.0
2,beginning feel quite disheartened,0.0
3,fear ever feel delicious excitement christmas ...,1.0
4,still lose weight without feeling deprived,0.0


### 2. Separar entre treino e teste

In [96]:
# Features are the cleaned texts; the target is the label column.
x, y = df['text'], df['label']

# Hold out 20% of the rows as the test set, stratified on the label,
# with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

### 3. Criar validação cruzada com o conjunto de treino + Criar as features com TF-IDF

In [97]:
# Number of cross-validation folds, and the splitter built from it
# (only n_splits is passed, so every other KFold option keeps its default).
n_splits = 5
kfolds = KFold(n_splits=n_splits)

# Candidate values of k for KNN: 5, 10, ..., 50.
n_neighbors = [5 * multiple for multiple in range(1, 11)]

In [98]:
# Functions

def define_kfold_subsets(x_train, y_train, train_idx, val_idx):
    """Slice features and labels into the train/validation parts of one fold.

    Parameters
    ----------
    x_train, y_train : pandas objects supporting ``.iloc``
        Full training features and labels.
    train_idx, val_idx : positional indexers
        Row positions belonging to the fold's training and validation parts.

    Returns
    -------
    tuple
        ``(x_train_kfold, x_val_kfold, y_train_kfold, y_val_kfold)``.
    """
    fold_positions = (train_idx, val_idx)

    # Features first (train, validation), then labels (train, validation).
    feature_parts = [x_train.iloc[positions] for positions in fold_positions]
    label_parts = [y_train.iloc[positions] for positions in fold_positions]

    return (*feature_parts, *label_parts)
    

def vectorizer(x_train_kfold, x_val_kfold, min_df=10, max_features=5000):
    """Fit a TF-IDF vectorizer on the training texts and transform both subsets.

    The vectorizer is fitted on ``x_train_kfold`` only and then applied to
    ``x_val_kfold``, so the validation texts do not influence the fitted
    vocabulary or weights.

    Parameters
    ----------
    x_train_kfold : iterable of str
        Training texts used to fit the vectorizer.
    x_val_kfold : iterable of str
        Validation texts, transformed with the fitted vectorizer.
    min_df : int or float, default 10
        Forwarded to ``TfidfVectorizer(min_df=...)``.
    max_features : int or None, default 5000
        Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns
    -------
    tuple
        ``(tfidf, bow_train, bow_val)``: the fitted vectorizer and the
        transformed training and validation matrices.
    """
    tfidf = TfidfVectorizer(min_df=min_df, max_features=max_features)
    bow_train = tfidf.fit_transform(x_train_kfold)
    bow_val = tfidf.transform(x_val_kfold)

    return tfidf, bow_train, bow_val
    

def score(y_true, y_pred):
    """Compute the classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"recall"``, ``"precision"``, ``"f1"`` and
        ``"cm"`` (confusion matrix), in that order.
    """
    # (result key, metric function) pairs; the order fixes the dict's key order.
    metric_table = (
        ("accuracy", accuracy_score),
        ("recall", recall_score),
        ("precision", precision_score),
        ("f1", f1_score),
        ("cm", confusion_matrix),
    )
    return {name: compute(y_true, y_pred) for name, compute in metric_table}

In [99]:
# Grid search over k with K-fold cross-validation on the training set.
# `results` ends up with one entry per k, each holding the per-fold
# vectorizer, train/validation scores and fitted model.
results = []

for nn in tqdm(n_neighbors):

    results_fold = []

    for fold, (train_idx, val_idx) in tqdm(enumerate(kfolds.split(y_train)), total = n_splits):
        # Define subsets
        x_train_kfold, x_val_kfold, y_train_kfold, y_val_kfold = define_kfold_subsets(x_train, y_train, train_idx, val_idx)

        # Vectorizer subsets (fitted on this fold's training part only)
        tfidf, bow_train, bow_val = vectorizer(x_train_kfold, x_val_kfold)

        # CREATE KNN
        # A fresh estimator per fold: previously a single instance was refit in
        # every fold and stored by reference, so all stored "model" entries
        # pointed to the same object (fitted on the last fold) and did not
        # match their stored vectorizer.
        model = KNeighborsClassifier(n_neighbors = nn)

        # Train
        model.fit(bow_train, y_train_kfold)

        # Assess Performance
        y_train_kfold_pred = model.predict(bow_train)
        y_val_kfold_pred = model.predict(bow_val)

        results_fold.append({
            # vectorizer() builds a new TfidfVectorizer on every call, so the
            # object can be stored directly without a deep copy.
            "tfidf": tfidf,
            "fold": fold,
            "train": score(y_train_kfold, y_train_kfold_pred),
            "val": score(y_val_kfold, y_val_kfold_pred),
            "model": model,
        })
    results.append({
        "n_neighbors": nn,
        "results": results_fold
    })

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

### 5. Selecionar o modelo vencedor

In [100]:
# For every k, print mean ± std across folds of each scalar metric,
# first for the validation part and then for the training part.
for item in results:
    for split_key, header in (("val", "VAL Model: k = "), ("train", "TRAIN Model: k = ")):
        fold_result = [fold_entry[split_key] for fold_entry in item["results"]]

        # The confusion matrix is not a scalar, so it is left out of the summary.
        keys = [name for name in fold_result[0] if name != "cm"]

        fold_result_df = pd.DataFrame(fold_result)
        fold_result_df_mean = fold_result_df[keys].mean()
        fold_result_df_std = fold_result_df[keys].std()

        print(header, item['n_neighbors'])
        for k in keys:
            print(k, "%.2f" % fold_result_df_mean[k], "± %0.2f" % fold_result_df_std[k])
        print()

    # Separator between consecutive values of k.
    print("---")

VAL Model: k =  5
accuracy 0.74 ± 0.11
recall 0.54 ± 0.24
precision 0.94 ± 0.03
f1 0.66 ± 0.18

TRAIN Model: k =  5
accuracy 0.86 ± 0.11
recall 0.74 ± 0.21
precision 0.99 ± 0.00
f1 0.83 ± 0.15

---
VAL Model: k =  10
accuracy 0.89 ± 0.02
recall 0.83 ± 0.04
precision 0.97 ± 0.01
f1 0.89 ± 0.02

TRAIN Model: k =  10
accuracy 0.92 ± 0.02
recall 0.87 ± 0.04
precision 0.98 ± 0.01
f1 0.92 ± 0.02

---
VAL Model: k =  15
accuracy 0.93 ± 0.00
recall 0.92 ± 0.02
precision 0.95 ± 0.01
f1 0.94 ± 0.01

TRAIN Model: k =  15
accuracy 0.95 ± 0.00
recall 0.94 ± 0.01
precision 0.97 ± 0.01
f1 0.95 ± 0.00

---
VAL Model: k =  20
accuracy 0.93 ± 0.01
recall 0.91 ± 0.02
precision 0.96 ± 0.01
f1 0.94 ± 0.01

TRAIN Model: k =  20
accuracy 0.95 ± 0.00
recall 0.93 ± 0.01
precision 0.97 ± 0.00
f1 0.95 ± 0.00

---
VAL Model: k =  25
accuracy 0.95 ± 0.00
recall 0.94 ± 0.01
precision 0.96 ± 0.00
f1 0.95 ± 0.00

TRAIN Model: k =  25
accuracy 0.96 ± 0.00
recall 0.95 ± 0.01
precision 0.96 ± 0.00
f1 0.96 ± 0.00

---
VA

In [101]:
# Display the full nested `results` list: one entry per n_neighbors value,
# each holding the per-fold vectorizer, train/val scores and model.
results

[{'n_neighbors': 5,
  'results': [{'tfidf': TfidfVectorizer(max_features=5000, min_df=10),
    'fold': 0,
    'train': {'accuracy': 0.8643668489692891,
     'recall': 0.7580695953393166,
     'precision': 0.9845603271983641,
     'f1': 0.8565963882216885,
     'cm': array([[10917,   151],
            [ 3073,  9629]])},
    'val': {'accuracy': 0.6654888103651354,
     'recall': 0.39440603394091767,
     'precision': 0.9536474164133738,
     'f1': 0.5580257892396621,
     'cm': array([[2700,   61],
            [1927, 1255]])},
    'model': KNeighborsClassifier()},
   {'tfidf': TfidfVectorizer(max_features=5000, min_df=10),
    'fold': 1,
    'train': {'accuracy': 0.6824148085822466,
     'recall': 0.4131340935902477,
     'precision': 0.9837078651685394,
     'f1': 0.5818886734976461,
     'cm': array([[10968,    87],
            [ 7462,  5253]])},
    'val': {'accuracy': 0.6373885243143194,
     'recall': 0.32912590722625434,
     'precision': 0.9729477611940298,
     'f1': 0.4918651261