### Imports

In [210]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, recall_score, precision_score, f1_score,
                             confusion_matrix, roc_auc_score, roc_curve, RocCurveDisplay)

from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import copy as copy
import unidecode

import nltk
from nltk import tokenize
from string import punctuation

### 1. Carregar os dados

In [211]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv('text.csv')

# Keep only the rows whose label is 0 or 1, then renumber the rows from zero.
label_is_zero = df['label'] == 0
label_is_one = df['label'] == 1
df = df[label_is_zero | label_is_one].copy().reset_index(drop=True)

# Discard the first column of the file (presumably a saved row index - confirm
# against the CSV).
df = df.drop(columns=df.columns[0])

df.head()

Unnamed: 0,text,label
0,ive enjoyed being able to slouch about relax a...,0.0
1,i dont know i feel so lost,0.0
2,i was beginning to feel quite disheartened,0.0
3,i fear that they won t ever feel that deliciou...,1.0
4,i can still lose the weight without feeling de...,0.0


### Pré-Processamento

In [212]:
def df_lower(df_column):
    """Return the text column with every string converted to lower case.

    Parameters
    ----------
    df_column : pandas.Series
        Column holding the texts.

    Returns
    -------
    pandas.Series
        Lower-cased texts with the same index as the input. Missing entries
        stay missing instead of raising ``AttributeError``.
    """
    # The vectorised ``.str`` accessor skips missing values, whereas calling
    # ``x.lower()`` on each element fails on the first NaN/None.
    return df_column.str.lower()

def remove_accents(df_column):
    """Run ``unidecode.unidecode`` over every text in the column.

    Parameters
    ----------
    df_column : iterable of str
        Texts to process, e.g. a pandas Series.

    Returns
    -------
    list of str
        One processed text per input element, in iteration order. Note that
        this is a plain list, not a Series.
    """
    return list(map(unidecode.unidecode, df_column))

def remove_stopword(df, column_name):
    """Remove English stop words and punctuation tokens from a text column.

    Each text is tokenised with NLTK's ``WordPunctTokenizer``. Tokens that are
    in NLTK's English stop-word list, or that are a single character from
    ``string.punctuation``, are dropped; the remaining tokens are joined back
    together with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column.
    column_name : str
        Name of the column holding the texts.

    Returns
    -------
    list of str
        One cleaned text per row, in row order.

    Notes
    -----
    ``nltk.corpus.stopwords`` needs the "stopwords" corpus to be available
    locally (``nltk.download("stopwords")``).
    """
    # A set makes each membership test constant-time; the stop words were
    # previously kept in a list that was scanned once per token.
    blocked_tokens = set(nltk.corpus.stopwords.words("english"))
    blocked_tokens.update(punctuation)  # adds each punctuation character

    tokenizer = tokenize.WordPunctTokenizer()

    return [
        ' '.join(
            token
            for token in tokenizer.tokenize(text)
            if token not in blocked_tokens
        )
        for text in df[column_name]
    ]

# Clean the text column step by step: lower-case, run unidecode, then drop
# stop words and punctuation tokens.
df['text'] = df_lower(df['text'])
df['text'] = remove_accents(df['text'])
df['text'] = remove_stopword(df, 'text')
df.head()

Unnamed: 0,text,label
0,ive enjoyed able slouch relax unwind frankly n...,0.0
1,dont know feel lost,0.0
2,beginning feel quite disheartened,0.0
3,fear ever feel delicious excitement christmas ...,1.0
4,still lose weight without feeling deprived,0.0


### 2. Separar entre treino e teste

In [213]:
# Inputs are the cleaned texts; targets are the labels.
x, y = df['text'], df['label']

# Hold out 20% of the rows for testing, stratified on the label, with a fixed
# seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

### 3. Criar validação cruzada com o conjunto de treino + Criar as features com TF-IDF

In [214]:
# Number of cross-validation folds.
n_splits = 5

# Fold generator used to split the training set.
kfolds = KFold(n_splits=n_splits)

# Candidate values of k for KNN: 5, 10, ..., 50.
n_neighbors = [5 * step for step in range(1, 11)]

In [215]:
# Functions

def define_kfold_subsets(x_train, y_train, train_idx, val_idx):
    """Slice the training data into one fold's training and validation parts.

    Parameters
    ----------
    x_train : pandas.Series
        Inputs of the full training set.
    y_train : pandas.Series
        Targets of the full training set.
    train_idx : sequence of int
        Positional indices of the rows used for fitting in this fold.
    val_idx : sequence of int
        Positional indices of the rows used for validation in this fold.

    Returns
    -------
    tuple
        ``(x_train_kfold, x_val_kfold, y_train_kfold, y_val_kfold)``.
    """
    # Selection is positional (``iloc``), so the same index arrays pick
    # matching rows from the inputs and from the targets.
    fold_inputs = (x_train.iloc[train_idx], x_train.iloc[val_idx])
    fold_targets = (y_train.iloc[train_idx], y_train.iloc[val_idx])

    return (*fold_inputs, *fold_targets)
    

def vectorizer(x_train_kfold, x_val_kfold, min_df=10, max_features=5000):
    """Fit a TF-IDF vectorizer on the training texts and transform both subsets.

    The vectorizer is fitted on ``x_train_kfold`` only; ``x_val_kfold`` is
    transformed with the already-fitted vocabulary and weights, so nothing is
    learned from the validation texts.

    Parameters
    ----------
    x_train_kfold : iterable of str
        Texts used to fit the vectorizer.
    x_val_kfold : iterable of str
        Texts that are only transformed.
    min_df : int or float, default 10
        Passed to ``TfidfVectorizer(min_df=...)``.
    max_features : int or None, default 5000
        Passed to ``TfidfVectorizer(max_features=...)``.

    Returns
    -------
    tuple
        ``(tfidf, bow_train, bow_val)``: the fitted vectorizer and the
        transformed training and validation texts.
    """
    tfidf = TfidfVectorizer(min_df=min_df, max_features=max_features)
    bow_train = tfidf.fit_transform(x_train_kfold)
    bow_val = tfidf.transform(x_val_kfold)

    return tfidf, bow_train, bow_val
    

def score(y_true, y_pred):
    """Compute the classification metrics reported throughout the notebook.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"recall"``, ``"precision"``, ``"f1"`` and
        ``"cm"`` (the confusion matrix), in that order.
    """
    # (result key, metric function) pairs; every metric is called with the
    # same two arguments.
    metric_table = (
        ("accuracy", accuracy_score),
        ("recall", recall_score),
        ("precision", precision_score),
        ("f1", f1_score),
        ("cm", confusion_matrix),
    )
    return {key: metric(y_true, y_pred) for key, metric in metric_table}

In [216]:
# Prepare every fold once. The fold indices and the TF-IDF features do not
# depend on n_neighbors, so they are computed here instead of being rebuilt
# for each candidate k.
prepared_folds = []
for fold, (train_idx, val_idx) in enumerate(kfolds.split(x_train)):
    # Define subsets
    x_train_kfold, x_val_kfold, y_train_kfold, y_val_kfold = define_kfold_subsets(x_train, y_train, train_idx, val_idx)

    # Vectorize subsets (fitted on the fold's training part only)
    tfidf, bow_train, bow_val = vectorizer(x_train_kfold, x_val_kfold)

    prepared_folds.append((fold, tfidf, bow_train, bow_val, y_train_kfold, y_val_kfold))

results = []

for nn in tqdm(n_neighbors):

    results_fold = []

    for fold, tfidf, bow_train, bow_val, y_train_kfold, y_val_kfold in tqdm(prepared_folds, total = n_splits):
        # Create and train a fresh KNN for this (k, fold) pair
        model = KNeighborsClassifier(n_neighbors = nn)
        model.fit(bow_train, y_train_kfold)

        # Assess performance on the data it was fitted on and on the held-out part
        y_train_kfold_pred = model.predict(bow_train)
        y_val_kfold_pred = model.predict(bow_val)

        # The model is new on every iteration and the vectorizer is never
        # modified after fitting, so both are stored directly (no deepcopy).
        # Entries with the same fold number share one fitted vectorizer.
        results_fold.append({
            "tfidf": tfidf,
            "fold": fold,
            "train": score(y_train_kfold, y_train_kfold_pred),
            "val": score(y_val_kfold, y_val_kfold_pred),
            "model": model,
        })
    results.append({
        "n_neighbors": nn,
        "results": results_fold
    })

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

### 4. Avaliar resultados de treino e validação

In [217]:
def _print_fold_summary(title, item, split):
    """Print the mean and standard deviation of one split's metrics over the folds.

    Parameters
    ----------
    title : str
        Header text printed before the n_neighbors value.
    item : dict
        One entry of ``results``: ``{"n_neighbors": ..., "results": [...]}``.
    split : str
        Key of the per-fold metrics dict to summarise (``"val"`` or ``"train"``).
    """
    fold_result = [fold_entry[split] for fold_entry in item["results"]]

    # The confusion matrix is an array, so it is left out of the mean/std summary.
    keys = [key for key in fold_result[0] if key != "cm"]

    fold_result_df = pd.DataFrame(fold_result)[keys]
    fold_result_df_mean = fold_result_df.mean()
    fold_result_df_std = fold_result_df.std()

    print(title, item['n_neighbors'])
    for key in keys:
        print(key, "%.2f"%fold_result_df_mean[key], "± %0.2f"%fold_result_df_std[key])


# Report validation and training metrics side by side for every candidate k.
for item in results:
    _print_fold_summary("VAL Model: k = ", item, "val")
    print()

    _print_fold_summary("TRAIN Model: k = ", item, "train")
    print("")
    print("---")

VAL Model: k =  5
accuracy 0.74 ± 0.11
recall 0.54 ± 0.24
precision 0.94 ± 0.03
f1 0.66 ± 0.18

TRAIN Model: k =  5
accuracy 0.86 ± 0.11
recall 0.74 ± 0.21
precision 0.99 ± 0.00
f1 0.83 ± 0.15

---
VAL Model: k =  10
accuracy 0.89 ± 0.02
recall 0.83 ± 0.04
precision 0.97 ± 0.01
f1 0.89 ± 0.02

TRAIN Model: k =  10
accuracy 0.92 ± 0.02
recall 0.87 ± 0.04
precision 0.98 ± 0.01
f1 0.92 ± 0.02

---
VAL Model: k =  15
accuracy 0.93 ± 0.00
recall 0.92 ± 0.02
precision 0.95 ± 0.01
f1 0.94 ± 0.01

TRAIN Model: k =  15
accuracy 0.95 ± 0.00
recall 0.94 ± 0.01
precision 0.97 ± 0.01
f1 0.95 ± 0.00

---
VAL Model: k =  20
accuracy 0.93 ± 0.01
recall 0.91 ± 0.02
precision 0.96 ± 0.01
f1 0.94 ± 0.01

TRAIN Model: k =  20
accuracy 0.95 ± 0.00
recall 0.93 ± 0.01
precision 0.97 ± 0.00
f1 0.95 ± 0.00

---
VAL Model: k =  25
accuracy 0.95 ± 0.00
recall 0.94 ± 0.01
precision 0.96 ± 0.00
f1 0.95 ± 0.00

TRAIN Model: k =  25
accuracy 0.96 ± 0.00
recall 0.95 ± 0.01
precision 0.96 ± 0.00
f1 0.96 ± 0.00

---
VA

### 5. Avaliar resultados de teste

In [218]:
# Score every stored (n_neighbors, fold) model on the held-out test split,
# using the TF-IDF vectorizer that was fitted for that same fold.
results_test = []

for knn_entry in tqdm(results):
    fold_scores = [
        {
            "fold": fold_entry['fold'],
            "test": score(
                y_test,
                fold_entry['model'].predict(fold_entry['tfidf'].transform(x_test)),
            ),
        }
        for fold_entry in tqdm(knn_entry['results'])
    ]
    results_test.append({
        "n_neighbors": knn_entry['n_neighbors'],
        "results": fold_scores
    })

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [221]:
# Inspect the per-fold test metrics stored for the first n_neighbors candidate.
results_test[0]

{'n_neighbors': 5,
 'results': [{'fold': 0,
   'test': {'accuracy': 0.6614618387400727,
    'recall': 0.38907076303198185,
    'precision': 0.945532435740514,
    'f1': 0.5512934879571811,
    'cm': array([[3369,   89],
           [2426, 1545]])}},
  {'fold': 1,
   'test': {'accuracy': 0.6341364921254543,
    'recall': 0.3268698060941828,
    'precision': 0.9664929262844378,
    'f1': 0.4885208882197968,
    'cm': array([[3413,   45],
           [2673, 1298]])}},
  {'fold': 2,
   'test': {'accuracy': 0.8347018441243774,
    'recall': 0.716192394862755,
    'precision': 0.965704584040747,
    'f1': 0.8224407171775593,
    'cm': array([[3357,  101],
           [1127, 2844]])}},
  {'fold': 3,
   'test': {'accuracy': 0.8664692421591063,
    'recall': 0.8728280030219089,
    'precision': 0.8768024285352897,
    'f1': 0.8748107016658253,
    'cm': array([[2971,  487],
           [ 505, 3466]])}},
  {'fold': 4,
   'test': {'accuracy': 0.6594427244582043,
    'recall': 0.3870561571392596,
    

In [224]:
# Summarise the test-set metrics across the fold models for each n_neighbors value.
for item in results_test:
    fold_result = [i["test"] for i in item["results"]]

    # The confusion matrix is an array, so it is left out of the mean/std summary.
    keys = [k for k in fold_result[0] if k != "cm"]

    fold_result_df = pd.DataFrame(fold_result)
    fold_result_df_mean = fold_result_df[keys].mean()
    fold_result_df_std = fold_result_df[keys].std()

    # These are test-set scores, so the header says TEST rather than VAL.
    print("TEST Model: k = ", item['n_neighbors'])
    for k in keys:
        print(k, "%.2f"%fold_result_df_mean[k], "± %0.2f"%fold_result_df_std[k])
    print()

VAL Model: k =  5
accuracy 0.73 ± 0.11
recall 0.54 ± 0.24
precision 0.94 ± 0.04
f1 0.66 ± 0.18

VAL Model: k =  10
accuracy 0.89 ± 0.02
recall 0.83 ± 0.04
precision 0.97 ± 0.01
f1 0.89 ± 0.02

VAL Model: k =  15
accuracy 0.94 ± 0.01
recall 0.92 ± 0.02
precision 0.95 ± 0.01
f1 0.94 ± 0.01

VAL Model: k =  20
accuracy 0.94 ± 0.00
recall 0.92 ± 0.01
precision 0.96 ± 0.01
f1 0.94 ± 0.00

VAL Model: k =  25
accuracy 0.95 ± 0.00
recall 0.94 ± 0.01
precision 0.96 ± 0.01
f1 0.95 ± 0.00

VAL Model: k =  30
accuracy 0.95 ± 0.00
recall 0.94 ± 0.01
precision 0.96 ± 0.01
f1 0.95 ± 0.00

VAL Model: k =  35
accuracy 0.95 ± 0.00
recall 0.95 ± 0.01
precision 0.96 ± 0.00
f1 0.95 ± 0.00

VAL Model: k =  40
accuracy 0.95 ± 0.00
recall 0.95 ± 0.00
precision 0.96 ± 0.00
f1 0.96 ± 0.00

VAL Model: k =  45
accuracy 0.95 ± 0.00
recall 0.96 ± 0.00
precision 0.96 ± 0.01
f1 0.96 ± 0.00

VAL Model: k =  50
accuracy 0.96 ± 0.00
recall 0.95 ± 0.00
precision 0.96 ± 0.00
f1 0.96 ± 0.00

