### Imports

In [173]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, recall_score, precision_score, f1_score,
                             confusion_matrix, roc_auc_score, roc_curve, RocCurveDisplay)

from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import copy as copy
import unidecode

import nltk
from nltk import tokenize
from string import punctuation

### 1. Carregar os dados

In [174]:
# Load the raw dataset from disk.
df = pd.read_csv('text.csv')

# Keep only the rows whose label is 0 or 1, on a fresh 0..n-1 index.
is_binary_label = df['label'].eq(0) | df['label'].eq(1)
df = df[is_binary_label].copy().reset_index(drop=True)

# Drop the first column by position.
# NOTE(review): presumably this is an index column saved with the CSV - confirm.
df = df.drop(columns=df.columns[0])

df.head()

Unnamed: 0,text,label
0,ive enjoyed being able to slouch about relax a...,0.0
1,i dont know i feel so lost,0.0
2,i was beginning to feel quite disheartened,0.0
3,i fear that they won t ever feel that deliciou...,1.0
4,i can still lose the weight without feeling de...,0.0


### Pré-Processamento

In [175]:
def df_lower(df_column):
    """Return a lower-cased version of a text column.

    Parameters
    ----------
    df_column : pandas.Series
        Column holding the texts to convert.

    Returns
    -------
    pandas.Series
        Series with the same index as ``df_column`` and every string
        lower-cased.

    Notes
    -----
    The vectorised ``.str`` accessor is used instead of a per-element
    ``apply``; missing values are passed through as missing instead of
    raising ``AttributeError``.
    """
    return df_column.str.lower()

def remove_accents(df_column):
    """Pass every text of ``df_column`` through ``unidecode.unidecode``.

    Returns a plain ``list`` with one converted string per element, in
    iteration order (not a Series).
    """
    transliterate = unidecode.unidecode
    return list(map(transliterate, df_column))

def remove_stopword(df, column_name):
    """Strip English stopwords and punctuation characters from a text column.

    Each text is split with NLTK's ``WordPunctTokenizer``; tokens found in
    the English stopword list or equal to a single character of
    ``string.punctuation`` are discarded, and the remaining tokens are
    re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column.
    column_name : str
        Name of the column in ``df`` holding the texts.

    Returns
    -------
    list of str
        One cleaned string per row of ``df[column_name]``, in row order.

    Notes
    -----
    NOTE(review): this relies on the NLTK "stopwords" corpus being available
    locally - confirm it is downloaded in the target environment.
    """
    # A set makes the per-token membership test constant-time; the original
    # list was scanned linearly for every token of every text.
    tokens_to_drop = set(nltk.corpus.stopwords.words("english"))
    tokens_to_drop.update(punctuation)  # adds each punctuation character

    tokenizer = tokenize.WordPunctTokenizer()

    return [
        ' '.join(
            token
            for token in tokenizer.tokenize(text)
            if token not in tokens_to_drop
        )
        for text in df[column_name]
    ]

# Run the cleaning steps in order, overwriting the 'text' column each time:
# lower-case -> transliterate accents -> drop stopwords/punctuation.
df['text'] = df_lower(df['text'])
df['text'] = remove_accents(df['text'])
df['text'] = remove_stopword(df, 'text')
df.head()

Unnamed: 0,text,label
0,ive enjoyed able slouch relax unwind frankly n...,0.0
1,dont know feel lost,0.0
2,beginning feel quite disheartened,0.0
3,fear ever feel delicious excitement christmas ...,1.0
4,still lose weight without feeling deprived,0.0


### 2. Separar entre treino e teste

In [176]:
# Features are the cleaned texts, targets are the binary labels.
x, y = df['text'], df['label']

# Stratified split with a fixed seed. test_size=0.99 holds out 99% of the
# rows for testing, so only 1% is used for training / cross-validation.
# NOTE(review): confirm this tiny training share is intentional.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.99,
    stratify=y,
    random_state=1,
)

### 3. Criar validação cruzada com o conjunto de treino + Criar as features com TF-IDF

In [177]:
# Candidate values of k for the KNN classifier: 5, 10, ..., 50.
n_neighbors = [*range(5, 51, 5)]

# Cross-validation splitter used on the training set.
n_splits = 5
kfolds = KFold(n_splits=n_splits)

In [178]:
# Functions

def define_kfold_subsets(x_train, y_train, train_idx, val_idx):
    """Slice features and targets into the train/validation parts of a fold.

    ``train_idx`` and ``val_idx`` are positional indices (applied through
    ``.iloc``) into ``x_train`` and ``y_train``.

    Returns the 4-tuple
    ``(x_train_kfold, x_val_kfold, y_train_kfold, y_val_kfold)``.
    """
    # Feature subsets (train part, validation part)
    features = (x_train.iloc[train_idx], x_train.iloc[val_idx])
    # Target subsets (train part, validation part)
    targets = (y_train.iloc[train_idx], y_train.iloc[val_idx])

    return (*features, *targets)
    

def vectorizer(x_train_kfold, x_val_kfold, min_df=10, max_features=5000):
    """Fit a TF-IDF vectorizer on the fold's training texts and apply it.

    The vectorizer is fitted on ``x_train_kfold`` only and then used to
    transform ``x_val_kfold``, so the validation texts do not influence the
    fitted vocabulary or weights.

    Parameters
    ----------
    x_train_kfold : iterable of str
        Training texts of the fold.
    x_val_kfold : iterable of str
        Validation texts of the fold.
    min_df : int or float, default 10
        Forwarded to ``TfidfVectorizer(min_df=...)``.
    max_features : int or None, default 5000
        Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns
    -------
    tuple
        ``(tfidf, bow_train, bow_val)``: the fitted vectorizer and the
        transformed training and validation matrices.
    """
    tfidf = TfidfVectorizer(min_df=min_df, max_features=max_features)
    bow_train = tfidf.fit_transform(x_train_kfold)
    bow_val = tfidf.transform(x_val_kfold)

    return tfidf, bow_train, bow_val
    

def score(y_true, y_pred):
    """Compute the classification metrics for one set of predictions.

    Returns a dict with the keys ``"accuracy"``, ``"recall"``,
    ``"precision"``, ``"f1"`` and ``"cm"`` (confusion matrix), each computed
    by the matching scikit-learn function called as ``fn(y_true, y_pred)``.
    """
    metric_functions = (
        ("accuracy", accuracy_score),
        ("recall", recall_score),
        ("precision", precision_score),
        ("f1", f1_score),
        ("cm", confusion_matrix),
    )
    return {name: fn(y_true, y_pred) for name, fn in metric_functions}

In [179]:
results = []

# The fold subsets and their TF-IDF features do not depend on the number of
# neighbours, so they are built once here instead of once per value of k.
fold_data = []
for train_idx, val_idx in kfolds.split(y_train):
    # Define subsets
    x_train_kfold, x_val_kfold, y_train_kfold, y_val_kfold = define_kfold_subsets(
        x_train, y_train, train_idx, val_idx
    )

    # Vectorize subsets (the vectorizer is fitted on the fold's training part only)
    tfidf, bow_train, bow_val = vectorizer(x_train_kfold, x_val_kfold)

    fold_data.append((tfidf, bow_train, bow_val, y_train_kfold, y_val_kfold))

for nn in tqdm(n_neighbors):

    results_fold = []

    for fold, (tfidf, bow_train, bow_val, y_train_kfold, y_val_kfold) in tqdm(
        enumerate(fold_data), total = n_splits
    ):
        # Create and train a fresh KNN for this (k, fold) pair
        model = KNeighborsClassifier(n_neighbors = nn)
        model.fit(bow_train, y_train_kfold)

        # Assess performance on the fold's training and validation parts
        y_train_kfold_pred = model.predict(bow_train)
        y_val_kfold_pred = model.predict(bow_val)

        # `model` is a new object on every iteration and the fitted
        # vectorizer is only read afterwards, so no deep copies are needed.
        results_fold.append({
            "tfidf": tfidf,
            "fold": fold,
            "train": score(y_train_kfold, y_train_kfold_pred),
            "val": score(y_val_kfold, y_val_kfold_pred),
            "model": model,
        })
    results.append({
        "n_neighbors": nn,
        "results": results_fold
    })

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

### 4. Avaliar resultados de treino

In [180]:
# Print mean ± std of every scalar metric across folds, for each k, first on
# the validation parts and then on the training parts of the folds.
for item in results:
    for split, label in (("val", "VAL"), ("train", "TRAIN")):
        fold_result = [i[split] for i in item["results"]]

        # "cm" holds a confusion matrix per fold, so it is left out of the
        # mean/std aggregation.
        keys = [k for k in fold_result[0].keys() if k != "cm"]

        fold_result_df = pd.DataFrame(fold_result)
        fold_result_df_mean = fold_result_df[keys].mean()
        fold_result_df_std = fold_result_df[keys].std()

        print(f"{label} Model: k = ", item['n_neighbors'])
        for k in keys:
            print(k, "%.2f"%fold_result_df_mean[k], "± %0.2f"%fold_result_df_std[k])
        print()
    print("---")

VAL Model: k =  5
accuracy 0.51 ± 0.04
recall 0.62 ± 0.21
precision 0.55 ± 0.11
f1 0.56 ± 0.09

TRAIN Model: k =  5
accuracy 0.58 ± 0.01
recall 0.69 ± 0.21
precision 0.61 ± 0.05
f1 0.63 ± 0.08

---
VAL Model: k =  10
accuracy 0.52 ± 0.04
recall 0.37 ± 0.07
precision 0.59 ± 0.06
f1 0.45 ± 0.05

TRAIN Model: k =  10
accuracy 0.57 ± 0.02
recall 0.43 ± 0.06
precision 0.64 ± 0.03
f1 0.51 ± 0.05

---
VAL Model: k =  15
accuracy 0.52 ± 0.05
recall 0.45 ± 0.08
precision 0.56 ± 0.07
f1 0.50 ± 0.07

TRAIN Model: k =  15
accuracy 0.56 ± 0.02
recall 0.52 ± 0.06
precision 0.60 ± 0.02
f1 0.56 ± 0.04

---
VAL Model: k =  20
accuracy 0.54 ± 0.04
recall 0.47 ± 0.09
precision 0.58 ± 0.05
f1 0.52 ± 0.07

TRAIN Model: k =  20
accuracy 0.56 ± 0.02
recall 0.50 ± 0.07
precision 0.60 ± 0.03
f1 0.55 ± 0.04

---
VAL Model: k =  25
accuracy 0.51 ± 0.06
recall 0.62 ± 0.24
precision 0.54 ± 0.07
f1 0.56 ± 0.12

TRAIN Model: k =  25
accuracy 0.55 ± 0.03
recall 0.67 ± 0.19
precision 0.57 ± 0.03
f1 0.60 ± 0.07

---
VA

### 5. Avaliar resultados de teste

In [181]:
# Score every stored (k, fold) model on the held-out test set.
results_test = []

for n_knn in results:
    results_fold = []
    for kfold in n_knn['results']:
        # Use the vectorizer that was fitted together with this fold's model
        bow_test = kfold['tfidf'].transform(x_test)
        y_test_pred = kfold['model'].predict(bow_test)
        results_fold.append(
            dict(fold=kfold['fold'], test=score(y_test, y_test_pred))
        )
    results_test.append(
        dict(n_neighbors=n_knn['n_neighbors'], results=results_fold)
    )

In [187]:
# Display the raw test metrics: one entry per n_neighbors, each holding one result per fold.
results_test

[{'n_neighbors': 5,
  'results': [{'fold': 0,
    'test': {'accuracy': 0.5296021321149819,
     'recall': 0.7746858625426057,
     'precision': 0.541998861047836,
     'f1': 0.6377819194605574,
     'cm': array([[ 4246, 12868],
            [ 4429, 15228]])}},
   {'fold': 1,
    'test': {'accuracy': 0.49609746811345895,
     'recall': 0.4576486747723457,
     'precision': 0.5334440227703985,
     'f1': 0.49264806549656365,
     'cm': array([[ 9246,  7868],
            [10661,  8996]])}},
   {'fold': 2,
    'test': {'accuracy': 0.5278344347447717,
     'recall': 0.8214376557969171,
     'precision': 0.5382512750425014,
     'f1': 0.6503544385371355,
     'cm': array([[ 3262, 13852],
            [ 3510, 16147]])}},
   {'fold': 3,
    'test': {'accuracy': 0.5378151260504201,
     'recall': 0.6950704583608892,
     'precision': 0.5539652935452481,
     'f1': 0.616547460571738,
     'cm': array([[ 6113, 11001],
            [ 5994, 13663]])}},
   {'fold': 4,
    'test': {'accuracy': 0.4953631