## Main Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## Data Preprocessing

In [None]:
# Load the competition files (paths are relative to the notebook's working directory).
df_train  = pd.read_csv("data/train.csv")
df_sample = pd.read_csv("data/sample_submission.csv")
# The test split has its own file; reading train.csv here would only
# duplicate the training data under a misleading name.
df_test   = pd.read_csv("data/test.csv")

# Target class names, listed in sorted order.
labels = ['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III' , 'Overweight_Level_I', 'Overweight_Level_II']

# drop id (not useful)
df_train.drop('id',axis=1,inplace=True)
df_test.drop('id',axis=1,inplace=True)

## Response Factory

In [None]:
import time

class Response:
  """Result of one model run: predictions, selected features, class probabilities and elapsed time."""

  def __init__(self, predictions, features, predict_proba, start_time):
    self.predictions = predictions
    self.features = features
    self.predict_proba = predict_proba
    # Elapsed wall-clock seconds between `start_time` and the creation of this object.
    finished_at = time.time()
    self.execution_time = finished_at - start_time

  def get_predictions(self):
    """Return the stored predictions."""
    return self.predictions

  def get_features(self):
    """Return the stored feature names."""
    return self.features

  def get_time(self):
    """Return the elapsed time in seconds computed at construction."""
    return self.execution_time

  def get_predict_proba(self):
    """Return the stored class-probability output."""
    return self.predict_proba

  def get_stats(self):
    """Forward predictions, features and elapsed time to `print_stats`."""
    predictions = self.get_predictions()
    features = self.get_features()
    elapsed = self.get_time()
    print_stats(predictions, features, elapsed)
    

## Additional Features

In [None]:
# Body-mass index: weight divided by squared height
# (presumably kg and metres -- TODO confirm the units of the source columns).
df_train['BMI'] = df_train['Weight'].div(df_train['Height'].pow(2))

In [None]:
# Split the column names by dtype: integer/float columns vs. object (string) columns.
numerical_cols   = df_train.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = df_train.select_dtypes(include=[object]).columns.tolist()

# The histogram grids in this notebook place 5 plots per row, so the number of
# rows must be ceil(n / 5). `-(-n // 5)` is integer ceiling division, and
# max(1, ...) keeps the grid valid for very short column lists.
num_numerical_cols = len(numerical_cols)
num_numerical_rows = max(1, -(-num_numerical_cols // 5))

num_categorical_cols = len(categorical_cols)
num_categorical_rows = max(1, -(-num_categorical_cols // 5))

## Data Splitting and Processing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize

# Target column and feature frame (`drop` already returns a new frame, so no copy is needed).
y = df_train['NObeyesdad']
x = df_train.drop(columns=['NObeyesdad'])

# One-hot encode every categorical feature; the target is excluded from the encoding.
categoricals_cols_no_result = [col for col in categorical_cols if col != "NObeyesdad"]
x = pd.get_dummies(x, columns=categoricals_cols_no_result, drop_first=True)

# A fixed seed makes the 80/20 split (and every metric derived from it)
# reproducible; stratifying keeps the class proportions equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42, stratify=y
)

# One-vs-rest indicator matrix of the test labels, one column per entry of `labels`.
y_test_bin = label_binarize(y_test, classes=labels)
n_classes = y_test_bin.shape[1]

# Fit the scaler on the training part only, then apply the same transform to the test part.
scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Stats Functions

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,ConfusionMatrixDisplay
import time

def print_stats(predictions, features, time):
  """Print classification metrics for `predictions` against `y_test` and plot the confusion matrix.

  Args:
    predictions: predicted labels, aligned with the global `y_test`.
    features: names of the selected features; nothing is printed for None or an empty collection.
    time: elapsed time in seconds (note: this parameter shadows the `time` module inside the function).
  """
  def as_percent(score):
    # Scale first and round afterwards: rounding before the multiplication
    # leaks float artefacts such as 85.21000000000001 into the output.
    return str(round(score * 100, 2)) + "%"

  print("Accuracy Score: " + as_percent(accuracy_score(y_test, predictions)))
  print("Precision Score: " + as_percent(precision_score(y_test, predictions, average="macro")))
  print("Recall Score: " + as_percent(recall_score(y_test, predictions, average='macro')))
  print("F1 Score: " + as_percent(f1_score(y_test, predictions, average='macro')))
  if features is not None and len(features) > 0:
    print("Selected Features: " + ','.join(features))
  print("Execution Time: " + str(round(time, 2)) + "s")

  cm = confusion_matrix(y_test, predictions, labels=labels)

  # Label the axes with the class names instead of bare indices 0..6.
  ConfusionMatrixDisplay(cm, display_labels=labels).plot(xticks_rotation=45)
  
def print_time(start):
  """Print the seconds elapsed since `start` (a `time.time()` timestamp), rounded to 4 decimals."""
  elapsed = round(time.time() - start, 4)
  print(f"Time spent: {elapsed}s")

## Duplicate and NA Count

In [None]:
# Total number of missing cells in the training frame (per-column counts, then summed).
missing_per_column = df_train.isna().sum()
mv = missing_per_column.sum()

# Number of rows in the training frame that duplicate an earlier row.
dv = df_train.duplicated().sum()

# Print both counts, one per line. These are counts for the training frame;
# the original note reports that there are no NAs or duplicates.
print(mv, dv, sep="\n")

## Features Distribution

In [None]:
# Summary statistics with one row per feature: numeric columns first, then object columns.
numeric_summary = df_train.describe(include=[np.number]).transpose()
object_summary = df_train.describe(include=[object]).transpose()
display(numeric_summary, object_summary)

In [None]:
# One histogram per numeric column, laid out on a grid with 5 plots per row.
plt.figure(figsize=(20, 6 * num_numerical_rows))
for slot, column_name in enumerate(numerical_cols, start=1):
    plt.subplot(num_numerical_rows, 5, slot)
    column_values = df_train[column_name]
    plt.hist(column_values)
    plt.title(f'{column_name} Distribution')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# One histogram (category counts) per object column, laid out on a grid with 5 plots per row.
plt.figure(figsize=(20, 6 * num_categorical_rows))
for slot, column_name in enumerate(categorical_cols, start=1):
    plt.subplot(num_categorical_rows, 5, slot)
    column_values = df_train[column_name]
    plt.hist(column_values)
    plt.title(f'{column_name} Distribution')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## Feature Correlation

In [None]:
def corr(data):
    """Draw `data` (a correlation matrix) as an annotated seaborn heatmap on a new 12x10 figure."""
    heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.figure(figsize=(12, 10))
    sns.heatmap(data, **heatmap_options)
    plt.title('Correlation Matrix of Features')
# Pairwise correlation of the numeric features, shown as a heatmap.
numeric_correlations = df_train[numerical_cols].corr()
corr(numeric_correlations)

## Parameter Tuning

## Decision Tree

### Without FS

In [None]:
from sklearn.tree import DecisionTreeClassifier
import time

def decision_tree_wfs() -> Response:
  """Fit a decision tree (ccp_alpha=0.01) on all scaled features and predict the test split.

  Returns:
    Response with the test predictions, an empty feature list (no feature
    selection is done here), the class probabilities and the start timestamp.
  """
  started_at = time.time()

  tree = DecisionTreeClassifier(ccp_alpha=0.01)
  tree.fit(x_train_scaled, y_train)

  test_predictions = tree.predict(x_test_scaled)
  test_proba = tree.predict_proba(x_test_scaled)

  # To inspect which features drive the tree, plot `tree.feature_importances_`
  # indexed by `x.columns` (e.g. the ten largest as a bar chart).
  return Response(test_predictions, [], test_proba, started_at)

### With FS

In [None]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
import time

def decision_tree_fs() -> Response:
  """Decision tree with recursive feature elimination (RFE) over every subset size.

  For each size from 1 up to and including the full feature count, RFE selects
  that many features on the training split, and the tree RFE fitted on them is
  scored on the test split. The most accurate subset is kept; on ties the
  smaller subset wins because only a strict improvement replaces it.

  NOTE(review): the subset size is chosen using test-split accuracy, so the
  reported score is optimistic.

  Returns:
    Response with the best predictions, the selected feature names, the class
    probabilities and the start timestamp.
  """
  start = time.time()

  clf = DecisionTreeClassifier(ccp_alpha=0.01)

  # Start below any reachable accuracy so the first candidate is always recorded.
  max_acc = -1.0
  max_features = x.shape[1]
  best_features_names = None
  best_predictions = None
  best_proba = None

  # Inclusive upper bound: the full feature set is a valid candidate too.
  for i in range(1, max_features + 1):
    rfe = RFE(estimator=clf, n_features_to_select=i).fit(x_train_scaled, y_train)

    selected_features = x_train.columns[rfe.support_]

    # RFE already holds an estimator fitted on the selected columns; its
    # predict/predict_proba reduce the input and delegate to that estimator,
    # so fitting the tree a second time is unnecessary.
    predictions = rfe.predict(x_test_scaled)
    predicted_proba = rfe.predict_proba(x_test_scaled)

    acc = accuracy_score(y_test, predictions)

    # Original author's note: beyond 3 features the accuracy does not change.
    if acc > max_acc:
      max_acc = acc
      best_features_names = selected_features
      best_predictions = predictions
      best_proba = predicted_proba

  return Response(best_predictions, best_features_names, best_proba, start)

## K-Nearest-Neighbours

### Finding Optimal K

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict, Counter


from sklearn.model_selection import train_test_split

number_of_runs = 20
number_of_neighbors = 25

# How often each k was the best of a run, and every accuracy observed per k.
top_k_counts = Counter()
k_accuracy_map = defaultdict(list)

for run in range(1, number_of_runs + 1):
    # KNN is deterministic, so scoring the same fixed split 20 times would give
    # 20 identical results. Each run therefore draws its own fit/validation
    # split (seeded by the run number) from the training portion only, which
    # also keeps the hold-out test split out of the tuning of k.
    run_x_fit, run_x_val, run_y_fit, run_y_val = train_test_split(
        x_train, y_train, test_size=0.20, random_state=run
    )
    run_scaler = StandardScaler()
    run_x_fit_scaled = run_scaler.fit_transform(run_x_fit)
    run_x_val_scaled = run_scaler.transform(run_x_val)

    k_accuracies = []

    for k in range(1, number_of_neighbors + 1):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(run_x_fit_scaled, run_y_fit)
        acc = accuracy_score(run_y_val, knn.predict(run_x_val_scaled))
        k_accuracies.append((k, acc))
        k_accuracy_map[k].append(acc)

    # max() returns the first maximum, i.e. the smallest k on ties.
    best_k, best_acc = max(k_accuracies, key=lambda tup: tup[1])
    top_k_counts[best_k] += 1

avg_accuracies = {k: np.mean(accs) for k, accs in k_accuracy_map.items()}

best_avg_k = max(avg_accuracies, key=avg_accuracies.get)
best_avg_acc = avg_accuracies[best_avg_k]
print(f"\nBest k by average accuracy: k = {best_avg_k}\n")

ks = sorted(avg_accuracies.keys())
accs = [avg_accuracies[k] for k in ks]

plt.figure(figsize=(10, 6))
plt.plot(ks, accs, marker='o')
plt.title("Average Accuracy for each k")
plt.xlabel("k")
plt.ylabel("Average Accuracy")
plt.grid(True)
plt.tight_layout()
plt.show()


### Without FS

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import time

def knn_wfs(num_neighbours) -> Response:
  """Fit a k-nearest-neighbours classifier on all scaled features and predict the test split.

  Args:
    num_neighbours: value passed as `n_neighbors` to the classifier.

  Returns:
    Response with the test predictions, an empty feature list, the class
    probabilities and the start timestamp.
  """
  started_at = time.time()

  classifier = KNeighborsClassifier(n_neighbors=num_neighbours)
  classifier.fit(x_train_scaled, y_train)

  test_predictions = classifier.predict(x_test_scaled)
  test_proba = classifier.predict_proba(x_test_scaled)

  return Response(test_predictions, [], test_proba, started_at)

### With FS

In [None]:
from sklearn.feature_selection import RFE
import time

def knn_fs(neighbour_number) -> Response:
  """K-nearest-neighbours with forward sequential feature selection over every subset size.

  RFE cannot be used here: it ranks features through the estimator's `coef_`
  or `feature_importances_`, and KNeighborsClassifier exposes neither, so
  `RFE.fit` raises ValueError. SequentialFeatureSelector instead scores
  candidate subsets by cross-validation on the training split and works with
  any estimator.

  NOTE(review): the subset size is chosen using test-split accuracy, so the
  reported score is optimistic. Re-running the selector for every size is
  expensive.

  Args:
    neighbour_number: value passed as `n_neighbors` to the classifier.

  Returns:
    Response with the best predictions, the selected feature names, the class
    probabilities and the start timestamp.
  """
  # Local import keeps this cell self-contained.
  from sklearn.feature_selection import SequentialFeatureSelector

  start = time.time()

  # Start below any reachable accuracy so the first candidate is always recorded.
  max_acc = -1.0
  max_features = x.shape[1]
  best_features_names = None
  best_predictions = None
  best_proba = None

  knn = KNeighborsClassifier(n_neighbors=neighbour_number)

  # SequentialFeatureSelector requires n_features_to_select < n_features,
  # hence the exclusive upper bound.
  for i in range(1, max_features):
    sfs = SequentialFeatureSelector(
        estimator=knn,
        n_features_to_select=i,
        direction="forward",
    ).fit(x_train_scaled, y_train)

    selected_features = x_train.columns[sfs.get_support()]

    x_train_sfs = sfs.transform(x_train_scaled)
    x_test_sfs = sfs.transform(x_test_scaled)

    # The selector only picks columns; the classifier itself still has to be
    # fitted on the reduced training data.
    knn.fit(x_train_sfs, y_train)

    predictions = knn.predict(x_test_sfs)
    predicted_proba = knn.predict_proba(x_test_sfs)
    acc = accuracy_score(y_test, predictions)

    if acc > max_acc:
      max_acc = acc
      best_predictions = predictions
      best_features_names = selected_features
      best_proba = predicted_proba

  return Response(best_predictions, best_features_names, best_proba, start)

## Support Vector Machines

### Without FS

In [None]:
from sklearn.svm import SVC
import time

def svc_wfs() -> Response:
  """Fit an SVC with default settings (probability estimates enabled) on all scaled features.

  Returns:
    Response with the test predictions, an empty feature list, the class
    probabilities and the start timestamp.
  """
  started_at = time.time()

  # probability=True is required for predict_proba below.
  classifier = SVC(probability=True)
  classifier.fit(x_train_scaled, y_train)

  test_predictions = classifier.predict(x_test_scaled)
  test_proba = classifier.predict_proba(x_test_scaled)

  return Response(test_predictions, [], test_proba, started_at)

### With FS

In [None]:
from sklearn.feature_selection import RFE
import time 

def svc_fs() -> Response:
  """Linear-kernel SVC with recursive feature elimination (RFE) over every subset size.

  For each size from 1 up to and including the full feature count, RFE selects
  that many features on the training split, and the SVC RFE fitted on them is
  scored on the test split. The most accurate subset is kept; on ties the
  smaller subset wins because only a strict improvement replaces it.

  NOTE(review): the subset size is chosen using test-split accuracy, so the
  reported score is optimistic.

  Returns:
    Response with the best predictions, the selected feature names, the class
    probabilities and the start timestamp.
  """
  start = time.time()

  # The linear kernel exposes `coef_`, which RFE uses to rank features;
  # probability=True is required for predict_proba.
  svc = SVC(kernel="linear", probability=True)

  # Start below any reachable accuracy so the first candidate is always recorded.
  max_acc = -1.0
  best_features_names = None
  max_features = x.shape[1]
  best_predictions = None
  best_proba = None

  # Inclusive upper bound: the full feature set is a valid candidate too.
  for i in range(1, max_features + 1):
    print(f"RFE: selecting {i} of {max_features} features")
    rfe = RFE(
        estimator=svc,
        n_features_to_select=i,
        step=1
    ).fit(x_train_scaled, y_train)

    selected_features = x_train.columns[rfe.get_support()]
    print("Selected features: " + str(selected_features))

    # RFE already holds an SVC fitted on the selected columns; reusing it
    # avoids a second, expensive probability-enabled fit per subset size.
    predictions = rfe.predict(x_test_scaled)
    predicted_proba = rfe.predict_proba(x_test_scaled)
    acc = accuracy_score(y_test, predictions)

    if acc > max_acc:
      max_acc = acc
      best_predictions = predictions
      best_features_names = selected_features
      best_proba = predicted_proba

  return Response(best_predictions, best_features_names, best_proba, start)

## Neural Networks

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import time

def neural_networks() -> Response:
    """Train a three-hidden-layer MLP on the scaled features and predict the test split.

    The labels are integer-encoded for training and for the diagnostic scatter
    plot; the returned predictions are decoded back to the original labels.

    Returns:
        Response with the decoded test predictions, an empty feature list, the
        class probabilities and the start timestamp.
    """
    start = time.time()

    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    model = MLPClassifier(
        hidden_layer_sizes=(128, 64, 32),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        batch_size='auto',
        learning_rate='adaptive',
        max_iter=1000,
        early_stopping=True,
        validation_fraction=0.2,
        n_iter_no_change=10,
    )

    model.fit(x_train_scaled, y_train_encoded)
    predictions_encoded = model.predict(x_test_scaled)
    predicted_proba = model.predict_proba(x_test_scaled)

    predictions = label_encoder.inverse_transform(predictions_encoded)

    # Response stops the clock when it is constructed, so build it before
    # plotting; otherwise the time spent drawing and showing the figure would
    # be reported as model time.
    response = Response(predictions, [], predicted_proba, start)

    # Encoded predicted vs. actual class per test sample; the green line marks perfect agreement.
    plt.figure(figsize=(12, 8))
    plt.subplot(2, 2, 1)
    plt.scatter(y_test_encoded, predictions_encoded, alpha=0.5, color="red", label="Predicted")
    plt.scatter(y_test_encoded, y_test_encoded, alpha=0.5, color='blue', label='Actual')
    plt.plot(y_test_encoded, y_test_encoded, color='green', linewidth=2)
    plt.title('Neural Network Predicted vs. Actual Values')
    plt.legend()
    plt.show()

    return response

## ROC

In [None]:
from sklearn.metrics import RocCurveDisplay

def ROC(response, model_name="DT"):
  """Plot one one-vs-rest ROC curve per class for a model's predicted probabilities.

  Column i of the global `y_test_bin` is paired with column i of the
  probabilities stored in `response`. Each call to
  `RocCurveDisplay.from_predictions` is made without an `ax`, as in the
  original code.

  Args:
    response: object whose `get_predict_proba()` returns an array with one column per class.
    model_name: prefix for the curve names; defaults to "DT" so existing
      calls produce the same legend text as before.
  """
  class_probabilities = response.get_predict_proba()

  for i in range(n_classes):
      RocCurveDisplay.from_predictions(
          y_test_bin[:, i],
          class_probabilities[:, i],
          name=f"{model_name} - Class {i}",
      )

## Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Re-fit the scaler on the training split and scale both splits.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Base estimators for the vote.
clf = DecisionTreeClassifier(ccp_alpha=0.01)
knn = KNeighborsClassifier(n_neighbors=6)
mlp_settings = dict(
    hidden_layer_sizes=(250, 150, 100),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='adaptive',
    max_iter=1000,
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=10,
)
mlp = MLPClassifier(**mlp_settings)
svc = SVC()

# Hard voting: the ensemble is built with voting="hard" over the four estimators.
voters = [("clf", clf), ("knn", knn), ("mlp", mlp), ("svc", svc)]
ensemble = VotingClassifier(estimators=voters, voting="hard")

ensemble.fit(x_train_scaled, y_train)
ensemble_accuracy = ensemble.score(x_test_scaled, y_test)
print(f"Accuracy of the ensemble: {round(ensemble_accuracy*100, 2)} %")

## Tests

In [None]:
# Manual runs of each model. The trailing numbers are presumably the accuracy (%)
# noted from an earlier run -- they are not recomputed here.
# decision_tree_wfs().get_stats() # 85.21
# decision_tree_fs().get_stats() # 85.21

# Per-class ROC curves for the decision tree trained on all features.
tree_response = decision_tree_wfs()
ROC(tree_response)

# knn_wfs(8).get_stats() # 78.47
# knn_fs(8).get_stats() # 87.14

# svc_wfs().get_stats() # 87.24
# svc_fs().get_stats() # 87.52

# neural_networks().get_stats() # 88.32%
