## Main Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## Data Preprocessing

In [None]:
# Load the competition files.
# The held-out frame must come from its own file: reading train.csv twice
# would make df_test a second copy of the training data.
# NOTE(review): assumes data/test.csv sits next to data/train.csv - confirm.
df_train  = pd.read_csv("data/train.csv")
df_sample = pd.read_csv("data/sample_submission.csv")
df_test   = pd.read_csv("data/test.csv")

# Target class names, listed in alphabetical order.
labels = ['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III' , 'Overweight_Level_I', 'Overweight_Level_II']

# 'id' is only a row identifier, so it is removed from both frames.
# Rebinding (instead of inplace=True) keeps the cell free of in-place mutation.
df_train = df_train.drop(columns='id')
df_test = df_test.drop(columns='id')

## Additional Features

In [None]:
# Engineered feature: Weight divided by Height squared
# (assumes kilograms and metres - TODO confirm units).
df_train = df_train.assign(BMI=df_train['Weight'] / df_train['Height'] ** 2)

In [None]:
# Partition the training columns by dtype; both lists drive the EDA plots
# and the one-hot encoding further down.
numerical_cols   = df_train.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = df_train.select_dtypes(include=[object]).columns.tolist()

# The histogram grids are drawn 5 panels wide, so the number of rows has to be
# ceil(n / 5). Floor division by 4 yields 0 rows for fewer than 4 columns and
# too few slots for counts such as 11 (2 rows * 5 = 10 < 11).
PLOT_GRID_COLS = 5

num_numerical_cols = len(numerical_cols)
# -(-a // b) is integer ceil division; max(1, ...) keeps the figure height positive.
num_numerical_rows = max(1, -(-num_numerical_cols // PLOT_GRID_COLS))

num_categorical_cols = len(categorical_cols)
num_categorical_rows = max(1, -(-num_categorical_cols // PLOT_GRID_COLS))

In [None]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix. drop() already returns a new frame,
# so no explicit copy is needed.
y = df_train['NObeyesdad']
x = df_train.drop(columns=['NObeyesdad'])

# One-hot encode every categorical predictor (the target is excluded);
# drop_first removes one redundant dummy per column.
categoricals_cols_no_result = [col for col in categorical_cols if col != "NObeyesdad"]
x = pd.get_dummies(x, columns=categoricals_cols_no_result, drop_first=True)

# A fixed seed makes every score in the notebook reproducible across
# Restart & Run All; stratifying keeps the class proportions equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# Histogram of the target column, with vertical tick labels so the
# class names do not overlap.
fig, ax = plt.subplots()
ax.hist(y)
ax.set_title('NObeyesdad Distribution')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

def get_stats(predictions, y_true=None):
  """Collect classification metrics for a set of predictions.

  Args:
    predictions: predicted labels, aligned with `y_true`.
    y_true: ground-truth labels. Defaults to the notebook-level `y_test`,
      which preserves the original single-argument call. Pass it explicitly
      when the labels were transformed (e.g. label-encoded).

  Returns:
    dict with keys 'acc_score', 'prec_score', 'recall_score', 'f1_score'
    (precision, recall and F1 are macro-averaged) and 'confusion_matrix'.
  """
  if y_true is None:
    y_true = y_test
  return {
    'acc_score': accuracy_score(y_true, predictions),
    'prec_score': precision_score(y_true, predictions, average='macro'),
    'recall_score': recall_score(y_true, predictions, average='macro'),
    'f1_score': f1_score(y_true, predictions, average='macro'),
    'confusion_matrix': confusion_matrix(y_true, predictions),
  }

In [None]:
# Disabled experiment: undersample every class down to the size of the rarest one.
# It is kept as a bare string literal, so none of the statements inside it run.
# NOTE(review): x and y are derived from df_train in an earlier cell, so
# re-enabling this code at this position would not change the train/test split;
# it would have to run before that cell. The lambda parameter is also named `x`,
# the same name as the feature matrix.
"""
Balancing the classes only decreases the accuracy

# Count the occurrences of each class in the 'NObeyesdad' column
class_counts = df_train['NObeyesdad'].value_counts()
print("Class counts before balancing:")
print(class_counts)

# Find the minimum count (least represented class)
min_count = class_counts.min()

# Balance the dataset by undersampling
df_train = df_train.groupby('NObeyesdad').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
"""

In [None]:
# Dimensions of the training frame as (n_rows, n_columns).
n_rows, n_columns = df_train.shape
print((n_rows, n_columns))

In [None]:
# Data-quality check on df_train (the training frame).

# total number of missing cells across all columns
mv = df_train.isna().to_numpy().sum()

# number of rows that exactly repeat an earlier row
dv = df_train.duplicated().sum()

# missing count first, duplicate count second, one per line
print(mv, dv, sep="\n")

In [None]:
# Summary statistics, transposed so each feature is one row:
# numeric columns first, then object (categorical) columns.
numeric_summary = df_train.describe(include=[np.number]).T
categorical_summary = df_train.describe(include=[object]).T
display(numeric_summary, categorical_summary)

In [None]:
# One histogram per numeric column, laid out on a grid five panels wide.
fig = plt.figure(figsize=(20, 6 * num_numerical_rows))
for position, column in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(num_numerical_rows, 5, position)
    ax.hist(df_train[column])
    ax.set_title(f'{column} Distribution')
    # tilt the tick labels so neighbouring values stay readable
    plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# One histogram per categorical (object) column, on a grid five panels wide.
fig = plt.figure(figsize=(20, 6 * num_categorical_rows))
for position, column in enumerate(categorical_cols, start=1):
    ax = fig.add_subplot(num_categorical_rows, 5, position)
    ax.hist(df_train[column])
    ax.set_title(f'{column} Distribution')
    # tilt the category names so they do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


In [None]:
def corr(data):
    """Draw `data` as an annotated heatmap on a new 12x10 figure.

    Cell values are printed with two decimals on the 'coolwarm' colour map.
    Nothing is returned; the figure is left as the current pyplot figure.
    """
    _, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(data, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
    ax.set_title('Correlation Matrix of Features')

# pairwise correlations between the numeric training columns
numeric_correlations = df_train[numerical_cols].corr()
corr(numeric_correlations)

## Decision Tree

### Without FS

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Cost-complexity-pruned decision tree trained on every feature.
# random_state pins the tie-breaking between equally good splits so the
# baseline is reproducible.
clf = DecisionTreeClassifier(ccp_alpha=0.01, random_state=42)
clf = clf.fit(x_train,y_train)

predictions = clf.predict(x_test)

# Report the baseline so the feature-selection run has something to be compared with.
print("Baseline accuracy (all features):", accuracy_score(y_test, predictions))

# Importances come back as a single unnamed column (label 0), sorted descending.
feature_importance = pd.DataFrame(clf.feature_importances_, index = x.columns).sort_values(0, ascending=False)
feature_importance.head(10).plot(kind='bar', title='Top 10 decision-tree feature importances')

### With FS

In [None]:
from sklearn.feature_selection import RFE

# Sweep the number of features kept by recursive feature elimination and
# remember the count that gives the best accuracy.
# NOTE(review): the count is chosen against the held-out test split, so
# max_acc is an optimistic estimate; prefer cross-validation on x_train.
max_acc = 0
best_features = 0
max_features = x.shape[1]

# The upper bound is inclusive so that "keep every feature" is also evaluated.
for i in range(1, max_features + 1):
  rfe = RFE(estimator=clf, n_features_to_select=i)
  rfe.fit(x_train, y_train)

  selected_features = x_train.columns[rfe.support_]
  print("Selected Features:", selected_features)

  # RFE works on its own clone of `clf` and leaves that clone fitted on the
  # surviving columns. Predicting through `rfe` therefore needs no second fit
  # and keeps the notebook-level `clf` as it was trained in the previous cell.
  predictions = rfe.predict(x_test)

  acc = accuracy_score(y_test, predictions)
  if acc > max_acc:
    max_acc = acc
    best_features = i

# Author's note from an earlier run: accuracy about .84 with 14 features (re-verify).
print(max_acc)
print(best_features)

## K-Nearest-Neighbours

### Without FS

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Standardise using statistics learned from the training part only, then
# reuse the same transformation for the test part.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# 8-nearest-neighbour classifier on the scaled features (fit returns the estimator).
knn = KNeighborsClassifier(n_neighbors=8).fit(x_train_scaled, y_train)

predictions = knn.predict(x_test_scaled)

### With FS

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# Forward sequential feature selection for KNN, sweeping the subset size.
# NOTE(review): the subset size is chosen against the held-out test split, so
# max_acc is optimistic. Each iteration also restarts the selection from
# scratch, which makes this cell expensive.
max_acc = 0
best_features = 0
max_features = x.shape[1]

# KNN is distance based, so the features must be standardised exactly as in the
# baseline cell. Putting the scaler inside a pipeline means every CV fold is
# scaled with statistics from its own training part. clone() copies the
# hyper-parameters of `knn` without touching its fitted state.
knn_scaled = make_pipeline(StandardScaler(), clone(knn))

# Upper bound stays exclusive: the selector is asked for fewer features than exist.
for i in range(1, max_features):
  sfs = SequentialFeatureSelector(
      estimator=knn_scaled,
      n_features_to_select=i,
      direction='forward',
      scoring='accuracy',
      cv=5,
      n_jobs=-1
  )

  sfs.fit(x_train, y_train)

  selected_features = x_train.columns[sfs.get_support()]
  print("Selected Features:", selected_features)

  x_train_sfs = sfs.transform(x_train)
  x_test_sfs = sfs.transform(x_test)

  # Fit a fresh copy so the notebook-level `knn` keeps the state it was given
  # in the baseline cell instead of ending up trained on a column subset.
  candidate = clone(knn_scaled).fit(x_train_sfs, y_train)

  predictions = candidate.predict(x_test_sfs)
  acc = accuracy_score(y_test, predictions)

  if acc > max_acc:
    max_acc = acc
    best_features = i

print(max_acc)
print(best_features)

Selected Features: Index(['BMI'], dtype='object')
Selected Features: Index(['Weight', 'BMI'], dtype='object')
Selected Features: Index(['Weight', 'BMI', 'Gender_Male'], dtype='object')
Selected Features: Index(['Age', 'Weight', 'BMI', 'Gender_Male'], dtype='object')
Selected Features: Index(['Age', 'Weight', 'BMI', 'Gender_Male', 'CALC_no'], dtype='object')
Selected Features: Index(['Age', 'Weight', 'FAF', 'BMI', 'Gender_Male', 'CALC_no'], dtype='object')
Selected Features: Index(['Age', 'Weight', 'FAF', 'BMI', 'Gender_Male', 'FAVC_yes', 'CALC_no'], dtype='object')
Selected Features: Index(['Age', 'Weight', 'CH2O', 'FAF', 'BMI', 'Gender_Male', 'FAVC_yes',
       'CALC_no'],
      dtype='object')
Selected Features: Index(['Age', 'Weight', 'CH2O', 'FAF', 'BMI', 'Gender_Male',
       'family_history_with_overweight_yes', 'FAVC_yes', 'CALC_no'],
      dtype='object')
Selected Features: Index(['Age', 'Weight', 'CH2O', 'FAF', 'BMI', 'Gender_Male',
       'family_history_with_overweight_yes',

## Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Support vector machines are not scale invariant, so the features are
# standardised first (statistics learned from the training part only).
svc_scaler = StandardScaler().fit(x_train)

svc = SVC()
svc.fit(svc_scaler.transform(x_train), y_train)

predictions = svc.predict(svc_scaler.transform(x_test))

### With FS (Support Vector Classifier)

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# Forward sequential feature selection for the SVC, sweeping the subset size.
# NOTE(review): the subset size is chosen against the held-out test split, so
# max_acc is optimistic. Each iteration also restarts the selection from
# scratch, which makes this cell expensive.
max_acc = 0
best_features = 0
max_features = x.shape[1]

# SVMs are sensitive to feature scale. Standardising inside a pipeline means
# every CV fold is scaled with statistics from its own training part.
# clone() copies the hyper-parameters of `svc` without its fitted state.
svc_scaled = make_pipeline(StandardScaler(), clone(svc))

# Upper bound stays exclusive: the selector is asked for fewer features than exist.
for i in range(1, max_features):
  sfs = SequentialFeatureSelector(
      estimator=svc_scaled,
      n_features_to_select=i,
      direction='forward',
      scoring='accuracy',
      cv=5,
      n_jobs=-1
  )

  sfs.fit(x_train, y_train)

  selected_features = x_train.columns[sfs.get_support()]
  print("Selected Features:", selected_features)

  x_train_sfs = sfs.transform(x_train)
  x_test_sfs = sfs.transform(x_test)

  # Fit a fresh copy so the notebook-level `svc` is not left trained on a
  # column subset after the loop.
  candidate = clone(svc_scaled).fit(x_train_sfs, y_train)

  predictions = candidate.predict(x_test_sfs)
  acc = accuracy_score(y_test, predictions)

  if acc > max_acc:
    max_acc = acc
    best_features = i
    print("Current best:" + str(best_features))

print(max_acc)
print(best_features)

## Neural Networks

In [None]:
from sklearn.metrics import mean_squared_error, ConfusionMatrixDisplay
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Standardise the features (statistics from the training part only).
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Map the string classes to integer codes; the encoder is fitted on the
# training labels and reused for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

y_test_encoded = label_encoder.transform(y_test)

# Take the display names from the encoder itself so they can never drift out
# of sync with the integer codes (a hand-written list could).
class_names = list(label_encoder.classes_)
class_ids = list(range(len(class_names)))

# learning_rate='adaptive' was dropped: that schedule is only used by the
# 'sgd' solver, so with 'adam' it had no effect.
# random_state fixes the weight initialisation and the early-stopping
# validation split so the reported score is reproducible.
model = MLPClassifier(
    hidden_layer_sizes=(250, 150, 100),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    max_iter=1000,
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=10,
    random_state=42,
)

model.fit(x_train_scaled, y_train_encoded)
y_pred = model.predict(x_test_scaled)
# NOTE(review): the class codes are nominal, so this MSE has no natural
# interpretation; the variable is kept only for backward compatibility.
mse = mean_squared_error(y_test_encoded, y_pred)

# Predicted vs. actual class codes; points on the green diagonal are correct.
plt.figure(figsize=(12, 8))
plt.subplot(2, 2, 1)
plt.scatter(y_test_encoded, y_pred, alpha=0.5, color="red", label="Predicted")
plt.scatter(y_test_encoded, y_test_encoded, alpha=0.5, color='blue', label='Actual')
plt.plot(y_test_encoded, y_test_encoded, color='green', linewidth=2)
plt.title('Neural Network Predicted vs. Actual Values')
plt.legend()
plt.show()

# Author's note from an earlier run: accuracy about 0.87 (re-verify).
print(accuracy_score(y_test_encoded,y_pred))
# Passing labels= pins the row order and keeps the report valid even if a
# class happens to be absent from the predictions.
print(classification_report(y_test_encoded, y_pred, labels=class_ids, target_names=class_names))

cm_display = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test_encoded, y_pred, labels=class_ids),
    display_labels=class_names,
)
fig, ax = plt.subplots(figsize=(10, 8))
cm_display.plot(ax=ax)

plt.xticks(rotation=90)
plt.tight_layout()
plt.show()