In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import catboost
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Mount Google Drive inside the Colab runtime; the submission CSV produced at
# the end of the notebook is written beneath this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
# Read the train and test splits of the obesity dataset from the current
# working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train-obesity.csv", "test-obesity.csv")
)

In [None]:
# Set the row identifiers aside before removing them; test_ids is reused when
# the submission file is assembled.
train_ids, test_ids = df_train['id'], df_test['id']

# Drop the identifier column from both frames so it is not used as a feature.
df_train = df_train.drop(columns=['id'])
df_test = df_test.drop(columns=['id'])

# Store Categorical Features

In [None]:
# Categorical Features
# Every non-numeric column is treated as categorical; the target column
# 'NObeyesdad' is excluded so that only model inputs remain.
cat_feat = df_train.select_dtypes(exclude=['number']).columns.tolist()
cat_feat_x = [col for col in cat_feat if col != 'NObeyesdad']

# Look the positions up among the feature columns only (target removed): the
# indices are handed to CatBoost together with the target-free x_train, and
# positions taken from df_train are only correct when the target happens to be
# the last column.
feature_columns = df_train.columns[~df_train.columns.isin(['NObeyesdad'])]
cat_feat_indices = [feature_columns.get_loc(col) for col in cat_feat_x]

## X and Y Split

In [None]:
# Separate the target column from the predictors of the training frame.
y_train = df_train['NObeyesdad']
x_train = df_train.drop(columns=['NObeyesdad'], errors='ignore')

# Apply the same column filter to the test frame; errors='ignore' makes this a
# no-op when the target column is not present there.
x_test = df_test.drop(columns=['NObeyesdad'], errors='ignore')

# Boosting


In [None]:
# Catboost
# Exhaustive grid search over number of trees, tree depth and learning rate,
# scored by 3-fold cross-validated accuracy on the full feature set.
param_grid = {
    'iterations': [50, 100, 150],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
}

# verbose=False keeps CatBoost from printing one line per boosting iteration,
# which floods the cell output; this matches the second grid search further down.
cat = CatBoostClassifier(
    cat_features=cat_feat_indices,
    loss_function='MultiClass',
    eval_metric="Accuracy",
    verbose=False,
)
grid_search = GridSearchCV(estimator=cat, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(x_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

0:	learn: 0.7581174	total: 144ms	remaining: 21.4s
1:	learn: 0.8070623	total: 277ms	remaining: 20.5s
2:	learn: 0.8274882	total: 413ms	remaining: 20.2s
3:	learn: 0.8338954	total: 547ms	remaining: 20s
4:	learn: 0.8464688	total: 693ms	remaining: 20.1s
5:	learn: 0.8567299	total: 822ms	remaining: 19.7s
6:	learn: 0.8607766	total: 961ms	remaining: 19.6s
7:	learn: 0.8645823	total: 1.1s	remaining: 19.6s
8:	learn: 0.8682917	total: 1.24s	remaining: 19.5s
9:	learn: 0.8720975	total: 1.38s	remaining: 19.4s
10:	learn: 0.8757587	total: 1.51s	remaining: 19.2s
11:	learn: 0.8765777	total: 1.66s	remaining: 19.1s
12:	learn: 0.8806243	total: 1.82s	remaining: 19.2s
13:	learn: 0.8815878	total: 1.96s	remaining: 19s
14:	learn: 0.8834666	total: 2.12s	remaining: 19s
15:	learn: 0.8850082	total: 2.26s	remaining: 18.9s
16:	learn: 0.8877059	total: 2.4s	remaining: 18.8s
17:	learn: 0.8892475	total: 2.54s	remaining: 18.6s
18:	learn: 0.8905482	total: 2.69s	remaining: 18.5s
19:	learn: 0.8917044	total: 2.84s	remaining: 18.5

In [None]:
# Train CatBoost on the full training set with a fixed configuration
# (depth 8, 150 iterations, learning rate 0.2).
# NOTE(review): these values are hard-coded rather than read from best_params -
# presumably they are the result of the grid search above; confirm.
best_cat_settings = dict(
    cat_features=cat_feat_indices,
    loss_function='MultiClass',
    eval_metric="Accuracy",
    depth=8,
    iterations=150,
    learning_rate=0.2,
    verbose=False,
)
best_cat = CatBoostClassifier(**best_cat_settings)
best_cat.fit(x_train, y_train)

0:	learn: 0.7581174	total: 165ms	remaining: 24.6s
1:	learn: 0.8070623	total: 400ms	remaining: 29.6s
2:	learn: 0.8274882	total: 665ms	remaining: 32.6s
3:	learn: 0.8338954	total: 916ms	remaining: 33.4s
4:	learn: 0.8464688	total: 1.16s	remaining: 33.6s
5:	learn: 0.8567299	total: 1.4s	remaining: 33.5s
6:	learn: 0.8607766	total: 1.59s	remaining: 32.5s
7:	learn: 0.8645823	total: 1.74s	remaining: 30.8s
8:	learn: 0.8682917	total: 1.87s	remaining: 29.3s
9:	learn: 0.8720975	total: 2.01s	remaining: 28.1s
10:	learn: 0.8757587	total: 2.14s	remaining: 27.1s
11:	learn: 0.8765777	total: 2.28s	remaining: 26.3s
12:	learn: 0.8806243	total: 2.44s	remaining: 25.7s
13:	learn: 0.8815878	total: 2.61s	remaining: 25.3s
14:	learn: 0.8834666	total: 2.76s	remaining: 24.8s
15:	learn: 0.8850082	total: 2.91s	remaining: 24.4s
16:	learn: 0.8877059	total: 3.05s	remaining: 23.9s
17:	learn: 0.8892475	total: 3.2s	remaining: 23.4s
18:	learn: 0.8905482	total: 3.33s	remaining: 23s
19:	learn: 0.8917044	total: 3.48s	remaining: 

<catboost.core.CatBoostClassifier at 0x7918a1720730>

In [None]:
# Pair each input column with the importance score reported by the fitted
# model and list the features from most to least important.
feature_importance = best_cat.get_feature_importance()
importance_df = pd.DataFrame(
    {'Feature': x_train.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

print(importance_df)

                           Feature  Importance
3                           Weight   41.922776
0                           Gender   11.019532
2                           Height   10.010081
6                             FCVC    6.266600
1                              Age    5.896892
14                            CALC    4.558131
12                             FAF    3.638892
13                             TUE    3.573506
10                            CH2O    3.048229
7                              NCP    2.694437
15                          MTRANS    2.362774
8                             CAEC    2.072633
4   family_history_with_overweight    1.542637
5                             FAVC    1.075527
11                             SCC    0.300208
9                            SMOKE    0.017145


In [None]:
# SMOKE has the lowest importance score, so retrain without it and check
# whether the accuracy on Kaggle improves.
dropped_features = ['SMOKE']
x_train_sel = x_train.drop(columns=dropped_features)
x_test_sel = x_test.drop(columns=dropped_features)

# Recompute the categorical column positions against the reduced frame.
cat_feat_indices_sel = [
    x_train_sel.columns.get_loc(col)
    for col in cat_feat_x
    if col not in dropped_features
]

best_cat = CatBoostClassifier(
    cat_features=cat_feat_indices_sel,
    loss_function='MultiClass',
    eval_metric="Accuracy",
    depth=8,
    iterations=150,
    learning_rate=0.2,
    verbose=False,
)
best_cat.fit(x_train_sel, y_train)

# Result: accuracy score improves from 0.90281 to 0.90426

In [None]:
# Catboost
param_grid = {'iterations': [50, 100, 150,200],
    'depth': [4, 6, 8,10],
    'learning_rate': [0.01, 0.1, 0.2, .3]}
cat = CatBoostClassifier(cat_features = cat_feat_indices_sel,loss_function = 'MultiClass', eval_metric = "Accuracy",verbose = False)
grid_search = GridSearchCV(estimator=cat, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(x_train_sel, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'depth': 6, 'iterations': 200, 'learning_rate': 0.2}


In [None]:
# SMOKE and SCC are the two least important features, so retrain without both
# and check whether the accuracy on Kaggle improves.
# NOTE(review): this overwrites x_train_sel, x_test_sel and best_cat, so when the
# notebook runs top to bottom the submission cell uses this variant even though
# it scored lower than dropping SMOKE alone - confirm that is intended.
dropped_features = ['SMOKE', 'SCC']
x_train_sel = x_train.drop(columns=dropped_features)
x_test_sel = x_test.drop(columns=dropped_features)

# Recompute the categorical column positions against the reduced frame.
cat_feat_indices_sel = [
    x_train_sel.columns.get_loc(col)
    for col in cat_feat_x
    if col not in dropped_features
]

best_cat = CatBoostClassifier(
    cat_features=cat_feat_indices_sel,
    loss_function='MultiClass',
    eval_metric="Accuracy",
    depth=8,
    iterations=150,
    learning_rate=0.2,
    verbose=False,
)
best_cat.fit(x_train_sel, y_train)

# Result: accuracy score drops from 0.90426 to 0.90137

<catboost.core.CatBoostClassifier at 0x7918a16c77c0>

In [None]:
# Predict a label for every test row with the most recently fitted model.
# The prediction array is flattened to 1-D before it becomes a DataFrame column
# (presumably predict returns one column of labels per row - TODO confirm).
raw_predictions = best_cat.predict(x_test_sel)
y_pred = raw_predictions.flatten()

# Pair each prediction with its saved test id and write the submission file to
# the mounted Google Drive.
submission_path = '/content/drive/MyDrive/sample_submission.csv'
sample_submission = pd.DataFrame({'id': test_ids, 'NObeyesdad': y_pred})
sample_submission.to_csv(submission_path, index=False)