In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import catboost
from catboost import CatBoostClassifier

In [7]:
# Show the installed pandas package metadata (version, install location, dependencies).
!pip show pandas

Name: pandas
Version: 1.5.3
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: The Pandas Development Team
Author-email: pandas-dev@python.org
License: BSD-3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, python-dateutil, pytz
Required-by: altair, arviz, bigframes, bokeh, bqplot, catboost, cmdstanpy, cufflinks, datascience, db-dtypes, dopamine-rl, fastai, geemap, geopandas, google-colab, gspread-dataframe, holoviews, ibis-framework, mizani, mlxtend, pandas-datareader, pandas-gbq, panel, pins, plotnine, prophet, pymc, seaborn, sklearn-pandas, statsmodels, vega-datasets, xarray, yfinance


In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime); the
# submission CSV is later written under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the train and test splits from CSV files in the current working directory.
train_csv, test_csv = "train-obesity.csv", "test-obesity.csv"
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [None]:
# Keep the row identifiers; `test_ids` is needed again when building the submission.
train_ids, test_ids = df_train['id'], df_test['id']

# Drop the identifier column from both frames so it is not used as a feature.
df_train = df_train.drop(columns='id')
df_test = df_test.drop(columns='id')

# Store Categorical Features

In [None]:
# Categorical Features
# Every non-numeric column is treated as categorical; the target is excluded
# because it is the label, not a predictor.
cat_feat = df_train.select_dtypes(exclude=['number']).columns.tolist()
cat_feat_x = [col for col in cat_feat if col != 'NObeyesdad']

# These positions are passed to CatBoost together with the feature matrix,
# which has no target column, so resolve them against the feature columns only.
# Looking them up in `df_train` directly is correct only while the target
# happens to sit to the right of every categorical feature.
feature_columns = df_train.columns.drop('NObeyesdad', errors='ignore')
cat_feat_indices = [feature_columns.get_loc(col) for col in cat_feat_x]

## X and Y Split

In [None]:
# Features are every column except the target; the target column is the label.
target_col = ['NObeyesdad']
x_train = df_train.drop(columns=target_col, errors='ignore')
y_train = df_train['NObeyesdad']

# errors='ignore' makes this a no-op when the test frame has no target column.
x_test = df_test.drop(columns=target_col, errors='ignore')

# Boosting


In [None]:
# Catboost
# Exhaustive search over 3 x 3 x 3 = 27 hyperparameter combinations; with
# cv=3 that is 81 cross-validation fits, so expect this cell to be slow.
param_grid = {
    'iterations': [50, 100, 150],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
}

# verbose=False, matching the models fitted later in the notebook: per-iteration
# training logs would otherwise bury the single result printed below.
cat = CatBoostClassifier(
    cat_features=cat_feat_indices,
    loss_function='MultiClass',
    eval_metric="Accuracy",
    verbose=False,
)

# NOTE(review): n_jobs=-1 runs candidate fits in parallel — confirm this does
# not oversubscribe cores alongside CatBoost's own threading.
grid_search = GridSearchCV(
    estimator=cat,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit the grid search to the data
grid_search.fit(x_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# Fit a single CatBoost model on the full training set with fixed hyperparameters.
# NOTE(review): these values are hard-coded (presumably copied from the
# grid-search result) — confirm they still match `best_params`.
tuned_params = {'depth': 8, 'iterations': 150, 'learning_rate': 0.2}
best_cat = CatBoostClassifier(
    cat_features=cat_feat_indices,
    loss_function='MultiClass',
    eval_metric="Accuracy",
    verbose=False,
    **tuned_params,
)
best_cat.fit(x_train, y_train)

In [None]:
# Rank the features by the fitted model's importance scores, highest first.
feature_importance = best_cat.get_feature_importance()
importance_df = pd.DataFrame(
    {'Feature': x_train.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

print(importance_df)

                           Feature  Importance
3                           Weight   41.922776
0                           Gender   11.019532
2                           Height   10.010081
6                             FCVC    6.266600
1                              Age    5.896892
14                            CALC    4.558131
12                             FAF    3.638892
13                             TUE    3.573506
10                            CH2O    3.048229
7                              NCP    2.694437
15                          MTRANS    2.362774
8                             CAEC    2.072633
4   family_history_with_overweight    1.542637
5                             FAVC    1.075527
11                             SCC    0.300208
9                            SMOKE    0.017145


In [None]:
# SMOKE has the lowest importance score, so retrain without it and check
# whether the accuracy on Kaggle improves.
low_importance = 'SMOKE'
x_train_sel = x_train.drop(columns=low_importance)
x_test_sel = x_test.drop(columns=low_importance)

# Positions of the remaining categorical columns within the reduced feature frame.
cat_feat_indices_sel = [
    x_train_sel.columns.get_loc(name)
    for name in cat_feat_x
    if name != low_importance
]

# Same model configuration as before; only the feature set changes.
best_cat = CatBoostClassifier(
    cat_features=cat_feat_indices_sel,
    loss_function='MultiClass',
    eval_metric="Accuracy",
    depth=8,
    iterations=150,
    learning_rate=0.2,
    verbose=False,
)
best_cat.fit(x_train_sel, y_train)

# Improves accuracy score from 0.90281 to 0.90426

In [None]:
# Predict on the reduced test features and write the submission file to Drive.
submission_path = '/content/drive/MyDrive/sample_submission.csv'
y_pred = best_cat.predict(x_test_sel).flatten()  # collapse the predictions to 1-D
sample_submission = pd.DataFrame({'id': test_ids}).assign(NObeyesdad=y_pred)
sample_submission.to_csv(submission_path, index=False)