In [196]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, mutual_info_classif
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
import random
import xgboost as xgb

In [195]:
training_data = pd.read_csv('train.csv')

In [166]:
pd.set_option('display.max_columns',20)
training_data.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [167]:
training_data.shape

(20758, 18)

In [168]:
features = training_data.drop('NObeyesdad',axis=1)
features.shape

(20758, 17)

In [169]:
labels = training_data['NObeyesdad']
labels.shape

(20758,)

In [170]:
X_train,X_test,y_train,y_test = train_test_split(features,labels,random_state=50,test_size=0.3,shuffle=True)
X_train.shape

(14530, 17)

In [171]:
numerical_features = X_train.select_dtypes(include='number')
numerical_features.shape
y_train.shape

(14530,)

In [172]:
feature_selector = SelectFromModel(RandomForestClassifier(n_estimators=100))
feature_selector.fit(numerical_features,y_train)

In [173]:
selected_features = numerical_features.columns[(feature_selector.get_support())]
print(selected_features)

Index(['Age', 'Height', 'Weight', 'FCVC'], dtype='object')


In [174]:
categorical_features = X_train.select_dtypes(include='object')
print(categorical_features.columns)
oe = OrdinalEncoder()
lb = LabelEncoder()
oe.fit(categorical_features)
X_train_encoded = oe.transform(categorical_features)
print(type(X_train_encoded))
lb.fit(y_train)
y_train_encoded = lb.transform(y_train)
fs = SelectKBest(score_func=mutual_info_classif, k='all')
fs.fit(X_train_encoded, y_train_encoded)
X_train_categorical = fs.transform(X_train_encoded)

Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS'],
      dtype='object')
<class 'numpy.ndarray'>


In [175]:
print(fs.scores_)

[0.25447953 0.17015072 0.0521735  0.14365215 0.01198276 0.01817847
 0.10963087 0.06352682]


In [205]:
selected_features = training_data[['Age','Weight','Height','FCVC','Gender','family_history_with_overweight','CAEC','CALC','MTRANS','FAVC']]

In [206]:
label = training_data['NObeyesdad']

In [207]:
categorical_features = selected_features.select_dtypes(include='object')
print(categorical_features.columns)
oe = OrdinalEncoder()
lb = LabelEncoder()
oe.fit(categorical_features)
categorical_encoded = oe.transform(categorical_features)
print(type(categorical_encoded))
lb.fit(label)
label_encoded = lb.transform(label)
print(type(label_encoded))
categorical_encoded.shape

Index(['Gender', 'family_history_with_overweight', 'CAEC', 'CALC', 'MTRANS',
       'FAVC'],
      dtype='object')
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


(20758, 6)

In [208]:
numerical_features = selected_features.select_dtypes(include='number')
print(numerical_features.columns)
numerical_features.shape
numerical_final = numerical_features.to_numpy()
numerical_final.shape

Index(['Age', 'Weight', 'Height', 'FCVC'], dtype='object')


(20758, 4)

In [209]:
final_features = np.concatenate((categorical_encoded,numerical_final), axis=1)

In [210]:
X_train,X_test,y_train,y_test = train_test_split(final_features,label_encoded,random_state=50,test_size=0.3,shuffle=True)

In [214]:
model = RandomForestClassifier(n_estimators=200)
model.fit(X_train,y_train)
pred_values = model.predict(X_test)
accuracy = accuracy_score(y_test, pred_values)

In [215]:
print(accuracy)

0.8834296724470135


In [130]:
confusion_matrix(y_test, pred_values)

array([[ 669,   63,    1,    0,    0,    3,    3],
       [  63,  744,    1,    0,    0,   83,    7],
       [   2,    2,  713,   50,    2,   20,   86],
       [   0,    0,   45,  955,    1,    0,    2],
       [   0,    0,    1,    0, 1207,    1,    0],
       [   5,   83,   22,    0,    0,  540,  102],
       [   0,    8,   82,    7,    0,   97,  558]], dtype=int64)

In [193]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,Male,26.899886,1.848294,120.644178,yes,yes,2.938616,3.0,Sometimes,no,2.825629,no,0.8554,0.0,Sometimes,Public_Transportation
1,20759,Female,21.0,1.6,66.0,yes,yes,2.0,1.0,Sometimes,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation
2,20760,Female,26.0,1.643355,111.600553,yes,yes,3.0,3.0,Sometimes,no,2.621877,no,0.0,0.250502,Sometimes,Public_Transportation
3,20761,Male,20.979254,1.553127,103.669116,yes,yes,2.0,2.977909,Sometimes,no,2.786417,no,0.094851,0.0,Sometimes,Public_Transportation
4,20762,Female,26.0,1.627396,104.835346,yes,yes,3.0,3.0,Sometimes,no,2.653531,no,0.0,0.741069,Sometimes,Public_Transportation


In [217]:
xgb_classifier = xgb.XGBClassifier(n_estimators=100, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=3, enable_categorical=True)
xgb_classifier.fit(X_train, y_train)

In [218]:
pred_values = xgb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, pred_values)

In [219]:
print(accuracy)

0.8853564547206165
