In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn import linear_model, preprocessing

from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import accuracy_score

In [2]:
# Load the WiDS 2021 training data and keep a reproducible 5000-row sample
# (fixed random_state) so the rest of the notebook runs quickly.
data_raw = pd.read_csv("TrainingWiDS2021.csv").sample(n=5000, random_state=42)

# Work on an explicit copy of the columns of interest: later cells assign into
# `data_selected`, and writing into a bare slice of `data_raw` is what triggers
# pandas' SettingWithCopyWarning / ambiguous view-vs-copy behaviour.
data_selected = data_raw[["age", "bmi", "gender", "height", "diabetes_mellitus"]].copy()
data_selected.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 71328 to 76569
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                4806 non-null   float64
 1   bmi                4820 non-null   float64
 2   gender             4999 non-null   object 
 3   height             4916 non-null   float64
 4   diabetes_mellitus  5000 non-null   int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 234.4+ KB


In [3]:
# Note that the sklearn decision tree does not work with object data.

# The column needs to be encoded and converted to categorical.

# Issue hit while developing: the values were being encoded to 0 and 1, but the
# column was still an object-type variable, which gives an error.

# Always check the error messages and go back to fix your code.
le= LabelEncoder()

# Also note: the author reports that running the encoding line twice gives an
# error (presumably the encoding step that uses `le` -- TODO confirm).

In [4]:
# Encode `gender` as integer codes so the tree-based models get numeric input.
#
# Rows with a missing gender are dropped *before* encoding: LabelEncoder has no
# notion of "missing", so on recent scikit-learn a NaN is turned into an extra
# class of its own, and the later `.dropna()` can then no longer remove the row.
gender_known = data_selected.dropna(subset=["gender"])

# Build a new frame with `assign` instead of writing through `.loc[:, "gender"]`:
# on pandas 2.x that form sets values in place and leaves the column as `object`
# dtype, which is the "encoded but still object" problem noted earlier.
# `fit_transform` returns a NumPy integer array, so the new column is integer-typed.
data_selected = gender_known.assign(
    gender=le.fit_transform(gender_known["gender"])
)

In [5]:
# Keep rows with 20 <= age <= 65, 25 <= bmi < 45 and 100 < height < 200,
# then drop any row that still contains a missing value.
age_in_range = data_selected["age"].between(20, 65)
bmi_in_range = (data_selected["bmi"] >= 25) & (data_selected["bmi"] < 45)
height_in_range = (data_selected["height"] > 100) & (data_selected["height"] < 200)

data_final = data_selected.loc[age_in_range & bmi_in_range & height_in_range].dropna()

In [6]:
# Sanity check: number of missing values left in each column after filtering.
data_final.isna().sum(axis=0)

age                  0
bmi                  0
gender               0
height               0
diabetes_mellitus    0
dtype: int64

In [7]:
# Separate the target from the predictors, then hold out 30% of the rows as a
# test set (fixed random_state so the split is reproducible).
target_col = "diabetes_mellitus"
X = data_final.drop(columns=target_col)
y = data_final[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=99,
)

In [8]:
# Base learner shared by every ensemble in this cell: a pruned decision tree
# (limited depth, minimum leaf / split sizes).
base_estimator = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=10,
    min_samples_split=5,
    random_state=99,
)

# Fit one bagging ensemble per size; only `n_estimators` differs between them.
# Sizes are processed in the same order as before: 2, 5, 10, 25, 50.
bagged_pruned = {
    n_trees: BaggingClassifier(
        estimator=base_estimator, n_estimators=n_trees, random_state=99
    ).fit(X_train, y_train)
    for n_trees in (2, 5, 10, 25, 50)
}

# Expose each fitted ensemble under the name the following cells use.
BG_clf_2est = bagged_pruned[2]
BG_clf_5est = bagged_pruned[5]
BG_clf_10est = bagged_pruned[10]
BG_clf_25est = bagged_pruned[25]
BG_clf_50est = bagged_pruned[50]

# `fit` returns the estimator itself, so this keeps the cell's displayed value
# the same as when the last statement was the 50-estimator `fit` call.
BG_clf_50est



In [11]:
# Test-set accuracy of each pruned-tree bagging ensemble, smallest to largest.
pruned_ensembles = [
    ("two", BG_clf_2est),
    ("five", BG_clf_5est),
    ("ten", BG_clf_10est),
    ("twentyfive", BG_clf_25est),
    ("fifty", BG_clf_50est),
]

for size_label, ensemble in pruned_ensembles:
    test_accuracy = accuracy_score(y_test, ensemble.predict(X_test))
    print(f"Accuracy score for {size_label} estimators: {test_accuracy}")

Accuracy score for two estimators: 0.7846153846153846
Accuracy score for five estimators: 0.7868131868131868
Accuracy score for ten estimators: 0.789010989010989
Accuracy score for twentyfive estimators: 0.789010989010989
Accuracy score for fifty estimators: 0.789010989010989


In [None]:
# not much improvement here

In [12]:
# Let's try unpruned trees (usually we don't prune when using ensemble methods).

# The base estimator is now a decision tree with no pruning arguments at all;
# only the random seed is set.
base_estimator = DecisionTreeClassifier(random_state=99)

# Fit one bagging ensemble per size; only `n_estimators` differs between them.
# Sizes are processed in the same order as before: 2, 5, 10, 25, 50.
bagged_unpruned = {
    n_trees: BaggingClassifier(
        estimator=base_estimator, n_estimators=n_trees, random_state=99
    ).fit(X_train, y_train)
    for n_trees in (2, 5, 10, 25, 50)
}

# Expose each fitted ensemble under the name the following cells use.
BG_clf_2est_unp = bagged_unpruned[2]
BG_clf_5est_unp = bagged_unpruned[5]
BG_clf_10est_unp = bagged_unpruned[10]
BG_clf_25est_unp = bagged_unpruned[25]
BG_clf_50est_unp = bagged_unpruned[50]

# `fit` returns the estimator itself, so this keeps the cell's displayed value
# the same as when the last statement was the 50-estimator `fit` call.
BG_clf_50est_unp


In [13]:
# Test-set accuracy of each unpruned-tree bagging ensemble, smallest to largest.
unpruned_ensembles = [
    ("two", BG_clf_2est_unp),
    ("five", BG_clf_5est_unp),
    ("ten", BG_clf_10est_unp),
    ("twentyfive", BG_clf_25est_unp),
    ("fifty", BG_clf_50est_unp),
]

for size_label, ensemble in unpruned_ensembles:
    test_accuracy = accuracy_score(y_test, ensemble.predict(X_test))
    print(f"Accuracy score for {size_label} estimators: {test_accuracy}")

Accuracy score for two estimators: 0.7472527472527473
Accuracy score for five estimators: 0.6813186813186813
Accuracy score for ten estimators: 0.7164835164835165
Accuracy score for twentyfive estimators: 0.7186813186813187
Accuracy score for fifty estimators: 0.7164835164835165


In [14]:
# Results are still pretty bad -- actually much worse than with the pruned trees.

# the predictors that we are using are not doing a good job.