In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data/data.csv')
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [3]:
df.head()

Unnamed: 0,Age,Gender,Weight (kg),Daily Water Intake (liters),Physical Activity Level,Weather,Hydration Level
0,56,Male,96,4.23,Moderate,Hot,Good
1,60,Male,105,3.95,High,Normal,Good
2,36,Male,68,2.39,Moderate,Cold,Good
3,19,Female,74,3.13,Moderate,Hot,Good
4,38,Male,77,2.11,Low,Normal,Poor


In [4]:
df["Gender"].dtypes

dtype('O')

In [5]:
numerical_cols_x = [col for col in x.columns if x[col].dtype != "O"]
categorical_cols_x = [col for col in x.columns if x[col].dtype == "O"]
categorical_col_y = y.name

In [6]:
for col in categorical_cols_x:
    print(f"{col}: {df[col].unique()}")

Gender: ['Male' 'Female']
Physical Activity Level: ['Moderate' 'High' 'Low']
Weather: ['Hot' 'Normal' 'Cold']


In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

numerical_pipeline_x = Pipeline([
    ("imputer",SimpleImputer(strategy="mean")),
    ('scaling',StandardScaler())
    ])

categorical_pipeline_x = Pipeline([
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("encode",OneHotEncoder())
])


preprocessor = ColumnTransformer(
    transformers=(
        ("numerical_features",numerical_pipeline_x,numerical_cols_x),
        ("categorical_features",categorical_pipeline_x,categorical_cols_x),
    )
)

In [8]:
x_preprocessed = preprocessor.fit_transform(x)

In [9]:
lb_y = LabelEncoder()
y_preprocessed = lb_y.fit_transform(y)

In [10]:
y_preprocessed

array([0, 0, 0, ..., 1, 0, 0], shape=(30000,))

In [11]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x_preprocessed,y_preprocessed,test_size=0.2,random_state=42,shuffle=True)

In [12]:
print(f"Train data shape: {x_train.shape}")
print(f"Test data shape: {x_test.shape}")

Train data shape: (24000, 11)
Test data shape: (6000, 11)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

models = {
    "LogisticRegression" : LogisticRegression(),
    "KNeighborsClassifier" : KNeighborsClassifier(),
    "LinearSVC" : LinearSVC(),
    "GaussianNB" : GaussianNB(),
    "DecisionTreeClassifier" : DecisionTreeClassifier(),
    "RandomForestClassifier" : RandomForestClassifier(),
    "GradientBoostingClassifier" : GradientBoostingClassifier(),
    "XGBClassifier" : XGBClassifier(),
    "LGBMClassifier" : LGBMClassifier()
}

In [14]:
x_test.shape

(6000, 11)

In [15]:
from sklearn.metrics import confusion_matrix,accuracy_score

best_model = ""
best_accuracy = 0

for model_name in models:
    model = models[model_name]
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)

    print("-"*100)
    print(f"--- {model_name} REPORT ---")
    print(f"Confusion matrix: {confusion_matrix(y_test,y_pred)}")
    model_accuracy_score = accuracy_score(y_test,y_pred)
    print(f"Accuracy score: {model_accuracy_score}")
    print("-"*100)

    if(model_accuracy_score > best_accuracy):
        best_model = model_name
        best_accuracy = model_accuracy_score

----------------------------------------------------------------------------------------------------
--- LogisticRegression REPORT ---
Confusion matrix: [[4787    1]
 [  15 1197]]
Accuracy score: 0.9973333333333333
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
--- KNeighborsClassifier REPORT ---
Confusion matrix: [[4714   74]
 [ 102 1110]]
Accuracy score: 0.9706666666666667
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
--- LinearSVC REPORT ---
Confusion matrix: [[4788    0]
 [   9 1203]]
Accuracy score: 0.9985
----------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------



In [16]:
print()
print(f"Best model for the data: {best_model}")
print(f"Accuracy score: {best_accuracy}")
print("-"*100)


Best model for the data: LinearSVC
Accuracy score: 0.9985
----------------------------------------------------------------------------------------------------
