# Smoking Dataset Model Training Notebook
Models allowed: Logistic Regression, SVM, Neural Network (MLP).

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

df = pd.read_csv('train_dataset.csv')
df.head()


Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,35,170,85,97.0,0.9,0.9,1,1,118,78,...,70,142,19.8,1,1.0,61,115,125,1,1
1,20,175,110,110.0,0.7,0.9,1,1,119,79,...,71,114,15.9,1,1.1,19,25,30,1,0
2,45,155,65,86.0,0.9,0.9,1,1,110,80,...,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,165,80,94.0,0.8,0.7,1,1,158,88,...,46,91,16.9,1,0.9,32,36,36,0,0
4,20,165,60,81.0,1.5,0.1,1,1,109,64,...,47,92,14.9,1,1.2,26,28,15,0,0


## Train-Test Split

In [2]:

X = df.drop('smoking', axis=1)
y = df['smoking']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape


((31187, 22), (7797, 22))

## Logistic Regression

In [3]:

logreg = LogisticRegression(max_iter=2000)
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.718737976144671
              precision    recall  f1-score   support

           0       0.76      0.81      0.78      4933
           1       0.63      0.56      0.59      2864

    accuracy                           0.72      7797
   macro avg       0.70      0.69      0.69      7797
weighted avg       0.71      0.72      0.71      7797



## Support Vector Machine (SVM)

In [4]:

svm = SVC(kernel='rbf', C=1.0, probability=True)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.7531101705784276
              precision    recall  f1-score   support

           0       0.80      0.81      0.81      4933
           1       0.67      0.66      0.66      2864

    accuracy                           0.75      7797
   macro avg       0.73      0.73      0.73      7797
weighted avg       0.75      0.75      0.75      7797



## Neural Network (MLPClassifier)

In [5]:

mlp = MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)

print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp))


MLP Accuracy: 0.7395151981531358
              precision    recall  f1-score   support

           0       0.80      0.78      0.79      4933
           1       0.64      0.67      0.66      2864

    accuracy                           0.74      7797
   macro avg       0.72      0.73      0.72      7797
weighted avg       0.74      0.74      0.74      7797

