In [1]:
!pip install -q kaggle xgboost

In [2]:
import zipfile
import os

# Uploaded archive and the directory its members are unpacked into.
zip_path = "/content/smokerstatusdataset.zip"
extract_path = "/content/smoker_data"

# Unpack every member; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# Show what ended up in the extraction directory.
print("Extracted files:", os.listdir(extract_path))


Extracted files: ['train_dataset.csv', 'test_dataset.csv']


In [3]:
import pandas as pd
import os

# Keep only the entries of the extraction directory whose name ends in ".csv".
csv_files = list(
    filter(lambda name: name.endswith(".csv"), os.listdir(extract_path))
)
csv_files


['train_dataset.csv', 'test_dataset.csv']

In [4]:
# Load the training split by its explicit file name. os.listdir() returns
# directory entries in arbitrary order, so csv_files[0] is not guaranteed to be
# the training file (the one that contains the 'smoking' target column).
df = pd.read_csv(os.path.join(extract_path, "train_dataset.csv"))
df.head()


Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,35,170,85,97.0,0.9,0.9,1,1,118,78,...,70,142,19.8,1,1.0,61,115,125,1,1
1,20,175,110,110.0,0.7,0.9,1,1,119,79,...,71,114,15.9,1,1.1,19,25,30,1,0
2,45,155,65,86.0,0.9,0.9,1,1,110,80,...,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,165,80,94.0,0.8,0.7,1,1,158,88,...,46,91,16.9,1,0.9,32,36,36,0,0
4,20,165,60,81.0,1.5,0.1,1,1,109,64,...,47,92,14.9,1,1.2,26,28,15,0,0


In [5]:
# Separate the target column from the feature columns.
y = df['smoking']
X = df.drop(columns=['smoking'])

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on y and fixing the
# seed are passed through to train_test_split as given below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both the training rows and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
import zipfile
import os

# Archive holding the external test set (adjust if the uploaded file is named
# differently) and the directory it is unpacked into.
zip_path = "/content/test_dataset.csv.zip"
extract_path = "/content/test_data"

# Unpack every member; the archive is closed when the with-block exits.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# Cell output: the names now present in the extraction directory.
os.listdir(extract_path)


['test_dataset.csv']

In [9]:
import pandas as pd

# Read the external test set from the CSV that the previous cell extracted into
# extract_path, instead of re-reading the zip through a path relative to the
# current working directory.
test_df = pd.read_csv(os.path.join(extract_path, "test_dataset.csv"))

# Cell output: the column names, to compare against the training features.
test_df.columns


Index(['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries'],
      dtype='object')

In [10]:
# Work on a copy so test_df itself is left untouched.
X_test_external = test_df.copy(deep=True)

# Reuse the scaler that was fitted on X_train; it is only applied here,
# never re-fitted on the external rows.
X_test_external_scaled = scaler.transform(X=X_test_external)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Baseline logistic regression with balanced class weights, trained on the
# scaled training rows.
log_reg = LogisticRegression(max_iter=500, class_weight='balanced')
log_reg.fit(X_train_scaled, y_train)

# Predictions on the scaled internal hold-out rows.
y_pred_lr = log_reg.predict(X_test_scaled)

# Accuracy, precision, recall, F1 and confusion matrix, in that order.
lr_acc, lr_prec, lr_rec, lr_f1, lr_cm = (
    scorer(y_test, y_pred_lr)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
)

(lr_acc, lr_prec, lr_rec, lr_f1, lr_cm)


(0.7127100166730794,
 0.581081081081081,
 0.7807262569832403,
 0.66626936829559,
 array([[3321, 1612],
        [ 628, 2236]]))

In [12]:
# Baseline logistic-regression predictions for the scaled external test rows.
pred_lr = log_reg.predict(X=X_test_external_scaled)


In [15]:
from sklearn.linear_model import LogisticRegression

# "Tuned" logistic regression: L1 penalty with the liblinear solver and a small
# C. The hyperparameter values are hard-coded here; presumably they come from a
# search that is not part of this notebook (TODO confirm).
best_lr = LogisticRegression(
    C=0.01,
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000
)

# Fit on the scaled training rows. Without this call the estimator is only
# constructed, and any later best_lr.predict(...) fails on a fresh kernel
# because the model was never trained. The trailing ';' suppresses the
# estimator repr as cell output.
best_lr.fit(X_train_scaled, y_train);


In [14]:
from sklearn.svm import SVC

# Baseline linear-kernel SVM with balanced class weights, trained on the
# scaled training rows.
svm_linear = SVC(class_weight='balanced', kernel='linear')
svm_linear.fit(X_train_scaled, y_train)

# Predictions on the scaled internal hold-out rows.
y_pred_svm_linear = svm_linear.predict(X_test_scaled)

# Accuracy, precision, recall, F1 and confusion matrix, in that order.
svm_lin_acc, svm_lin_prec, svm_lin_rec, svm_lin_f1, svm_lin_cm = (
    scorer(y_test, y_pred_svm_linear)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
)

(svm_lin_acc, svm_lin_prec, svm_lin_rec, svm_lin_f1, svm_lin_cm)


(0.7036039502372707,
 0.5631134444190824,
 0.8613826815642458,
 0.6810213940648723,
 array([[3019, 1914],
        [ 397, 2467]]))

In [16]:
# Baseline linear-SVM predictions for the scaled external test rows.
pred_svm_linear = svm_linear.predict(X=X_test_external_scaled)


In [17]:
from sklearn.svm import SVC

# "Tuned" linear-kernel SVM with C=0.1 and balanced class weights. The value of
# C is hard-coded; presumably it comes from a search that is not part of this
# notebook (TODO confirm).
best_svm_linear = SVC(
    kernel='linear',
    C=0.1,
    class_weight='balanced'
)

# Fit on the scaled training rows. Without this call the estimator is only
# constructed, and any later best_svm_linear.predict(...) fails on a fresh
# kernel because the model was never trained. The trailing ';' suppresses the
# estimator repr as cell output.
best_svm_linear.fit(X_train_scaled, y_train);


In [18]:
# Baseline RBF-kernel SVM with balanced class weights, trained on the scaled
# training rows.
svm_rbf = SVC(class_weight='balanced', kernel='rbf')
svm_rbf.fit(X_train_scaled, y_train)

# Predictions on the scaled internal hold-out rows.
y_pred_svm_rbf = svm_rbf.predict(X_test_scaled)

# Accuracy, precision, recall, F1 and confusion matrix, in that order.
svm_rbf_acc, svm_rbf_prec, svm_rbf_rec, svm_rbf_f1, svm_rbf_cm = (
    scorer(y_test, y_pred_svm_rbf)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
)

(svm_rbf_acc, svm_rbf_prec, svm_rbf_rec, svm_rbf_f1, svm_rbf_cm)


(0.7143773246120303,
 0.5721404303510759,
 0.8819832402234636,
 0.69405138068416,
 array([[3044, 1889],
        [ 338, 2526]]))

In [19]:
# Baseline RBF-SVM predictions for the scaled external test rows.
pred_svm_rbf = svm_rbf.predict(X=X_test_external_scaled)


In [20]:
# "Tuned" RBF-kernel SVM with C=1, gamma=0.1 and balanced class weights. The
# values are hard-coded; presumably they come from a search that is not part of
# this notebook (TODO confirm).
best_svm_rbf = SVC(
    kernel='rbf',
    C=1,
    gamma=0.1,
    class_weight='balanced'
)

# Fit on the scaled training rows. Without this call the estimator is only
# constructed, and any later best_svm_rbf.predict(...) fails on a fresh kernel
# because the model was never trained. The trailing ';' suppresses the
# estimator repr as cell output.
best_svm_rbf.fit(X_train_scaled, y_train);


In [21]:
from sklearn.neural_network import MLPClassifier

# Baseline multi-layer perceptron: two hidden layers (64 and 32 units), ReLU
# activation, at most 300 iterations, fixed seed.
mlp = MLPClassifier(
    random_state=42,
    max_iter=300,
    activation='relu',
    hidden_layer_sizes=(64, 32),
)
mlp.fit(X_train_scaled, y_train)

# Predictions on the scaled internal hold-out rows.
y_pred_mlp = mlp.predict(X_test_scaled)

# Accuracy, precision, recall, F1 and confusion matrix, in that order.
mlp_acc, mlp_prec, mlp_rec, mlp_f1, mlp_cm = (
    scorer(y_test, y_pred_mlp)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
)

(mlp_acc, mlp_prec, mlp_rec, mlp_f1, mlp_cm)




(0.738873925868924,
 0.6371769383697813,
 0.6714385474860335,
 0.6538592315538932,
 array([[3838, 1095],
        [ 941, 1923]]))

In [22]:
# Baseline MLP predictions for the scaled external test rows.
pred_mlp = mlp.predict(X=X_test_external_scaled)


In [23]:
from sklearn.neural_network import MLPClassifier

# "Tuned" multi-layer perceptron: a single hidden layer of 64 units, L2 term
# alpha=0.01, initial learning rate 0.01, at most 500 iterations, fixed seed.
# The values are hard-coded; presumably they come from a search that is not
# part of this notebook (TODO confirm).
best_mlp = MLPClassifier(
    hidden_layer_sizes=(64,),
    alpha=0.01,
    learning_rate_init=0.01,
    max_iter=500,
    random_state=42
)

# Fit on the scaled training rows. Without this call the estimator is only
# constructed, and any later best_mlp.predict(...) fails on a fresh kernel
# because the model was never trained. The trailing ';' suppresses the
# estimator repr as cell output.
best_mlp.fit(X_train_scaled, y_train);


In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import pandas as pd

# -----------------------------
# Internal hold-out predictions (Normal vs Tuned)
# -----------------------------
# The baseline ("Normal") predictions on X_test_scaled were already computed
# when each baseline model was evaluated, so all four are reused here instead
# of calling predict() a second time on the SVM and MLP models. Only the tuned
# models need a predict() call.

# Logistic Regression
y_pred_lr_norm = y_pred_lr
y_pred_lr_tuned = best_lr.predict(X_test_scaled)

# SVM LINEAR
y_pred_svm_linear_norm = y_pred_svm_linear
y_pred_svm_linear_tuned = best_svm_linear.predict(X_test_scaled)

# SVM RBF
y_pred_svm_rbf_norm = y_pred_svm_rbf
y_pred_svm_rbf_tuned = best_svm_rbf.predict(X_test_scaled)

# MLP
y_pred_mlp_norm = y_pred_mlp
y_pred_mlp_tuned = best_mlp.predict(X_test_scaled)


# ---------------------------------------------------
# FUNCTION TO GET ALL METRICS
# ---------------------------------------------------
def metrics(y_true, y_pred):
    """Score one set of predictions against the true labels.

    Parameters:
        y_true: the true labels.
        y_pred: the predicted labels to score.

    Returns:
        A list ``[accuracy, precision, recall, f1]`` in that order.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return [scorer(y_true, y_pred) for scorer in scorers]

# ---------------------------------------------------
# BUILD INTERNAL COMPARISON TABLE
# ---------------------------------------------------
# One row per model: its label followed by the four scores from metrics(),
# all computed against the internal hold-out labels y_test.
internal_df = pd.DataFrame(
    [
        [label] + metrics(y_test, predictions)
        for label, predictions in (
            ("LR Normal", y_pred_lr_norm),
            ("LR Tuned", y_pred_lr_tuned),
            ("SVM Linear Normal", y_pred_svm_linear_norm),
            ("SVM Linear Tuned", y_pred_svm_linear_tuned),
            ("SVM RBF Normal", y_pred_svm_rbf_norm),
            ("SVM RBF Tuned", y_pred_svm_rbf_tuned),
            ("MLP Normal", y_pred_mlp_norm),
            ("MLP Tuned", y_pred_mlp_tuned),
        )
    ],
    columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score"],
)

print("\n================ INTERNAL TEST SET RESULTS ================\n")
display(internal_df)






Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,LR Normal,0.71271,0.581081,0.780726,0.666269
1,LR Tuned,0.709247,0.576362,0.786662,0.665289
2,SVM Linear Normal,0.703604,0.563113,0.861383,0.681021
3,SVM Linear Tuned,0.703219,0.562699,0.861732,0.680828
4,SVM RBF Normal,0.714377,0.57214,0.881983,0.694051
5,SVM RBF Tuned,0.725022,0.586207,0.854749,0.695455
6,MLP Normal,0.738874,0.637177,0.671439,0.653859
7,MLP Tuned,0.745543,0.640485,0.700419,0.669113


In [35]:
# -----------------------------
# External test predictions (Normal vs Tuned)
# -----------------------------
# The baseline ("Normal") predictions on X_test_external_scaled were already
# computed right after each baseline model was trained, so all four are reused
# here instead of calling predict() a second time on the SVM and MLP models.
# Only the tuned models need a predict() call.

# Logistic Regression
pred_lr_norm = pred_lr
pred_lr_tuned = best_lr.predict(X_test_external_scaled)

# SVM LINEAR
pred_svm_linear_norm = pred_svm_linear
pred_svm_linear_tuned = best_svm_linear.predict(X_test_external_scaled)

# SVM RBF
pred_svm_rbf_norm = pred_svm_rbf
pred_svm_rbf_tuned = best_svm_rbf.predict(X_test_external_scaled)

# MLP
pred_mlp_norm = pred_mlp
pred_mlp_tuned = best_mlp.predict(X_test_external_scaled)


# One row per model: its label followed by how many external rows it predicted
# as class 0 and as class 1. Each class is counted explicitly so that every row
# always has exactly two counts in the right columns. np.unique(...,
# return_counts=True) returns a single count when a model predicts only one
# class, which would leave the row one value short and could put a class-1
# count under "Pred 0 Count".
external_df = pd.DataFrame(
    [
        [
            label,
            int(np.sum(np.asarray(predictions) == 0)),
            int(np.sum(np.asarray(predictions) == 1)),
        ]
        for label, predictions in (
            ("LR Normal", pred_lr_norm),
            ("LR Tuned", pred_lr_tuned),
            ("SVM Linear Normal", pred_svm_linear_norm),
            ("SVM Linear Tuned", pred_svm_linear_tuned),
            ("SVM RBF Normal", pred_svm_rbf_norm),
            ("SVM RBF Tuned", pred_svm_rbf_tuned),
            ("MLP Normal", pred_mlp_norm),
            ("MLP Tuned", pred_mlp_tuned),
        )
    ],
    columns=["Model", "Pred 0 Count", "Pred 1 Count"],
)

print("\n================ EXTERNAL TEST PREDICTION COUNTS ================\n")
display(external_df)






Unnamed: 0,Model,Pred 0 Count,Pred 1 Count
0,LR Normal,8613,8095
1,LR Tuned,8496,8212
2,SVM Linear Normal,7523,9185
3,SVM Linear Tuned,7522,9186
4,SVM RBF Normal,7406,9302
5,SVM RBF Tuned,7898,8810
6,MLP Normal,10387,6321
7,MLP Tuned,10117,6591


In [36]:
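# NOTE: every line of this cell is commented out, so running it has no effect.
# It holds an all-in-one variant of the normal-vs-tuned comparison
# (build the tuned models, fit them, evaluate internally, count external predictions).
# import numpy as np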
# import pandas as pd
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# # ============================================================
# # 1. REBUILD TUNED MODELS WITH BEST PARAMETERS
# # ============================================================

# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.neural_network import MLPClassifier

# best_lr = LogisticRegression(
#     C=0.01,
#     penalty='l1',
#     solver='liblinear',
#     class_weight='balanced',
#     max_iter=1000
# )

# best_svm_linear = SVC(
#     kernel='linear',
#     C=0.1,
#     class_weight='balanced'
# )

# best_svm_rbf = SVC(
#     kernel='rbf',
#     C=1,
#     gamma=0.1,
#     class_weight='balanced'
# )

# best_mlp = MLPClassifier(
#     hidden_layer_sizes=(64,),
#     alpha=0.01,
#     learning_rate_init=0.01,
#     max_iter=500,
#     random_state=42
# )

# # ============================================================
# # 2. FIT TUNED MODELS
# # ============================================================

# best_lr.fit(X_train_scaled, y_train)
# best_svm_linear.fit(X_train_scaled, y_train)
# best_svm_rbf.fit(X_train_scaled, y_train)
# best_mlp.fit(X_train_scaled, y_train)

# # ============================================================
# # 3. PREDICT ON INTERNAL TEST DATA
# # ============================================================

# y_pred_lr_norm = log_reg.predict(X_test_scaled)
# y_pred_lr_tuned = best_lr.predict(X_test_scaled)

# y_pred_svm_linear_norm = svm_linear.predict(X_test_scaled)
# y_pred_svm_linear_tuned = best_svm_linear.predict(X_test_scaled)

# y_pred_svm_rbf_norm = svm_rbf.predict(X_test_scaled)
# y_pred_svm_rbf_tuned = best_svm_rbf.predict(X_test_scaled)

# y_pred_mlp_norm = mlp.predict(X_test_scaled)
# y_pred_mlp_tuned = best_mlp.predict(X_test_scaled)

# # ============================================================
# # 4. INTERNAL METRIC FUNCTION
# # ============================================================

# def get_metrics(y_true, y_pred):
#     return [
#         accuracy_score(y_true, y_pred),
#         precision_score(y_true, y_pred),
#         recall_score(y_true, y_pred),
#         f1_score(y_true, y_pred),
#         confusion_matrix(y_true, y_pred)
#     ]

# # ============================================================
# # 5. INTERNAL COMPARISON TABLE
# # ============================================================

# internal_results = pd.DataFrame({
#     "Model": [
#         "LR Normal", "LR Tuned",
#         "SVM Linear Normal", "SVM Linear Tuned",
#         "SVM RBF Normal", "SVM RBF Tuned",
#         "MLP Normal", "MLP Tuned"
#     ],
#     "Accuracy": [
#         get_metrics(y_test, y_pred_lr_norm)[0],
#         get_metrics(y_test, y_pred_lr_tuned)[0],
#         get_metrics(y_test, y_pred_svm_linear_norm)[0],
#         get_metrics(y_test, y_pred_svm_linear_tuned)[0],
#         get_metrics(y_test, y_pred_svm_rbf_norm)[0],
#         get_metrics(y_test, y_pred_svm_rbf_tuned)[0],
#         get_metrics(y_test, y_pred_mlp_norm)[0],
#         get_metrics(y_test, y_pred_mlp_tuned)[0],
#     ],
#     "Precision": [
#         get_metrics(y_test, y_pred_lr_norm)[1],
#         get_metrics(y_test, y_pred_lr_tuned)[1],
#         get_metrics(y_test, y_pred_svm_linear_norm)[1],
#         get_metrics(y_test, y_pred_svm_linear_tuned)[1],
#         get_metrics(y_test, y_pred_svm_rbf_norm)[1],
#         get_metrics(y_test, y_pred_svm_rbf_tuned)[1],
#         get_metrics(y_test, y_pred_mlp_norm)[1],
#         get_metrics(y_test, y_pred_mlp_tuned)[1],
#     ],
#     "Recall": [
#         get_metrics(y_test, y_pred_lr_norm)[2],
#         get_metrics(y_test, y_pred_lr_tuned)[2],
#         get_metrics(y_test, y_pred_svm_linear_norm)[2],
#         get_metrics(y_test, y_pred_svm_linear_tuned)[2],
#         get_metrics(y_test, y_pred_svm_rbf_norm)[2],
#         get_metrics(y_test, y_pred_svm_rbf_tuned)[2],
#         get_metrics(y_test, y_pred_mlp_norm)[2],
#         get_metrics(y_test, y_pred_mlp_tuned)[2],
#     ],
#     "F1 Score": [
#         get_metrics(y_test, y_pred_lr_norm)[3],
#         get_metrics(y_test, y_pred_lr_tuned)[3],
#         get_metrics(y_test, y_pred_svm_linear_norm)[3],
#         get_metrics(y_test, y_pred_svm_linear_tuned)[3],
#         get_metrics(y_test, y_pred_svm_rbf_norm)[3],
#         get_metrics(y_test, y_pred_svm_rbf_tuned)[3],
#         get_metrics(y_test, y_pred_mlp_norm)[3],
#         get_metrics(y_test, y_pred_mlp_tuned)[3],
#     ]
# })

# internal_results

# # ============================================================
# # 6. EXTERNAL TEST PREDICTIONS (COUNTS)
# # ============================================================

# pred_lr_norm = log_reg.predict(X_test_external_scaled)
# pred_lr_tuned = best_lr.predict(X_test_external_scaled)

# pred_svm_linear_norm = svm_linear.predict(X_test_external_scaled)
# pred_svm_linear_tuned = best_svm_linear.predict(X_test_external_scaled)

# pred_svm_rbf_norm = svm_rbf.predict(X_test_external_scaled)
# pred_svm_rbf_tuned = best_svm_rbf.predict(X_test_external_scaled)

# pred_mlp_norm = mlp.predict(X_test_external_scaled)
# pred_mlp_tuned = best_mlp.predict(X_test_external_scaled)

# external_results = pd.DataFrame({
#     "Model": [
#         "LR Normal", "LR Tuned",
#         "SVM Linear Normal", "SVM Linear Tuned",
#         "SVM RBF Normal", "SVM RBF Tuned",
#         "MLP Normal", "MLP Tuned"
#     ],
#     "Pred 0 Count": [
#         np.unique(pred_lr_norm, return_counts=True)[1][0],
#         np.unique(pred_lr_tuned, return_counts=True)[1][0],
#         np.unique(pred_svm_linear_norm, return_counts=True)[1][0],
#         np.unique(pred_svm_linear_tuned, return_counts=True)[1][0],
#         np.unique(pred_svm_rbf_norm, return_counts=True)[1][0],
#         np.unique(pred_svm_rbf_tuned, return_counts=True)[1][0],
#         np.unique(pred_mlp_norm, return_counts=True)[1][0],
#         np.unique(pred_mlp_tuned, return_counts=True)[1][0],
#     ],
#     "Pred 1 Count": [
#         np.unique(pred_lr_norm, return_counts=True)[1][1],
#         np.unique(pred_lr_tuned, return_counts=True)[1][1],
#         np.unique(pred_svm_linear_norm, return_counts=True)[1][1],
#         np.unique(pred_svm_linear_tuned, return_counts=True)[1][1],
#         np.unique(pred_svm_rbf_norm, return_counts=True)[1][1],
#         np.unique(pred_svm_rbf_tuned, return_counts=True)[1][1],
#         np.unique(pred_mlp_norm, return_counts=True)[1][1],
#         np.unique(pred_mlp_tuned, return_counts=True)[1][1],
#     ]
# })

# external_results
