# Smoking Dataset Model Training Notebook
Models allowed: Logistic Regression, SVM, Neural Network (MLP).

In [1]:
%pip install scikit-learn scipy pandas numpy -q


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from scipy.stats import loguniform, uniform
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('train_dataset.csv')
df.head()


Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,35,170,85,97.0,0.9,0.9,1,1,118,78,...,70,142,19.8,1,1.0,61,115,125,1,1
1,20,175,110,110.0,0.7,0.9,1,1,119,79,...,71,114,15.9,1,1.1,19,25,30,1,0
2,45,155,65,86.0,0.9,0.9,1,1,110,80,...,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,165,80,94.0,0.8,0.7,1,1,158,88,...,46,91,16.9,1,0.9,32,36,36,0,0
4,20,165,60,81.0,1.5,0.1,1,1,109,64,...,47,92,14.9,1,1.2,26,28,15,0,0


## Feature Engineering & Data Preparation

In [3]:
# ============================================================
# COMMON FEATURE ENGINEERING FUNCTIONS
# ============================================================

def add_common_features(data):
    """Base features shared by all models"""
    df = data.copy()
    
    # Body Composition
    df['BMI'] = df['weight(kg)'] / ((df['height(cm)'] / 100) ** 2)
    df['Waist_Height_ratio'] = df['waist(cm)'] / df['height(cm)']
    
    # Blood Pressure
    df['BP_ratio'] = df['systolic'] / (df['relaxation'] + 1)
    
    # Lipid Ratios
    df['Chol_HDL_ratio'] = df['Cholesterol'] / (df['HDL'] + 1)
    df['LDL_HDL_ratio'] = df['LDL'] / (df['HDL'] + 1)
    df['Trig_HDL_ratio'] = df['triglyceride'] / (df['HDL'] + 1)
    
    # Liver Function
    df['AST_ALT_ratio'] = df['AST'] / (df['ALT'] + 1)
    
    # Sensory
    df['eyesight_avg'] = (df['eyesight(left)'] + df['eyesight(right)']) / 2
    df['hearing_sum'] = df['hearing(left)'] + df['hearing(right)']
    
    return df

def add_features_lr(data):
    """Logistic Regression: Common features only (simpler model)"""
    return add_common_features(data)

def add_features_svm(data):
    """SVM: Common + additional features"""
    df = add_common_features(data)
    
    # Additional Body Features
    df['Waist_Weight_ratio'] = df['waist(cm)'] / df['weight(kg)']
    
    # Blood Pressure Extended
    df['pulse_pressure'] = df['systolic'] - df['relaxation']
    df['MAP'] = (df['systolic'] + 2 * df['relaxation']) / 3
    
    # Lipid Extended
    df['non_HDL_chol'] = df['Cholesterol'] - df['HDL']
    df['atherogenic_index'] = np.log10(df['triglyceride'] / (df['HDL'] + 1) + 1)
    
    # Liver Extended
    df['liver_enzyme_sum'] = df['AST'] + df['ALT'] + df['Gtp']
    df['GTP_ALT_ratio'] = df['Gtp'] / (df['ALT'] + 1)
    
    # Sensory Extended
    df['eyesight_diff'] = abs(df['eyesight(left)'] - df['eyesight(right)'])
    
    # Blood Features
    df['hemoglobin_BMI'] = df['hemoglobin'] / (df['BMI'] + 1)
    
    # Age Interactions
    df['age_hemoglobin'] = df['age'] * df['hemoglobin']
    df['age_BMI'] = df['age'] * df['BMI']
    df['age_systolic'] = df['age'] * df['systolic']
    
    # Composite Scores
    df['metabolic_risk'] = (df['BMI'] / 25) + (df['Trig_HDL_ratio'] / 3) + (df['fasting blood sugar'] / 100)
    df['cv_risk'] = (df['Chol_HDL_ratio'] / 4) + (df['systolic'] / 120) + (df['LDL'] / 100)
    
    return df

def add_features_mlp(data):
    """MLP: Most comprehensive features"""
    df = add_features_svm(data)  # Start with SVM features
    
    # Body Extended
    df['BSA'] = np.sqrt((df['height(cm)'] * df['weight(kg)']) / 3600)
    
    # Blood Pressure Extended
    df['hypertension_score'] = (df['systolic'] / 140) + (df['relaxation'] / 90)
    
    # Lipid Extended
    df['total_lipids'] = df['Cholesterol'] + df['triglyceride'] + df['LDL']
    
    # Liver Log-transformed
    df['log_GTP'] = np.log1p(df['Gtp'])
    df['log_ALT'] = np.log1p(df['ALT'])
    
    # Sensory Extended
    df['vision_score'] = (df['eyesight(left)'] + df['eyesight(right)']) * (1 + df['eyesight_diff'])
    
    # Blood Extended
    df['hemoglobin_norm'] = df['hemoglobin'] / 15
    
    # Age Extended
    df['age_cholesterol'] = df['age'] * df['Cholesterol']
    df['age_squared'] = df['age'] ** 2
    
    # Health Score
    df['health_score'] = (df['hemoglobin'] / 15) - (df['BMI'] / 30) - (df['Gtp'] / 50)
    
    # Dental
    df['has_dental_issues'] = (df['dental caries'] == 1).astype(int)
    
    return df

# ============================================================
# PREPARE DATA FOR ALL MODELS
# ============================================================

# Load test data once
test_df_raw = pd.read_csv('test_dataset.csv')

# Logistic Regression Data
df_lr = add_features_lr(df.copy())
X_lr = df_lr.drop('smoking', axis=1)
y_lr = df_lr['smoking']
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X_lr, y_lr, test_size=0.2, random_state=42, stratify=y_lr
)
test_df_lr = add_features_lr(test_df_raw.copy())

# SVM Data
df_svm = add_features_svm(df.copy())
X_svm = df_svm.drop('smoking', axis=1)
y_svm = df_svm['smoking']
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X_svm, y_svm, test_size=0.2, random_state=42, stratify=y_svm
)
test_df_svm = add_features_svm(test_df_raw.copy())

# MLP Data
df_mlp = add_features_mlp(df.copy())
X_mlp = df_mlp.drop('smoking', axis=1)
y_mlp = df_mlp['smoking']
X_train_mlp, X_test_mlp, y_train_mlp, y_test_mlp = train_test_split(
    X_mlp, y_mlp, test_size=0.2, random_state=42, stratify=y_mlp
)
test_df_mlp = add_features_mlp(test_df_raw.copy())

# Print Summary
print("="*70)
print("DATA PREPARATION SUMMARY")
print("="*70)
print(f"\n{'Model':<25} {'Train Shape':<20} {'Test Shape':<20} {'Features'}")
print("-"*70)
print(f"{'Logistic Regression':<25} {str(X_train_lr.shape):<20} {str(X_test_lr.shape):<20} {X_train_lr.shape[1]}")
print(f"{'SVM':<25} {str(X_train_svm.shape):<20} {str(X_test_svm.shape):<20} {X_train_svm.shape[1]}")
print(f"{'Neural Network (MLP)':<25} {str(X_train_mlp.shape):<20} {str(X_test_mlp.shape):<20} {X_train_mlp.shape[1]}")
print("-"*70)
print(f"\nClass Distribution: {y_lr.value_counts().to_dict()}")


DATA PREPARATION SUMMARY

Model                     Train Shape          Test Shape           Features
----------------------------------------------------------------------
Logistic Regression       (31187, 31)          (7797, 31)           31
SVM                       (31187, 45)          (7797, 45)           45
Neural Network (MLP)      (31187, 56)          (7797, 56)           56
----------------------------------------------------------------------

Class Distribution: {0: 24666, 1: 14318}


## Logistic Regression

In [4]:
# Pipeline with scaling and polynomial features
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('logreg', LogisticRegression(max_iter=3000, solver='saga', random_state=42))
])

# Hyperparameter distributions
param_dist_lr = {
    'logreg__C': loguniform(0.0001, 100),
    'logreg__penalty': ['l1', 'l2', 'elasticnet'],
    'logreg__l1_ratio': uniform(0, 1),
    'logreg__class_weight': [None, 'balanced'],
    'logreg__tol': loguniform(1e-6, 1e-2),
}

# RandomizedSearchCV
random_search_lr = RandomizedSearchCV(
    pipeline_lr, param_dist_lr, n_iter=10, cv=5, scoring='accuracy', 
    n_jobs=-1, verbose=1, random_state=42, return_train_score=True
)
random_search_lr.fit(X_train_lr, y_train_lr)

# Top 5 results
results_lr = pd.DataFrame(random_search_lr.cv_results_).sort_values('rank_test_score')
print("\n" + "="*60)
# print("=== LOGISTIC REGRESSION: Top 5 Parameter Combinations ===")
print("="*60)
for _, row in results_lr.head(5).iterrows():
    print(f"\nRank {int(row['rank_test_score'])}:")
    print(f"  C={row['param_logreg__C']:.6f}, penalty={row['param_logreg__penalty']}, "
          f"l1_ratio={row['param_logreg__l1_ratio']:.4f}")
    print(f"  CV Accuracy = {row['mean_test_score']:.4f} (+/- {row['std_test_score']:.4f})")

print(f"\nBest CV Accuracy: {random_search_lr.best_score_:.4f}")

# Evaluate and predict
best_logreg = random_search_lr.best_estimator_
y_pred_lr = best_logreg.predict(X_test_lr)
print(f"\nTest Accuracy: {accuracy_score(y_test_lr, y_pred_lr):.4f}")
print(classification_report(y_test_lr, y_pred_lr))

# Submission
submission_lr = pd.DataFrame({
    'id': range(len(test_df_lr)),
    'smoking': best_logreg.predict(test_df_lr)
})
submission_lr.to_csv('logistic_submission.csv', index=False)
print(f"Submission 'logistic_submission.csv' created: {len(submission_lr)} predictions")


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Rank 1:
  C=0.039054, penalty=elasticnet, l1_ratio=0.5248
  CV Accuracy = 0.7389 (+/- 0.0028)

Rank 2:
  C=0.047315, penalty=l1, l1_ratio=0.0581
  CV Accuracy = 0.7350 (+/- 0.0038)

Rank 3:
  C=0.017670, penalty=l1, l1_ratio=0.1834
  CV Accuracy = 0.7346 (+/- 0.0036)

Rank 4:
  C=0.001233, penalty=l2, l1_ratio=0.6175
  CV Accuracy = 0.7335 (+/- 0.0030)

Rank 5:
  C=1.771885, penalty=l2, l1_ratio=0.0564
  CV Accuracy = 0.7235 (+/- 0.0039)

Best CV Accuracy: 0.7389

Test Accuracy: 0.7352
              precision    recall  f1-score   support

           0       0.79      0.79      0.79      4933
           1       0.64      0.64      0.64      2864

    accuracy                           0.74      7797
   macro avg       0.71      0.71      0.71      7797
weighted avg       0.73      0.74      0.73      7797

Submission 'logistic_submission.csv' created: 16708 predictions


## Support Vector Machine (SVM)

In [6]:
# Pipeline
pipeline_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=42, cache_size=1000))
])

# Hyperparameter distributions
param_dist_svm = {
    'svm__C': loguniform(0.01, 100),
    'svm__kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
    'svm__gamma': ['scale', 'auto'] + list(loguniform(1e-4, 1).rvs(8, random_state=42)),
    'svm__degree': [2, 3, 4],
    'svm__coef0': uniform(0, 1),
    'svm__class_weight': [None, 'balanced'],
    'svm__shrinking': [True, False],
    'svm__tol': loguniform(1e-5, 1e-2),
}

# RandomizedSearchCV
random_search_svm = RandomizedSearchCV(
    pipeline_svm, param_dist_svm, n_iter=1, cv=5, scoring='accuracy',
    n_jobs=-1, verbose=2, random_state=42, return_train_score=True
)

print("Starting SVM training (1 models, 5-fold CV)...")
random_search_svm.fit(X_train_svm, y_train_svm)

# Top 5 results
results_svm = pd.DataFrame(random_search_svm.cv_results_).sort_values('rank_test_score')
print("\n" + "="*60)
# print("=== SVM: Top 5 Parameter Combinations ===")
print("="*60)
for _, row in results_svm.head(5).iterrows():
    print(f"\nRank {int(row['rank_test_score'])}:")
    print(f"  C={row['param_svm__C']:.4f}, kernel={row['param_svm__kernel']}, gamma={row['param_svm__gamma']}")
    print(f"  CV Accuracy = {row['mean_test_score']:.4f} (+/- {row['std_test_score']:.4f})")

print(f"\nBest CV Accuracy: {random_search_svm.best_score_:.4f}")

# Evaluate and predict
best_svm = random_search_svm.best_estimator_
y_pred_svm = best_svm.predict(X_test_svm)
print(f"\nTest Accuracy: {accuracy_score(y_test_svm, y_pred_svm):.4f}")
print(classification_report(y_test_svm, y_pred_svm))

# Submission
submission_svm = pd.DataFrame({
    'id': range(len(test_df_svm)),
    'smoking': best_svm.predict(test_df_svm)
})
submission_svm.to_csv('svm_submission.csv', index=False)
print(f"Submission 'svm_submission.csv' created: {len(submission_svm)} predictions")


Starting SVM training (1 models, 5-fold CV)...
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Rank 1:
  C=0.3149, kernel=sigmoid, gamma=0.08471801418819976
  CV Accuracy = 0.6193 (+/- 0.0035)

Best CV Accuracy: 0.6193

Test Accuracy: 0.6054
              precision    recall  f1-score   support

           0       0.69      0.69      0.69      4933
           1       0.46      0.46      0.46      2864

    accuracy                           0.61      7797
   macro avg       0.58      0.58      0.58      7797
weighted avg       0.61      0.61      0.61      7797

Submission 'svm_submission.csv' created: 16708 predictions


## Neural Network (MLPClassifier)

In [7]:
 # Pipeline
pipeline_mlp = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(random_state=42, early_stopping=True, validation_fraction=0.1))
])

# Hyperparameter distributions
param_dist_mlp = {
    'mlp__hidden_layer_sizes': [
        (64,), (128,), (256,),
        (64, 32), (128, 64), (256, 128),
        (128, 64, 32), (256, 128, 64),
        (64, 64), (128, 128), (100, 50, 25)
    ],
    'mlp__activation': ['relu', 'tanh', 'logistic'],
    'mlp__solver': ['adam', 'sgd'],
    'mlp__learning_rate': ['constant', 'adaptive', 'invscaling'],
    'mlp__learning_rate_init': loguniform(1e-4, 1e-1),
    'mlp__alpha': loguniform(1e-5, 1e-1),
    'mlp__batch_size': [32, 64, 128, 256],
    'mlp__max_iter': [300, 500, 700, 1000],
    'mlp__beta_1': uniform(0.85, 0.14),
    'mlp__beta_2': uniform(0.99, 0.009),
}

# RandomizedSearchCV
random_search_mlp = RandomizedSearchCV(
    pipeline_mlp, param_dist_mlp, n_iter=15, cv=5, scoring='accuracy',
    n_jobs=-1, verbose=2, random_state=42, return_train_score=True
)

print("Starting MLP training (15 models, 5-fold CV)...")
random_search_mlp.fit(X_train_mlp, y_train_mlp)

# Top 5 results
results_mlp = pd.DataFrame(random_search_mlp.cv_results_).sort_values('rank_test_score')
print("\n" + "="*60)
print("=== MLP: Top 5 Parameter Combinations ===")
print("="*60)
for _, row in results_mlp.head(5).iterrows():
    print(f"\nRank {int(row['rank_test_score'])}:")
    print(f"  Fitting 5 folds for each of 15 candidateslayers={row['param_mlp__hidden_layer_sizes']}, activation={row['param_mlp__activation']}")
    print(f"  solver={row['param_mlp__solver']}, lr_init={row['param_mlp__learning_rate_init']:.6f}")
    print(f"  CV Accuracy = {row['mean_test_score']:.4f} (+/- {row['std_test_score']:.4f})")

print(f"\nBest CV Accuracy: {random_search_mlp.best_score_:.4f}")

# Evaluate and predict
best_mlp = random_search_mlp.best_estimator_
y_pred_mlp = best_mlp.predict(X_test_mlp)
print(f"\nTest Accuracy: {accuracy_score(y_test_mlp, y_pred_mlp):.4f}")
print(classification_report(y_test_mlp, y_pred_mlp))

# Submission
submission_mlp = pd.DataFrame({
    'id': range(len(test_df_mlp)),
    'smoking': best_mlp.predict(test_df_mlp)
})
submission_mlp.to_csv('mlp_submission.csv', index=False)
print(f"Submission 'mlp_submission.csv' created: {len(submission_mlp)} predictions")


Starting MLP training (15 models, 5-fold CV)...
Fitting 5 folds for each of 15 candidates, totalling 75 fits

=== MLP: Top 5 Parameter Combinations ===

Rank 1:
  Fitting 5 folds for each of 15 candidateslayers=(256, 128), activation=logistic
  solver=adam, lr_init=0.004828
  CV Accuracy = 0.7553 (+/- 0.0009)

Rank 2:
  Fitting 5 folds for each of 15 candidateslayers=(64, 32), activation=tanh
  solver=adam, lr_init=0.003355
  CV Accuracy = 0.7536 (+/- 0.0026)

Rank 3:
  Fitting 5 folds for each of 15 candidateslayers=(128,), activation=relu
  solver=sgd, lr_init=0.000820
  CV Accuracy = 0.7528 (+/- 0.0052)

Rank 4:
  Fitting 5 folds for each of 15 candidateslayers=(100, 50, 25), activation=logistic
  solver=adam, lr_init=0.013200
  CV Accuracy = 0.7523 (+/- 0.0023)

Rank 5:
  Fitting 5 folds for each of 15 candidateslayers=(128, 64, 32), activation=relu
  solver=adam, lr_init=0.007411
  CV Accuracy = 0.7509 (+/- 0.0015)

Best CV Accuracy: 0.7553

Test Accuracy: 0.7530
              pre

## Model Comparison - Classification Scores Summary


In [8]:
# ============================================================================
#                    MODEL COMPARISON SUMMARY
# ============================================================================

# Collect all results
models_data = {
    'Logistic Regression': {
        'y_test': y_test_lr, 'y_pred': y_pred_lr, 'cv_score': random_search_lr.best_score_
    },
    'SVM': {
        'y_test': y_test_svm, 'y_pred': y_pred_svm, 'cv_score': random_search_svm.best_score_
    },
    'Neural Network (MLP)': {
        'y_test': y_test_mlp, 'y_pred': y_pred_mlp, 'cv_score': random_search_mlp.best_score_
    }
}

# Calculate metrics
model_results = {}
for name, data in models_data.items():
    model_results[name] = {
        'Test Accuracy': accuracy_score(data['y_test'], data['y_pred']),
        'Precision': precision_score(data['y_test'], data['y_pred'], average='weighted'),
        'Recall': recall_score(data['y_test'], data['y_pred'], average='weighted'),
        'F1-Score': f1_score(data['y_test'], data['y_pred'], average='weighted'),
        'CV Score': data['cv_score']
    }

# Print comparison table
print("="*80)
print("                    FINAL MODEL COMPARISON")
print("="*80)
print(f"\n{'MODEL':<25} {'ACCURACY':>12} {'PRECISION':>12} {'RECALL':>12} {'F1-SCORE':>12}")
print("-"*80)
for name, scores in model_results.items():
    print(f"{name:<25} {scores['Test Accuracy']:>12.4f} {scores['Precision']:>12.4f} "
          f"{scores['Recall']:>12.4f} {scores['F1-Score']:>12.4f}")
print("-"*80)

# Best model
best_model = max(model_results, key=lambda x: model_results[x]['Test Accuracy'])
print(f"\nüèÜ BEST MODEL: {best_model} (Accuracy: {model_results[best_model]['Test Accuracy']:.4f})")

# Summary DataFrame
summary_df = pd.DataFrame(model_results).T.round(4).sort_values('Test Accuracy', ascending=False)
print("\n" + "="*80)
print("SUMMARY TABLE")
print("="*80)
print(summary_df)

print("\n" + "="*80)
print("SUBMISSION FILES: logistic_submission.csv, svm_submission.csv, mlp_submission.csv")
print("="*80)


                    FINAL MODEL COMPARISON

MODEL                         ACCURACY    PRECISION       RECALL     F1-SCORE
--------------------------------------------------------------------------------
Logistic Regression             0.7352       0.7348       0.7352       0.7349
SVM                             0.6054       0.6054       0.6054       0.6054
Neural Network (MLP)            0.7530       0.7510       0.7530       0.7518
--------------------------------------------------------------------------------

üèÜ BEST MODEL: Neural Network (MLP) (Accuracy: 0.7530)

SUMMARY TABLE
                      Test Accuracy  Precision  Recall  F1-Score  CV Score
Neural Network (MLP)         0.7530     0.7510  0.7530    0.7518    0.7553
Logistic Regression          0.7352     0.7348  0.7352    0.7349    0.7389
SVM                          0.6054     0.6054  0.6054    0.6054    0.6193

SUBMISSION FILES: logistic_submission.csv, svm_submission.csv, mlp_submission.csv
