In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Classification models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# XGBoost
from xgboost import XGBClassifier, XGBRegressor

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, mean_absolute_error, r2_score
)

import joblib



In [23]:
df=pd.read_csv('titanic_1000_rows.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,1,male,57.0,1,0,3.37,S
1,2,1,3,male,72.0,4,1,22.74,
2,3,1,3,female,7.0,2,3,15.11,S
3,4,0,3,male,78.0,4,3,28.49,S
4,5,0,3,male,56.0,4,2,49.14,C


In [24]:
TARGET = "Survived"

X = df.drop(columns=[TARGET])
y = df[TARGET]


In [25]:
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

print("Numerical:", num_cols)
print("Categorical:", cat_cols)


Numerical: Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
Categorical: Index(['Sex', 'Embarked'], dtype='object')


In [26]:
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])


In [27]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [28]:
def detect_problem_type(y):
    if y.dtype == "object":
        return "classification"
    if y.nunique() <= 20:
        return "classification"
    return "regression"

problem_type = detect_problem_type(y)
problem_type


'classification'

In [32]:
if problem_type == "classification":
    models = {
        "Logistic Regression": {
            "model": LogisticRegression(max_iter=2000, solver="liblinear"),
            "params": {
                "model__C": [0.01, 0.1, 1, 5, 10],
                "model__penalty": ["l1", "l2"]
            }
        },

        "Decision Tree": {
            "model": DecisionTreeClassifier(random_state=42),
            "params": {
                "model__max_depth": [None, 5, 10, 20],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 2, 4],
                "model__criterion": ["gini", "entropy"]
            }
        },

        "Random Forest": {
            "model": RandomForestClassifier(random_state=42),
            "params": {
                "model__n_estimators": [200, 400],
                "model__max_depth": [None, 10, 20],
                "model__min_samples_split": [2, 5],
                "model__min_samples_leaf": [1, 2],
                "model__max_features": ["sqrt", "log2"]
            }
        },

        "KNN": {
            "model": KNeighborsClassifier(),
            "params": {
                "model__n_neighbors": [3, 5, 7, 9],
                "model__weights": ["uniform", "distance"],
                "model__metric": ["euclidean", "manhattan"]
            }
        },

        "Naive Bayes": {
            "model": GaussianNB(),
            "params": {
                "model__var_smoothing": [1e-9, 1e-8, 1e-7]
            }
        },

        "XGBoost": {
            "model": XGBClassifier(
                eval_metric="logloss",
                use_label_encoder=False,
                random_state=42
            ),
            "params": {
                "model__n_estimators": [200, 400],
                "model__max_depth": [3, 5, 7],
                "model__learning_rate": [0.01, 0.05, 0.1],
                "model__subsample": [0.8, 1.0],
                "model__colsample_bytree": [0.8, 1.0]
            }
        }
    }
else:
    models = {
        "Linear Regression": {
            "model": LinearRegression(),
            "params": {}
        },

        "Polynomial Regression": {
            "model": Pipeline([
                ("poly", PolynomialFeatures(include_bias=False)),
                ("lr", LinearRegression())
            ]),
            "params": {
                "model__poly__degree": [2, 3]
            }
        },

        "Decision Tree": {
            "model": DecisionTreeRegressor(random_state=42),
            "params": {
                "model__max_depth": [None, 5, 10, 20],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 2, 4]
            }
        },

        "Random Forest": {
            "model": RandomForestRegressor(random_state=42),
            "params": {
                "model__n_estimators": [200, 400],
                "model__max_depth": [None, 10, 20],
                "model__min_samples_split": [2, 5],
                "model__min_samples_leaf": [1, 2],
                "model__max_features": ["sqrt", "log2"]
            }
        },

        "KNN": {
            "model": KNeighborsRegressor(),
            "params": {
                "model__n_neighbors": [3, 5, 7, 9],
                "model__weights": ["uniform", "distance"],
                "model__metric": ["euclidean", "manhattan"]
            }
        },

        "XGBoost": {
            "model": XGBRegressor(random_state=42),
            "params": {
                "model__n_estimators": [200, 400],
                "model__max_depth": [3, 5, 7],
                "model__learning_rate": [0.01, 0.05, 0.1],
                "model__subsample": [0.8, 1.0],
                "model__colsample_bytree": [0.8, 1.0]
            }
        }
    }



In [33]:
best_model = None
best_score = -np.inf
best_model_name = None

for name, config in models.items():
    print(f"\nTraining {name}...")

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", config["model"])
    ])

    grid = GridSearchCV(
        pipeline,
        config["params"],
        cv=5,
        scoring="accuracy" if problem_type=="classification" else "r2",
        n_jobs=-1
    )

    grid.fit(X_train, y_train)
    print("Best CV Score:", grid.best_score_)

    if grid.best_score_ > best_score:
        best_score = grid.best_score_
        best_model = grid.best_estimator_
        best_model_name = name



Training Logistic Regression...
Best CV Score: 0.62375

Training Decision Tree...
Best CV Score: 0.5912499999999999

Training Random Forest...
Best CV Score: 0.595

Training KNN...
Best CV Score: 0.56125

Training Naive Bayes...
Best CV Score: 0.575

Training XGBoost...
Best CV Score: 0.61375


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [34]:
y_pred = best_model.predict(X_test)

if problem_type == "classification":
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    }
else:
    metrics = {
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "MAE": mean_absolute_error(y_test, y_pred),
        "R2 Score": r2_score(y_test, y_pred)
    }

print("Best Model:", best_model_name)
pd.DataFrame(metrics.items(), columns=["Metric", "Score"])


Best Model: Logistic Regression


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Metric,Score
0,Accuracy,0.665
1,Precision,0.0
2,Recall,0.0
3,F1 Score,0.0


In [35]:
joblib.dump(best_model, "best_model.pkl")


['best_model.pkl']

In [36]:
sample = {
    "PassengerId": 3001,
    "Pclass": 3,
    "Sex": "male",
    "Age": 32,
    "SibSp": 0,
    "Parch": 0,
    "Fare": 12.5,
    "Embarked": "S"
}

sample_df = pd.DataFrame([sample])
best_model.predict(sample_df)


array([0])