# NeoByte Task - 4




* For this task, I chose to go ahead with developing a **Classification Model** (Titanic Survival Prediction).
* The dataset that I chose was: **"Titanic: Machine Learning from Disaster"**




# Loading and checking Dataset

In [1]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")

# Display first few rows
print("Train Data:")
print(train_df.head())

print("\nTest Data:")
print(test_df.head())

# Check dataset info
print("\nTrain Data Info:")
print(train_df.info())

print("\nMissing Values in Train Data:")
print(train_df.isnull().sum())


Train Data:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN  

# Data Preprocessing

**Handling Missing Values**

In [2]:
# Handling missing values

# Filling missing Age values with the median
train_df["Age"].fillna(train_df["Age"].median(), inplace=True)
test_df["Age"].fillna(test_df["Age"].median(), inplace=True)

# Droping Cabin column (too many missing values)
#train_df.drop(columns=["Cabin"], inplace=True)
#test_df.drop(columns=["Cabin"], inplace=True)

# Filling missing cabin values with 'Unknown' and extract only the first letter (deck)
train_df["Cabin"] = train_df["Cabin"].fillna("Unknown").apply(lambda x: x[0])
test_df["Cabin"] = test_df["Cabin"].fillna("Unknown").apply(lambda x: x[0])

# Grouping less frequent decks as "Other"
deck_counts = train_df["Cabin"].value_counts()
rare_decks = deck_counts[deck_counts < 10].index  # Adjust threshold as needed
train_df["Cabin"] = train_df["Cabin"].apply(lambda x: "Other" if x in rare_decks else x)
test_df["Cabin"] = test_df["Cabin"].apply(lambda x: "Other" if x in rare_decks else x)

# Filling missing Embarked values with most frequent value (mode)
train_df["Embarked"].fillna(train_df["Embarked"].mode()[0], inplace=True)
test_df["Embarked"].fillna(train_df["Embarked"].mode()[0], inplace=True)

# Checking again for missing values
print("Missing Values after Cleaning:")
print(train_df.isnull().sum())


Missing Values after Cleaning:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df["Age"].fillna(train_df["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["Age"].fillna(test_df["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

**Encoding Categorical Features**

Machine Learning models work with numerical data, so we need to convert categorical values into numbers.

In [3]:
from sklearn.preprocessing import OneHotEncoder

# Converting "Sex" column to numerical (Male = 0, Female = 1)
train_df["Sex"] = train_df["Sex"].map({"male": 0, "female": 1})
test_df["Sex"] = test_df["Sex"].map({"male": 0, "female": 1})

# One-Hot Encode "Cabin"
train_df = pd.get_dummies(train_df, columns=["Cabin"], drop_first=True)
test_df = pd.get_dummies(test_df, columns=["Cabin"], drop_first=True)

# One-Hot Encoding for "Embarked" column
train_df = pd.get_dummies(train_df, columns=["Embarked"], drop_first=True)
test_df = pd.get_dummies(test_df, columns=["Embarked"], drop_first=True)

# Ensuring test set has the same feature columns as train set
train_columns = train_df.columns
test_df = test_df.reindex(columns=train_columns, fill_value=0)

# Checking for any remaining missing values
print("Missing Values in Train Set:\n", train_df.isnull().sum().sum())
print("Missing Values in Test Set:\n", test_df.isnull().sum().sum())

# Displaying first few rows after encoding
print(train_df.head())


Missing Values in Train Set:
 0
Missing Values in Test Set:
 1
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket     Fare  Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  \
0         A/5 21171   7.2500    False    False    False    False    False   
1          PC 17599  71.2833    False     True    False    False    False   
2 

**Fixing Missing Values in Test Set**

In [4]:
print(test_df.isnull().sum()[test_df.isnull().sum() > 0]) #checks which column has NaN value in test set

Fare    1
dtype: int64


In [6]:
test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True) #Fixing the missing value
print("Missing Values in Test Set:\n", test_df.isnull().sum().sum()) #again checking for missing val4ues. 

Missing Values in Test Set:
 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True) #Fixing the missing value


**Feature Selection & Scaling**

In [8]:
from sklearn.preprocessing import StandardScaler

# Selecting features for training
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked_Q", "Embarked_S"]

# Adding all Cabin-related features dynamically
cabin_features = [col for col in train_df.columns if col.startswith("Cabin_")]
features.extend(cabin_features)  

X_train = train_df[features]
y_train = train_df["Survived"]
X_test = test_df[features]

# Scaling "Age" and "Fare" using StandardScaler
scaler = StandardScaler()
X_train[["Age", "Fare"]] = scaler.fit_transform(X_train[["Age", "Fare"]])
X_test[["Age", "Fare"]] = scaler.transform(X_test[["Age", "Fare"]])  # Apply same scaling

# Display first few rows after scaling
print(X_train.head())


   Pclass  Sex       Age  SibSp  Parch      Fare  Embarked_Q  Embarked_S  \
0       3    0 -0.565736      1      0 -0.502445       False        True   
1       1    1  0.663861      1      0  0.786845       False       False   
2       3    1 -0.258337      0      0 -0.488854       False        True   
3       1    1  0.433312      1      0  0.420730       False        True   
4       3    0  0.433312      0      0 -0.486337       False        True   

   Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_Other  Cabin_U  
0    False    False    False    False    False        False     True  
1    False     True    False    False    False        False    False  
2    False    False    False    False    False        False     True  
3    False     True    False    False    False        False    False  
4    False    False    False    False    False        False     True  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[["Age", "Fare"]] = scaler.fit_transform(X_train[["Age", "Fare"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[["Age", "Fare"]] = scaler.transform(X_test[["Age", "Fare"]])  # Apply same scaling


In [9]:
%who #checks all saved variables

print(train_df.dtypes)  # Check data types of all columns
print(train_df.head())


No variables match your requested type.
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin_B           bool
Cabin_C           bool
Cabin_D           bool
Cabin_E           bool
Cabin_F           bool
Cabin_Other       bool
Cabin_U           bool
Embarked_Q        bool
Embarked_S        bool
dtype: object
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. La

# Model Training

**Using Logistic Regression**

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Dropping "Name" and "Ticket" since they are non-numeric
train_df = train_df.drop(["Name", "Ticket"], axis=1)
test_df = test_df.drop(["Name", "Ticket"], axis=1)

# Define Features (X) & Target (y)
X = train_df.drop("Survived", axis=1)  # Features
y = train_df["Survived"]  # Target variable

# Split the Data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#  Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

#  Predict on Test Set
y_pred = model.predict(X_test)

#  Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Model Accuracy: 0.8045

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.84       105
           1       0.77      0.74      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.80      0.80      0.80       179



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Using Random Forest Classifier**

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Define Features and Target
X = train_df.drop(columns=["Survived", "PassengerId"])  # Drop target and ID
y = train_df["Survived"]

# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardizing "Age" and "Fare"
scaler = StandardScaler()
X_train[["Age", "Fare"]] = scaler.fit_transform(X_train[["Age", "Fare"]])
X_val[["Age", "Fare"]] = scaler.transform(X_val[["Age", "Fare"]])

# Initialize & Train Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=150,  # Increased trees for better learning
    max_depth=10,      # Limiting depth to prevent overfitting
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_val)

# Evaluation Metrics
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))

# Feature Importance
importances = rf_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances}).sort_values(by="Importance", ascending=False)

# Display Top 10 Features
print("\nTop 10 Important Features:\n", feature_importance_df.head(10))


Accuracy: 0.7988826815642458

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.89      0.84       110
           1       0.79      0.65      0.71        69

    accuracy                           0.80       179
   macro avg       0.80      0.77      0.78       179
weighted avg       0.80      0.80      0.79       179


Top 10 Important Features:
        Feature  Importance
1          Sex    0.339677
5         Fare    0.190863
2          Age    0.148483
0       Pclass    0.093968
12     Cabin_U    0.064622
3        SibSp    0.043961
4        Parch    0.037046
14  Embarked_S    0.026563
9      Cabin_E    0.011781
8      Cabin_D    0.010924


**Tried to fine-tune random forest classifier**

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Define Features and Target
X = train_df.drop(columns=["Survived", "PassengerId"])  # Drop target and ID
y = train_df["Survived"]

# Train-Validation Split (70-30 for better generalization)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Standardizing "Age" and "Fare"
scaler = StandardScaler()
X_train[["Age", "Fare"]] = scaler.fit_transform(X_train[["Age", "Fare"]])
X_val[["Age", "Fare"]] = scaler.transform(X_val[["Age", "Fare"]])

# Expanded Hyperparameter Grid for Random Forest
param_grid = {
    "n_estimators": [100, 300, 500, 700],         # More trees
    "max_depth": [10, 20, 30, None],              # Allow deeper trees
    "min_samples_split": [2, 5, 10, 15],          # Tuning split size
    "min_samples_leaf": [1, 2, 4, 6],             # Leaf size variations
    "max_features": ["sqrt", "log2", None],       # Feature selection for splits
    "bootstrap": [True, False]                    # Bootstrapping on/off
}

# Initialize Random Forest Model
rf_model = RandomForestClassifier(random_state=42)

# Randomized Search for Best Hyperparameters
rf_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=30,  # Increased search space
    cv=5,       # 5-Fold Cross-Validation
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit RandomizedSearchCV
rf_search.fit(X_train, y_train)

# Best Model from Search
best_rf_model = rf_search.best_estimator_

# Predictions
y_pred = best_rf_model.predict(X_val)

# Evaluation Metrics
print("\nBest Parameters:", rf_search.best_params_)
print("\nAccuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))

# Feature Importance
importances = best_rf_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances}).sort_values(by="Importance", ascending=False)

# Display Top 10 Features
print("\nTop 10 Important Features:\n", feature_importance_df.head(10))


Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best Parameters: {'n_estimators': 500, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}

Accuracy: 0.7947761194029851

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.84       165
           1       0.77      0.66      0.71       103

    accuracy                           0.79       268
   macro avg       0.79      0.77      0.78       268
weighted avg       0.79      0.79      0.79       268


Top 10 Important Features:
        Feature  Importance
1          Sex    0.388453
5         Fare    0.160095
2          Age    0.124298
0       Pclass    0.092662
12     Cabin_U    0.080392
4        Parch    0.039977
3        SibSp    0.035025
14  Embarked_S    0.022977
8      Cabin_D    0.012412
9      Cabin_E    0.010837


**Using XGBoost**

In [13]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Define Features and Target
X = train_df.drop(columns=["Survived", "PassengerId"])  # Drop target and ID
y = train_df["Survived"]

# Train-Validation Split (70-30 for better generalization)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Standardizing "Age" and "Fare"
scaler = StandardScaler()
X_train[["Age", "Fare"]] = scaler.fit_transform(X_train[["Age", "Fare"]])
X_val[["Age", "Fare"]] = scaler.transform(X_val[["Age", "Fare"]])

# Define XGBoost Classifier
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",  # Binary classification
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42
)

# Hyperparameter Grid
param_grid = {
    "n_estimators": [100, 300, 500, 700],   # More boosting rounds
    "learning_rate": [0.01, 0.05, 0.1, 0.2],  # Step size
    "max_depth": [3, 5, 7, 10],              # Tree depth
    "min_child_weight": [1, 3, 5],           # Minimum child weight
    "gamma": [0, 0.1, 0.2],                  # Minimum loss reduction
    "subsample": [0.7, 0.8, 0.9],            # % of data used for training
    "colsample_bytree": [0.7, 0.8, 1.0],     # Features used per tree
    "lambda": [0, 0.1, 1.0],                 # L2 regularization
    "alpha": [0, 0.1, 1.0]                   # L1 regularization
}

# Randomized Search for Best Hyperparameters
xgb_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=30,  # Increased search space
    cv=5,       # 5-Fold Cross-Validation
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Train XGBoost
xgb_search.fit(X_train, y_train)

# Best Model
best_xgb_model = xgb_search.best_estimator_

# Predictions
y_pred = best_xgb_model.predict(X_val)

# Evaluation Metrics
print("\nBest Parameters:", xgb_search.best_params_)
print("\nAccuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))


Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best Parameters: {'subsample': 0.7, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'lambda': 0, 'gamma': 0.1, 'colsample_bytree': 0.7, 'alpha': 0.1}

Accuracy: 0.8171641791044776

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.90      0.86       165
           1       0.81      0.68      0.74       103

    accuracy                           0.82       268
   macro avg       0.82      0.79      0.80       268
weighted avg       0.82      0.82      0.81       268



# Saving the trained model with model versioning

In [16]:
import os
import pickle
import re

# Find existing model versions in the directory
existing_models = [f for f in os.listdir() if re.match(r"xgb_model_v\d+\.pkl", f)]

if existing_models:
    # Extract version numbers and find the latest
    latest_version = max([int(re.search(r"v(\d+)", f).group(1)) for f in existing_models])
    new_version = latest_version + 1
else:
    new_version = 1  # First version

# Define filename with versioning
filename = f"xgb_model_v{new_version}.pkl"

# Save the model with the new version
with open(filename, "wb") as file:
    pickle.dump(best_xgb_model, file)

print(f"Model saved successfully as '{filename}'")



Model saved successfully as 'xgb_model_v1.pkl'
