In [None]:
import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [6]:
# Raw inputs: the training set, the test set, and gender_submission.csv,
# whose 'Survived' column is used later as the test labels.
df_train = pd.read_csv(filepath_or_buffer='Titanic-Dataset.csv')
df_test = pd.read_csv(filepath_or_buffer='test.csv')
df_survive_test = pd.read_csv(filepath_or_buffer="gender_submission.csv")

In [41]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Feature engineering

### Feature Selection: Dropping unnecessary columns


In [25]:
# Columns this notebook treats as unnecessary for modelling.
# One shared list keeps the train and test frames in sync.
COLUMNS_TO_DROP = ['Name', 'Ticket', 'PassengerId']

# The frames are rebound to their own result, so on a second run of this cell
# the columns are already gone; errors='ignore' keeps the re-run from raising.
df_train = df_train.drop(columns=COLUMNS_TO_DROP, errors='ignore')
df_test = df_test.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [26]:
# Preview the first five rows of the training frame after the column drop.
df_train.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


### Feature construction: Merge SibSp and Parch

In [27]:
def add_family_size(df):
    """Return a frame where 'SibSp' and 'Parch' are merged into 'FamilySize'.

    FamilySize = SibSp + Parch + 1 (the passenger themself). The two source
    columns are removed from the result. The input frame is not modified.

    If the source columns are no longer present (for example because this
    cell has already been run), the frame is returned unchanged so the cell
    can be re-executed safely.
    """
    source_cols = ['SibSp', 'Parch']
    if not set(source_cols).issubset(df.columns):
        return df
    return (
        df.assign(FamilySize=df['SibSp'] + df['Parch'] + 1)
          .drop(columns=source_cols)
    )


# Same transformation for both splits, via one helper instead of copy-paste.
df_train = add_family_size(df_train)
df_test = add_family_size(df_test)

In [28]:
# Preview the first five rows of the training frame with FamilySize added.
df_train.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,FamilySize
0,0,3,male,22.0,7.25,,S,2
1,1,1,female,38.0,71.2833,C85,C,2
2,1,3,female,26.0,7.925,,S,1
3,1,1,female,35.0,53.1,C123,S,2
4,0,3,male,35.0,8.05,,S,1


In [None]:
# Transformer for Outlier Clipping
class OutlierClipper(BaseEstimator, TransformerMixin):
    """Clip columns to IQR-based fences learned during ``fit``.

    For every selected column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``; ``transform`` clips values lying outside them.

    Parameters
    ----------
    cols : list of str, optional
        Columns to clip. ``None`` selects every numeric column of the frame
        passed to ``fit``.
    factor : float, default=1.5
        Multiplier applied to the inter-quartile range.

    Attributes
    ----------
    bounds : dict
        ``{column: (lower, upper)}`` learned by the most recent ``fit``.
    """

    def __init__(self, cols=None, factor=1.5):
        self.cols = cols
        self.factor = factor
        # Present before fitting so transform() on an unfitted instance is a
        # no-op, as before; rebuilt from scratch on every fit().
        self.bounds = {}

    def fit(self, X, y=None):
        """Learn the clipping fences for each selected column of ``X``."""
        # Resolve the columns into a local name. Overwriting self.cols would
        # change a constructor parameter, so a later fit on a different frame
        # would silently reuse the first frame's column list.
        if self.cols is None:
            cols = X.select_dtypes(include='number').columns.tolist()
        else:
            cols = list(self.cols)

        # Build into a fresh dict so bounds from a previous fit never linger.
        bounds = {}
        for col in cols:
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            bounds[col] = (q1 - self.factor * iqr, q3 + self.factor * iqr)
        self.bounds = bounds
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each fitted column clipped to its fences."""
        X_copy = X.copy()
        for col, (lower, upper) in self.bounds.items():
            X_copy[col] = X_copy[col].clip(lower, upper)
        return X_copy


# Transformer for Missing Values
class Titatnic_Imputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the Titanic frame and drop the 'Cabin' column.

    * 'Pclass', 'Age', 'FamilySize' and 'Fare' are passed together through a
      ``KNNImputer`` (5 neighbours) and written back to the frame.
    * Missing 'Embarked' values are replaced by the mode seen during ``fit``.
    * 'Cabin' is removed from the output.
    """

    # Columns fed to the KNN imputer. Defined once so fit() and transform()
    # cannot drift apart.
    _NUMERICAL_COLS = ['Pclass', 'Age', 'FamilySize', 'Fare']

    def __init__(self):
        self.knn_imputer = KNNImputer(n_neighbors=5)
        self.mode_embarked = None

    def fit(self, X, y=None):
        """Learn the 'Embarked' mode and fit the KNN imputer on ``X``."""
        # Store mode of Embarked
        self.mode_embarked = X['Embarked'].mode()[0]
        # Fit KNN imputer on numerical columns
        self.knn_imputer.fit(X[self._NUMERICAL_COLS])
        return self

    def transform(self, X):
        """Return an imputed copy of ``X`` without the 'Cabin' column."""
        X_copy = X.copy()
        # KNN imputation over the numerical columns
        X_copy[self._NUMERICAL_COLS] = self.knn_imputer.transform(
            X_copy[self._NUMERICAL_COLS]
        )
        # Mode imputation for Embarked. Assign the result back instead of
        # calling fillna(inplace=True) on the column selection: pandas warns
        # that the chained in-place form stops working in pandas 3.0.
        X_copy['Embarked'] = X_copy['Embarked'].fillna(self.mode_embarked)
        # Drop Cabin
        return X_copy.drop(columns='Cabin')
# Transformer for Encoding
class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Turn the categorical columns 'Sex' and 'Embarked' into numbers.

    'Sex' is replaced by the codes of a ``LabelEncoder``. 'Embarked' is
    replaced by one-hot columns (first category dropped) produced by a
    ``OneHotEncoder``.
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.onehot_encoder = OneHotEncoder(
            drop='first',
            sparse_output=False,
            handle_unknown='ignore',
        )

    def fit(self, X, y=None):
        """Fit both encoders on the 'Sex' and 'Embarked' columns of ``X``."""
        self.label_encoder.fit(X['Sex'])
        self.onehot_encoder.fit(X[['Embarked']])
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``; the original 'Embarked' is removed."""
        encoded = X.copy()

        # 'Sex' is overwritten with its label codes.
        encoded['Sex'] = self.label_encoder.transform(encoded['Sex'])

        # 'Embarked' expands into one column per retained category.
        dummy_values = self.onehot_encoder.transform(encoded[['Embarked']])
        dummy_names = self.onehot_encoder.get_feature_names_out(['Embarked'])
        encoded[dummy_names] = dummy_values

        return encoded.drop(columns='Embarked')
# Transformer for Scaling
class FeatureScaler(BaseEstimator, TransformerMixin):
    """Apply a ``StandardScaler`` to 'Age', 'Fare' and 'FamilySize'.

    All other columns pass through unchanged.
    """

    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the scaler on the three scaled columns of ``X``."""
        self.scaler.fit(X[['Age', 'Fare', 'FamilySize']])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the three columns replaced by scaled values."""
        scaled_cols = ['Age', 'Fare', 'FamilySize']
        result = X.copy()
        result[scaled_cols] = self.scaler.transform(result[scaled_cols])
        return result
    
# Preprocessing chain: impute -> encode -> clip outliers -> scale.
pipeline = Pipeline(
    steps=[
        ('age_imputer', Titatnic_Imputer()),
        ('feature_encoder', FeatureEncoder()),
        # Only Age and Fare are clipped.
        ('outlier_clipper', OutlierClipper(cols=['Age', 'Fare'])),
        ('feature_scaler', FeatureScaler()),
    ]
)

# Every statistic is learned from the training frame only;
# the test frame is transformed with those fitted values.
df_transformed = pipeline.fit_transform(df_train)
df_transformed_test = pipeline.transform(df_test)

# Persist both processed frames without the index column.
df_transformed.to_csv(path_or_buf='titanic_train_ready.csv', index=False)
df_transformed_test.to_csv(path_or_buf='titanic_test_ready.csv', index=False)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_copy['Embarked'].fillna(self.mode_embarked, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_copy['Embarked'].fillna(self.mode_embarked, inplace=True)


In [2]:
# Locations of the processed CSVs written by the preprocessing pipeline cell.
train_csv = Path('titanic_train_ready.csv')
test_csv = Path('titanic_test_ready.csv')

# Reload the processed frames from disk.
train_processed = pd.read_csv(filepath_or_buffer=train_csv)
test_processed = pd.read_csv(filepath_or_buffer=test_csv)

In [3]:
# Preview the first five rows of the processed training frame.
train_processed.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,FamilySize,Embarked_Q,Embarked_S
0,0,3.0,1,-0.601585,-0.820552,0.05916,0.0,1.0
1,1,1.0,0,0.628073,2.031623,0.05916,0.0,0.0
2,1,3.0,0,-0.29417,-0.787578,-0.560975,0.0,1.0
3,1,1.0,0,0.397512,1.419297,0.05916,0.0,1.0
4,0,3.0,1,0.397512,-0.781471,-0.560975,0.0,1.0


In [4]:
# Preview the first five rows of the processed test frame.
test_processed.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare,FamilySize,Embarked_Q,Embarked_S
0,3.0,1,0.359085,-0.792258,-0.560975,1.0,0.0
1,3.0,0,1.319755,-0.832765,0.05916,0.0,1.0
2,2.0,1,2.280425,-0.701476,-0.560975,1.0,0.0
3,3.0,1,-0.217317,-0.751549,-0.560975,0.0,1.0
4,3.0,0,-0.601585,-0.574462,0.679295,0.0,1.0


## Training 

In [7]:
# Training split: separate the target column from the processed features.
y_train = train_processed['Survived']
X_train = train_processed.drop(columns=['Survived'])

# Test split: features from the processed test file, labels from the frame
# loaded from gender_submission.csv (matched to X_test by row position).
# NOTE(review): gender_submission.csv looks like a sample-submission file
# rather than ground-truth labels — confirm before trusting test accuracy.
X_test, y_test = test_processed, df_survive_test['Survived']

In [8]:
# Two classifiers to compare; both use a fixed seed for reproducibility.
lr = LogisticRegression(random_state=42, max_iter=1000)
dt = DecisionTreeClassifier(random_state=42)


In [9]:
# Fit both models on the same training split.
lr.fit(X=X_train, y=y_train)
# Last expression of the cell: the fitted tree is what gets displayed.
dt.fit(X=X_train, y=y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [10]:
# Predictions of each fitted model on the processed test features.
y_pred_lr = lr.predict(X=X_test)
y_pred_dt = dt.predict(X=X_test)

In [11]:
# Share of test rows where each model's prediction equals y_test.
acc_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
acc_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)

(acc_lr, acc_dt)

(0.9330143540669856, 0.8110047846889952)

# Results & Conclusion

## Preprocessing Steps
The following preprocessing techniques were applied before model training:
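- **Missing Values:** KNN imputation (numeric columns, including `Age`) and mode imputation (`Embarked`); `Cabin` is dropped
- **Categorical Variables:** One-Hot Encoding (`Embarked`) and Label Encoding (`Sex`)
- **Outliers:** IQR-based clipping of `Age` and `Fare`
- **Feature Scaling:** Standardization (`StandardScaler`) of `Age`, `Fare` and `FamilySize`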

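## Results

**Caveat:** the accuracies below are computed against the `Survived` column of `gender_submission.csv`. If that file is Kaggle's sample submission (which predicts survival from gender alone) rather than the true test labels, these numbers measure agreement with a gender-only baseline, not real predictive accuracy.

- **Logistic Regression**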
  - Achieved an accuracy of **93.3%**
  - Performs well because it:
    - Handles linear relationships effectively  
    - Avoids overfitting on small datasets  

- **Decision Tree**
  - Achieved an accuracy of **81.1%**
  - While flexible, it:
    - Tends to overfit small datasets like Titanic  
    - Needs pruning or regularization for better generalization  

### Model Comparison

| Model               | Accuracy | Strengths                          | Weaknesses                          |
|---------------------|----------|------------------------------------|-------------------------------------|
| Logistic Regression | 93.3%    | Stable, generalizes well, avoids overfitting | Limited to linear relationships     |
| Decision Tree       | 81.1%    | Flexible, interpretable            | Prone to overfitting without tuning |

## Conclusion
- **Logistic Regression** is more stable and generalizes better on this dataset.  
- **Decision Trees** underperform without hyperparameter tuning. They can improve with techniques such as:
  - Limiting maximum depth  
  - Using minimum samples per split/leaf  
  - Applying pruning or ensemble methods (e.g., Random Forest, Gradient Boosting)  
