In [1]:
#Import all libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error




# Load and figure the dataset

In [2]:
#Load the data

df_test = pd.read_csv('test.csv')
df_train = pd.read_csv('train.csv')





In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Check for missing values



In [4]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
age_count = df_train['Age'].count()
cabin_count = df_train['Cabin'].count()
embarked_count = df_train['Embarked'].count()
survived_count = df_train['Survived'].count()

print('Age count:', age_count)
print('Cabin count:', cabin_count)
print('Embarked count:', embarked_count)
print('Survived count:', survived_count)


Age count: 714
Cabin count: 204
Embarked count: 889
Survived count: 891


In [6]:
#missing %%

age_missing = df_train['Age'].isna().sum() / age_count * 100
cabin_missing = df_train['Cabin'].isna().sum() / cabin_count * 100
embarked_missing = df_train['Embarked'].isna().sum() / embarked_count * 100
survived_missing = df_train['Survived'].isna().sum() / survived_count * 100

print('Age missing:', age_missing)
print('Cabin missing:', cabin_missing)
print('Embarked missing:', embarked_missing)
print('Survived missing:', survived_missing)


Age missing: 24.789915966386555
Cabin missing: 336.7647058823529
Embarked missing: 0.22497187851518563
Survived missing: 0.0


In [7]:
df_train['Embarked'] = df_train['Embarked'].fillna('Unknown')
df_train = df_train.drop('Cabin', axis=1)
df_train.groupby(['Pclass', 'Sex', 'Parch','SibSp'])['Age'].median()
df_train = df_train.drop(['Name', 'Ticket'], axis=1, errors='ignore')





In [8]:
df_test['Embarked'] = df_test['Embarked'].fillna('Unknown')
df_test = df_test.drop('Cabin', axis=1)
df_test.groupby(['Pclass', 'Sex', 'Parch', 'SibSp'])['Age'].median()
df_test = df_test.drop(['Name', 'Ticket'], axis=1, errors='ignore')

In [9]:


# Assuming df_train is your DataFrame

# Discretize the Fare column into bins
df_train['Fare_bin'] = pd.qcut(df_train['Fare'], 4)

# Define a function to fill missing values with the median of the group
def fill_age(row, grouped_medians):
    if pd.isnull(row['Age']):
        return grouped_medians.loc[row['Pclass'], row['Sex'], row['Parch'], row['SibSp'], row['Fare_bin']]
    else:
        return row['Age']

# Calculate the median age for each group
grouped_medians = df_train.groupby(['Pclass', 'Sex', 'Parch', 'SibSp', 'Fare_bin'])['Age'].median()

# Apply the function to fill missing values
df_train['Age'] = df_train.apply(lambda row: fill_age(row, grouped_medians), axis=1)

# Check for remaining missing values in Age
missing_age_count = df_train['Age'].isna().sum()
print(f"Remaining missing values in Age: {missing_age_count}")

# If there are still missing values, fill them with the overall median age
if missing_age_count > 0:
    overall_median_age = df_train['Age'].median()
    df_train['Age'].fillna(overall_median_age, inplace=True)

# Drop the Fare_bin column as it's no longer needed
df_train = df_train.drop('Fare_bin', axis=1)

# Print the DataFrame to verify the changes
print(df_train)

Remaining missing values in Age: 16
     PassengerId  Survived  Pclass     Sex   Age  SibSp  Parch     Fare  \
0              1         0       3    male  22.0      1      0   7.2500   
1              2         1       1  female  38.0      1      0  71.2833   
2              3         1       3  female  26.0      0      0   7.9250   
3              4         1       1  female  35.0      1      0  53.1000   
4              5         0       3    male  35.0      0      0   8.0500   
..           ...       ...     ...     ...   ...    ...    ...      ...   
886          887         0       2    male  27.0      0      0  13.0000   
887          888         1       1  female  19.0      0      0  30.0000   
888          889         0       3  female  27.5      1      2  23.4500   
889          890         1       1    male  26.0      0      0  30.0000   
890          891         0       3    male  32.0      0      0   7.7500   

    Embarked  
0          S  
1          C  
2          S  
3  

  grouped_medians = df_train.groupby(['Pclass', 'Sex', 'Parch', 'SibSp', 'Fare_bin'])['Age'].median()


In [None]:
import pandas as pd

# Assuming df_test is your DataFrame

# Discretize the Fare column into bins
df_test['Fare_bin'] = pd.qcut(df_test['Fare'], 4)

# Define a function to fill missing values with the median of the group
def fill_age(row, grouped_medians):
    if pd.isnull(row['Age']):
        return grouped_medians.loc[row['Pclass'], row['Sex'], row['Parch'], row['SibSp'], row['Fare_bin']]
    else:
        return row['Age']

# Calculate the median age for each group
grouped_medians = df_test.groupby(['Pclass', 'Sex', 'Parch', 'SibSp', 'Fare_bin'])['Age'].median()

# Apply the function to fill missing values
df_test['Age'] = df_test.apply(lambda row: fill_age(row, grouped_medians), axis=1)

# Check for remaining missing values in Age
missing_age_count = df_test['Age'].isna().sum()
print(f"Remaining missing values in Age: {missing_age_count}")

# If there are still missing values, fill them with the overall median age
if missing_age_count > 0:
    overall_median_age = df_test['Age'].median()
    df_test['Age'].fillna(overall_median_age, inplace=True)

# Drop the Fare_bin column as it's no longer needed
df_test = df_test.drop('Fare_bin', axis=1)

# Print the DataFrame to verify the changes
print(df_test)

Remaining missing values in Age: 9
     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Embarked  
0  

  grouped_medians = df_test.groupby(['Pclass', 'Sex', 'Parch', 'SibSp', 'Fare_bin'])['Age'].median()


In [13]:
df_train = df_train.drop('PassengerId', axis=1)
df_test = df_test.drop('PassengerId', axis=1)

In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [15]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [16]:


df_train = pd.get_dummies(df_train, dtype='int', drop_first=True)
df_test = pd.get_dummies(df_test, dtype='int', drop_first=True)



In [17]:
df_train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Embarked_Unknown
0,0,3,22.0,1,0,7.25,1,0,1,0
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1,0
3,1,1,35.0,1,0,53.1,0,0,1,0
4,0,3,35.0,0,0,8.05,1,0,1,0


In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Define numerical columns
numerical_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# Create a column transformer to scale numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='passthrough'  # Keep other columns as is
)

# Create a pipeline with the preprocessor and logistic regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Split the data into features and target
X = df_train.drop('Survived', axis=1)
y = df_train['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Evaluate the pipeline on the training and test data
train_accuracy = pipeline.score(X_train, y_train)
test_accuracy = pipeline.score(X_test, y_test)

print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")

Training accuracy: 0.8076
Test accuracy: 0.8156


In [20]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'classifier__penalty': ['l2'],  # Penalty type (L2 is supported by LogisticRegression with solver='lbfgs')
    'classifier__solver': ['lbfgs'],  # Solver for optimization
    'classifier__max_iter': [100, 200, 500]  # Maximum number of iterations
}

# Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # Use accuracy as the evaluation metric
    n_jobs=-1,  # Use all available processors
    verbose=1  # Print progress
)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Evaluate the best model on the test data
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print(f"Test accuracy with best parameters: {test_accuracy:.4f}")

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best parameters: {'classifier__C': 0.1, 'classifier__max_iter': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best cross-validation accuracy: 0.7949
Test accuracy with best parameters: 0.8268


In [21]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Replace LogisticRegression with XGBClassifier in the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

# Define the parameter grid for XGBoost
param_grid = {
    'classifier__n_estimators': [50, 100, 200],  # Number of trees
    'classifier__learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    'classifier__max_depth': [3, 5, 7],  # Maximum depth of trees
    'classifier__subsample': [0.8, 1.0],  # Subsample ratio
    'classifier__colsample_bytree': [0.8, 1.0]  # Feature subsampling
}

# Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # Use accuracy as the evaluation metric
    n_jobs=-1,  # Use all available processors
    verbose=1  # Print progress
)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Evaluate the best model on the test data
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print(f"Test accuracy with best parameters: {test_accuracy:.4f}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best parameters: {'classifier__colsample_bytree': 0.8, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 50, 'classifier__subsample': 0.8}
Best cross-validation accuracy: 0.8384
Test accuracy with best parameters: 0.8045


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [22]:
from sklearn.metrics import classification_report

# Evaluate the best model on the test data
y_pred = best_model.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.90      0.84       105
           1       0.82      0.68      0.74        74

    accuracy                           0.80       179
   macro avg       0.81      0.79      0.79       179
weighted avg       0.81      0.80      0.80       179

