In [1]:
#Import all libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error




# Load and figure the dataset

In [2]:
#Load the data

df_test = pd.read_csv('test.csv')
df_train = pd.read_csv('train.csv')





In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Check for missing values



In [4]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
age_count = df_train['Age'].count()
cabin_count = df_train['Cabin'].count()
embarked_count = df_train['Embarked'].count()
survived_count = df_train['Survived'].count()

print('Age count:', age_count)
print('Cabin count:', cabin_count)
print('Embarked count:', embarked_count)
print('Survived count:', survived_count)


Age count: 714
Cabin count: 204
Embarked count: 889
Survived count: 891


In [6]:
#missing %%

age_missing = df_train['Age'].isna().sum() / age_count * 100
cabin_missing = df_train['Cabin'].isna().sum() / cabin_count * 100
embarked_missing = df_train['Embarked'].isna().sum() / embarked_count * 100
survived_missing = df_train['Survived'].isna().sum() / survived_count * 100

print('Age missing:', age_missing)
print('Cabin missing:', cabin_missing)
print('Embarked missing:', embarked_missing)
print('Survived missing:', survived_missing)


Age missing: 24.789915966386555
Cabin missing: 336.7647058823529
Embarked missing: 0.22497187851518563
Survived missing: 0.0


In [7]:
df_train['Embarked'] = df_train['Embarked'].fillna('Unknown')



In [8]:
df_train = df_train.drop('Cabin', axis=1)


In [9]:
df_train.groupby(['Pclass', 'Sex', 'Parch','SibSp'])['Age'].median()

Pclass  Sex     Parch  SibSp
1       female  0      0        32.5
                       1        35.0
                       2        53.0
                1      0        43.0
                       1        40.0
                                ... 
3       male    2      5        10.0
                       8         NaN
                3      1        16.0
                4      1        40.0
                5      1        39.0
Name: Age, Length: 74, dtype: float64

In [10]:
import pandas as pd

# Assuming df_train is your DataFrame

# Discretize the Fare column into bins
df_train['Fare_bin'] = pd.qcut(df_train['Fare'], 4)

# Define a function to fill missing values with the median of the group
def fill_age(row, grouped_medians):
    if pd.isnull(row['Age']):
        return grouped_medians.loc[row['Pclass'], row['Sex'], row['Parch'], row['SibSp'], row['Fare_bin']]
    else:
        return row['Age']

# Calculate the median age for each group
grouped_medians = df_train.groupby(['Pclass', 'Sex', 'Parch', 'SibSp', 'Fare_bin'])['Age'].median()

# Apply the function to fill missing values
df_train['Age'] = df_train.apply(lambda row: fill_age(row, grouped_medians), axis=1)

# Check for remaining missing values in Age
missing_age_count = df_train['Age'].isna().sum()
print(f"Remaining missing values in Age: {missing_age_count}")

# If there are still missing values, fill them with the overall median age
if missing_age_count > 0:
    overall_median_age = df_train['Age'].median()
    df_train['Age'].fillna(overall_median_age, inplace=True)

# Drop the Fare_bin column as it's no longer needed
df_train = df_train.drop('Fare_bin', axis=1)

# Print the DataFrame to verify the changes
print(df_train)

Remaining missing values in Age: 16
     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                               

  grouped_medians = df_train.groupby(['Pclass', 'Sex', 'Parch', 'SibSp', 'Fare_bin'])['Age'].median()


In [11]:
df_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [12]:
df_train = df_train.drop(['Name', 'Ticket'], axis=1, errors='ignore')


In [13]:
df_train.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


In [16]:
df_test['Embarked'] = df_test['Embarked'].fillna('Unknown')
df_test = df_test.drop('Cabin', axis=1)
df_test.groupby(['Pclass', 'Sex', 'Parch','SibSp'])['Age'].median()

Pclass  Sex     Parch  SibSp
1       female  0      0        36.0
                       1        37.0
                       2        57.0
                1      0        45.0
                       1        50.0
                                ... 
3       male    2      4         9.0
                       8        14.5
                5      1        40.0
                6      1        40.0
                9      1         NaN
Name: Age, Length: 61, dtype: float64

In [17]:
import pandas as pd

# Assuming df_test is your DataFrame

# Discretize the Fare column into bins
df_test['Fare_bin'] = pd.qcut(df_test['Fare'], 4)

# Define a function to fill missing values with the median of the group
def fill_age(row, grouped_medians):
    if pd.isnull(row['Age']):
        return grouped_medians.loc[row['Pclass'], row['Sex'], row['Parch'], row['SibSp'], row['Fare_bin']]
    else:
        return row['Age']

# Calculate the median age for each group
grouped_medians = df_test.groupby(['Pclass', 'Sex', 'Parch', 'SibSp', 'Fare_bin'])['Age'].median()

# Apply the function to fill missing values
df_test['Age'] = df_test.apply(lambda row: fill_age(row, grouped_medians), axis=1)

# Check for remaining missing values in Age
missing_age_count = df_test['Age'].isna().sum()
print(f"Remaining missing values in Age: {missing_age_count}")

# If there are still missing values, fill them with the overall median age
if missing_age_count > 0:
    overall_median_age = df_test['Age'].median()
    df_test['Age'].fillna(overall_median_age, inplace=True)

# Drop the Fare_bin column as it's no longer needed
df_test = df_test.drop('Fare_bin', axis=1)

# Print the DataFrame to verify the changes
print(df_test)

Remaining missing values in Age: 9
     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Embarked  
0  

  grouped_medians = df_test.groupby(['Pclass', 'Sex', 'Parch', 'SibSp', 'Fare_bin'])['Age'].median()


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Define numerical and categorical columns
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch']
categorical_features = ['Sex', 'Embarked']

# Create transformers for scaling and one-hot encoding
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the train set
X_train = df_train.drop(['Survived', 'PassengerId'], axis=1)
y_train = df_train['Survived']
X_train_transformed = pipeline.fit_transform(X_train)

# Transform the test set
X_test = df_test.drop(['PassengerId', 'Name', 'Ticket'], axis=1, errors='ignore')
X_test_transformed = pipeline.transform(X_test)