## Loading Required Libraries

In [42]:
import pandas as pd
import numpy as np

## Read the dataset 

In [43]:
df = pd.read_csv('../Datasets/titanic.csv')

## Explore the Dataset

In [44]:
# df.head() is a method used to view the first few rows of a DataFrame.
print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


## Summary Statistics:

df.describe() is a method used to generate summary statistics of the numeric columns in a DataFrame. It provides an overview of key statistics such as count, mean, standard deviation, minimum, maximum, and percentiles.

In [45]:
print(df.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


## Checking and Handling Missing Values

In [46]:
print(df.isnull().sum())


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


## Filling missing values using Imputer from Scikit-learn

In [47]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
# https://scikit-learn.org/1.5/modules/generated/sklearn.impute.SimpleImputer.html


df['Age'] = imputer.fit_transform(df[['Age']])



## Dropping rows which have missing "Embarked"


In [48]:
df = df.dropna(subset=['Embarked'])

# Encoding Embarked Categorical variable using One-Hot Encoder

In [49]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
embarked_encoded = encoder.fit_transform(df[["Embarked"]])

encoded_df = pd.DataFrame(embarked_encoded.toarray(), columns=encoder.get_feature_names_out(["Embarked"]))

df = pd.concat([df.drop('Embarked', axis=1),encoded_df], axis=1)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1.0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,0.0,0.0,1.0
1,2.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,1.0,0.0,0.0
2,3.0,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,0.0,0.0,1.0
3,4.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,0.0,0.0,1.0
4,5.0,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,0.0,0.0,1.0


## Label encoding the Sex of the Passengers


In [50]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

df['Sex'] = label_encoder.fit_transform(df['Sex'])

print(df)

     PassengerId  Survived  Pclass  \
0            1.0       0.0     3.0   
1            2.0       1.0     1.0   
2            3.0       1.0     3.0   
3            4.0       1.0     1.0   
4            5.0       0.0     3.0   
..           ...       ...     ...   
888        889.0       0.0     3.0   
889        890.0       1.0     1.0   
890        891.0       0.0     3.0   
61           NaN       NaN     NaN   
829          NaN       NaN     NaN   

                                                  Name  Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    1  22.0    1.0   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0    1.0   
2                               Heikkinen, Miss. Laina    0  26.0    0.0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0    1.0   
4                             Allen, Mr. William Henry    1  35.0    0.0   
..                                                 ...  ...   ...    ...   
888        

## Scaling Numerical Features

In [53]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

df['Fare_Scaled'] = scaler.fit_transform(df[['Fare']])

print(df[['Fare', 'Fare_Scaled']].head())

      Fare  Fare_Scaled
0   7.2500     0.014151
1  71.2833     0.139136
2   7.9250     0.015469
3  53.1000     0.103644
4   8.0500     0.015713
