#Titanic Dataset Preprocessing

In [97]:
#Data Collection
import pandas as pd
df = pd.read_csv("titanic_train.csv")
print(df.head(5))


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [98]:
#Handling null values
df.ffill(inplace=True) #Forward fill
df.bfill(inplace=True) #Backward fill
print(df.isnull().any())  #If any null values, returns True

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool


In [99]:
#Basic Data Exploration
print(df.index)
print(df.columns)
print(df.dtypes)

RangeIndex(start=0, stop=891, step=1)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


### Understanding each column

| Column       | Data Type             | Action                                     | Reason                                         |
|--------------|-----------------------|--------------------------------------------|------------------------------------------------|
| PassengerId  | int64                 | Drop                                       | Unique identifier, no predictive value          |
| Survived     | int64                 | Keep as target                             | Target variable (label)                          |
| Name         | object                | Drop (or extract titles before dropping)  | Raw name is not predictive, but titles may help|
| Sex          | object (categorical)  | Convert to binary numeric (0 for male, 1 for female) | Important categorical feature, avoid ordinal implication |
| Age          | float64               | Keep as is                                | Important numerical feature                      |
| SibSp        | int64                 | Keep as is, consider creating family size | Counts siblings/spouses aboard                    |
| Parch        | int64                 | Keep as is, consider creating family size | Counts parents/children aboard                    |
| Ticket       | object                | Extract prefix as categorical feature, then drop original | Raw ticket noisy; prefix may hold useful info    |
| Fare         | float64               | Keep as is, scale if necessary            | Reflects passenger class and socio-economic status |
| Cabin        | object                | Extract deck letter, impute missing values, encode | Deck location impacts survival                     |
| Embarked     | object                | Impute missing values, encode             | Port of embarkation affects survival             |
       |



In [100]:
#Handling different columns
#Drop ones that don't contribute
df.drop('PassengerId',axis=1,inplace=True)

In [101]:
#Encode nominal categorical values
df["Sex"]=df["Sex"].map({"male":0,"female":1})

In [102]:
#Extract useful prefixes
import re
df["Title"]=df["Name"].str.extract(r",\s*([^\.]+)\.")
df.drop("Name",axis=1,inplace=True)

df["Deck"] = df["Ticket"].str.extract(r"([A-Za-z\.]+)")
df["Deck"] = df["Deck"].fillna("NoPrefix")
df.drop("Ticket", axis=1, inplace=True)
#But too many prefixes for deck
threshold=10
prefix_counts = df["Deck"].value_counts()
rare_prefixes = prefix_counts[prefix_counts < threshold].index
df["Deck"] = df["Deck"].replace(rare_prefixes, 'Other')
# One-Hot Encode 'Deck' column, drop_first=True avoids dummy variable trap
df = pd.get_dummies(df, columns=["Deck"], drop_first=True)

In [103]:
#After data preprocessing
print(df.head(5))

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare Cabin Embarked Title  \
0         0       3    0  22.0      1      0   7.2500   C85        S    Mr   
1         1       1    1  38.0      1      0  71.2833   C85        C   Mrs   
2         1       3    1  26.0      0      0   7.9250   C85        S  Miss   
3         1       1    1  35.0      1      0  53.1000  C123        S   Mrs   
4         0       3    0  35.0      0      0   8.0500  C123        S    Mr   

   Deck_C.A.  Deck_NoPrefix  Deck_Other  Deck_PC  Deck_SC  Deck_SOTON  \
0      False          False       False    False    False       False   
1      False          False       False     True    False       False   
2      False          False       False    False    False       False   
3      False           True       False    False    False       False   
4      False           True       False    False    False       False   

   Deck_STON  
0      False  
1      False  
2       True  
3      False  
4      False  


In [104]:
#Data Splitting
from sklearn.model_selection import train_test_split
X=df.drop("Survived",axis=1)
y=df["Survived"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)
#stratify=y maintains the same class distribution in train and test sets.
#random_state=42 The number 42 is just a popular arbitrary choice among programmers