#Titanic Dataset Preprocessing

In [1]:
#Data Collection
import pandas as pd
df = pd.read_csv("titanic_train.csv")
print(df.head(5))


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [2]:
#Handling null values
df.ffill(inplace=True) #Forward fill
df.bfill(inplace=True) #Backward fill
print(df.isnull().any())  #If any null values, returns True

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool


In [3]:
#Basic Data Exploration
print(df.index)
print(df.columns)
print(df.dtypes)

RangeIndex(start=0, stop=891, step=1)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


### Understanding each column

| Column       | Data Type             | Action                                     | Reason                                         |
|--------------|-----------------------|--------------------------------------------|------------------------------------------------|
| PassengerId  | int64                 | Drop                                       | Unique identifier, no predictive value          |
| Survived     | int64                 | Keep as target                             | Target variable (label)                          |
| Name         | object                | Drop (or extract titles before dropping)  | Raw name is not predictive, but titles may help|
| Sex          | object (categorical)  | Convert to binary numeric (0 for male, 1 for female) | Important categorical feature, avoid ordinal implication |
| Age          | float64               | Keep as is                                | Important numerical feature                      |
| SibSp        | int64                 | Keep as is, consider creating family size | Counts siblings/spouses aboard                    |
| Parch        | int64                 | Keep as is, consider creating family size | Counts parents/children aboard                    |
| Ticket       | object                | Extract prefix as categorical feature, then drop original | Raw ticket noisy; prefix may hold useful info    |
| Fare         | float64               | Keep as is, scale if necessary            | Reflects passenger class and socio-economic status |
| Cabin        | object                | Extract deck letter, impute missing values, encode | Deck location impacts survival                     |
| Embarked     | object                | Impute missing values, encode             | Port of embarkation affects survival             |
       |



In [4]:
#Handling different columns
#Drop ones that don't contribute
df.drop('PassengerId',axis=1,inplace=True)

In [5]:
#Encode nominal categorical values
df["Sex"]=df["Sex"].map({"male":0,"female":1})

In [6]:
#Extract useful prefixes
import re
df["Title"]=df["Name"].str.extract(r",\s*([^\.]+)\.")
df.drop("Name",axis=1,inplace=True)

df["Deck"] = df["Ticket"].str.extract(r"([A-Za-z\.]+)")
df["Deck"] = df["Deck"].fillna("NoPrefix")
df.drop("Ticket", axis=1, inplace=True)
#But too many prefixes for deck
threshold=10
prefix_counts = df["Deck"].value_counts()
rare_prefixes = prefix_counts[prefix_counts < threshold].index
df["Deck"] = df["Deck"].replace(rare_prefixes, 'Other')
# One-Hot Encode 'Deck' column, drop_first=True avoids dummy variable trap
df = pd.get_dummies(df, columns=["Deck"], drop_first=True)

In [7]:
#After data preprocessing
print(df.head(5))

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare Cabin Embarked Title  \
0         0       3    0  22.0      1      0   7.2500   C85        S    Mr   
1         1       1    1  38.0      1      0  71.2833   C85        C   Mrs   
2         1       3    1  26.0      0      0   7.9250   C85        S  Miss   
3         1       1    1  35.0      1      0  53.1000  C123        S   Mrs   
4         0       3    0  35.0      0      0   8.0500  C123        S    Mr   

   Deck_C.A.  Deck_NoPrefix  Deck_Other  Deck_PC  Deck_SC  Deck_SOTON  \
0      False          False       False    False    False       False   
1      False          False       False     True    False       False   
2      False          False       False    False    False       False   
3      False           True       False    False    False       False   
4      False           True       False    False    False       False   

   Deck_STON  
0      False  
1      False  
2       True  
3      False  
4      False  


#Train and Test Data Splitting

In [8]:
#Data Splitting
from sklearn.model_selection import train_test_split
X=df.drop("Survived",axis=1)
y=df["Survived"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)
#stratify=y maintains the same class distribution in train and test sets.
#random_state=42 The number 42 is just a popular arbitrary choice among programmers

#Feature Selection




## Filter Methods for Feature Selection

### Basic Filter Methods
1. Constant Features Removal  
2. Quasi-Constant Features Removal  
3. Duplicate Features Removal  

### Statistical Filter Methods
1. Pearson Correlation  
2. Spearman Rank Correlation  
3. ANOVA F-test  
4. Mutual Information (regression)  
5. Chi-square Test  
6. Mutual Information (classification)  
7. Cramér’s V  


In [9]:
#Basic Filter methods
cat_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
X_train=pd.get_dummies(X_train,columns=cat_columns,drop_first=True)
X_train=X_train.astype(int)
X_test=pd.get_dummies(X_test,columns=cat_columns,drop_first=True)
X_test=X_test.astype(int)
print(df.dtypes)  # Verify all are numeric
#1.Removing Constant features
const = []
for features in X_train:
  if(X_train[features].std()==0):
    const.append(features)
print("Number of constant features:",len(const))
X_train.drop(labels=const,axis=1,inplace=True)
X_test.drop(labels=const,axis=1,inplace=True)

#2.Removing quasi constant features
quasi_constant = []
for feature in X_train.columns:
  predominant = (X_train[feature].value_counts()/float(len(X_train))).sort_values(ascending=False).values[0]
  if(predominant>0.999):
    quasi_constant.append(feature)
print("Number of quasi constant features:",len(quasi_constant))
X_train.drop(labels=quasi_constant,axis=1,inplace=True)
X_test.drop(labels=quasi_constant,axis=1,inplace=True)#Apply same removal to X_test

#3.Duplicated features
duplicates = []
for i in range(len(X_train.columns)):
  col1 = X_train.columns[i]
  for col2 in X_train.columns[i+1:]:
    if(X_train[col1].equals(X_train[col2])): #Not ==, as it won't return a single True of False
      duplicates.append(col2)
print("Number of duplicate features:",len(duplicates))
X_train.drop(labels=duplicates,axis=1,inplace=True)
X_test.drop(labels=duplicates,axis=1,inplace=True)#Apply same removal to X_test


Survived           int64
Pclass             int64
Sex                int64
Age              float64
SibSp              int64
Parch              int64
Fare             float64
Cabin             object
Embarked          object
Title             object
Deck_C.A.           bool
Deck_NoPrefix       bool
Deck_Other          bool
Deck_PC             bool
Deck_SC             bool
Deck_SOTON          bool
Deck_STON           bool
dtype: object
Number of constant features: 0
Number of quasi constant features: 0
Number of duplicate features: 0


In [10]:
#Statistical Filter Methods
#Even though we’ve already one-hot encoded everything, statistical filter methods still differ based on original feature type
#Target is Survived-Categorical, and this is a classification problem
#Numerical features
num_features=[col for col in X_train.columns if not set(X_train[col].unique()).issubset({0,1})]
cat_features = [col for col in X_train.columns if set(X_train[col].unique()).issubset({0,1})]
#Input numerical and output categorical --> Anova or Kendalls
from sklearn.feature_selection import f_classif,SelectKBest
#1. Anova
f_values,p_values = f_classif(X_train[num_features],y_train)
anova_df=pd.DataFrame({
    "Numerical Features":num_features,
    "F_values":f_values,
    "P_values":p_values
    })
anova_df.sort_values(by="P_values",inplace=True)
significant_numeric_features = anova_df[anova_df["P_values"]<0.05]["Numerical Features"].tolist()
print("Selected Numerical Features:", significant_numeric_features)
#Input categorical and output categroical --> Chi2 or Mutual Info
#2.chi2
from sklearn.feature_selection import chi2
chi2_values,p_values=chi2(X_train[cat_features],y_train)
chi2_df=pd.DataFrame({
    "Categorical Features":cat_features,
    "Chi2_values":chi2_values,
    "p_values":p_values
     })
chi2_df.sort_values(by="p_values",inplace=True)
significant_chi2_features=chi2_df[chi2_df["p_values"]<0.05]["Categorical Features"].tolist()
print("Selected Features from Chi2 test:", significant_chi2_features)
#3. Mutual info
from sklearn.feature_selection import mutual_info_classif,SelectKBest
#We explored manual filtering(MI>0) so far, another way is by selecting K best features
selector = SelectKBest(score_func=mutual_info_classif,k=10)
#Fit on our dataset
selector.fit(X_train[cat_features],y_train)
#We can use transform on selector to directly transform the dataset or can extract the features like we did before
significant_mi_features=X_train[cat_features].columns[selector.get_support()].tolist()
print("Selected Features from MI test:",significant_mi_features)

#Numerical features, Categorical Features(from both chi2 and mi) --> union all to filter from dataset
final_selected_features=list(set(significant_numeric_features+significant_chi2_features+significant_mi_features)) #Set conversion is for removing duplicates, converting to list is to use it as indexer
#Filter
X_train=X_train[final_selected_features]
# Ensure that X_test has the same columns as X_train after feature selection and encoding.
# Some dummy variables or features may be missing in X_test if certain categories
# are not present in the test set. To avoid errors during model prediction,
# we add these missing columns filled with zeros to X_test,
# then reorder columns to exactly match X_train.
for col in X_train.columns:
    if col not in X_test.columns:
        X_test[col] = 0
X_test = X_test[X_train.columns]

Selected Numerical Features: ['Pclass', 'Fare', 'Parch', 'Age']
Selected Features from Chi2 test: ['Sex', 'Title_Mr', 'Title_Mrs', 'Title_Miss', 'Deck_PC', 'Embarked_S', 'Cabin_A34', 'Cabin_D33', 'Cabin_C110', 'Title_Master']
Selected Features from MI test: ['Sex', 'Deck_SOTON', 'Cabin_B78', 'Cabin_C106', 'Cabin_C30', 'Cabin_C7', 'Cabin_E40', 'Title_Miss', 'Title_Mr', 'Title_Mrs']


In [11]:
#Dataset after data preprocessing and feature selection
print(X_train.columns)

Index(['Cabin_D33', 'Cabin_C106', 'Title_Miss', 'Cabin_A34', 'Parch', 'Sex',
       'Cabin_C110', 'Deck_PC', 'Cabin_C30', 'Embarked_S', 'Fare', 'Title_Mrs',
       'Title_Mr', 'Pclass', 'Cabin_E40', 'Age', 'Deck_SOTON', 'Cabin_B78',
       'Title_Master', 'Cabin_C7'],
      dtype='object')
