In [1]:
# Cell 1 — imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Optional (for outlier detection)
from scipy import stats

# show plots inside the notebook
%matplotlib inline

print("pandas", pd.__version__, "numpy", np.__version__)


pandas 2.2.2 numpy 2.0.2


In [5]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
import pandas as pd

# adjust the path to where your file is in Drive
df = pd.read_csv("/content/drive/MyDrive/ColabProjects/Titanic-Dataset.csv")

df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
df.shape


(891, 12)

In [8]:
df.info()           # column types and non-null counts
df.shape            # rows, columns
df.isnull().sum()   # count missing values per column


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [9]:
df.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [10]:
print("Sex counts:\n", df['Sex'].value_counts(dropna=False))
print("\nEmbarked counts:\n", df['Embarked'].value_counts(dropna=False))


Sex counts:
 Sex
male      577
female    314
Name: count, dtype: int64

Embarked counts:
 Embarked
S      644
C      168
Q       77
NaN      2
Name: count, dtype: int64


In [12]:
# make a safe copy
df = df.copy()

# Normalize text fields to avoid capitalization/space issues
df['Sex'] = df['Sex'].astype(str).str.strip().str.lower()
df['Embarked'] = df['Embarked'].astype(str).str.strip().str.upper().replace({'nan': np.nan})


In [13]:
# 1) Fill missing Age with median
age_median = df['Age'].median()
df['Age_filled'] = df['Age'].fillna(age_median)

# 2) Fill missing Embarked with mode (most frequent)
emb_mode = df['Embarked'].mode(dropna=True)[0]
df['Embarked_filled'] = df['Embarked'].fillna(emb_mode)

# 3) Cabin — two options: extract deck letter or drop column
# Option A: extract deck letter (first char), else 'U' for unknown
df['Cabin_deck'] = df['Cabin'].dropna().astype(str).str[0]
df['Cabin_deck'] = df['Cabin_deck'].fillna('U')


In [14]:
# Family size & is_alone
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Extract Title from Name (Mr, Mrs, Miss, Master, etc.)
def extract_title(name):
    import re
    m = re.search(r',\s*([^\.]+)\.', str(name))
    return m.group(1).strip() if m else 'Unknown'

df['Title'] = df['Name'].apply(extract_title)
# group rare titles
rare_titles = ['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']
df['Title'] = df['Title'].replace(rare_titles, 'Rare')
df['Title'] = df['Title'].replace({'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs'})

# Age bins (optional)
df['AgeBin'] = pd.cut(df['Age_filled'], bins=[0,10,20,30,40,50,60,120],
                      labels=['0-10','11-20','21-30','31-40','41-50','51-60','60+'])


In [15]:
# Simple binary mapping for Sex
df['Sex_num'] = df['Sex'].map({'male':1, 'female':0})

# One-hot encoding for Embarked and Title and Cabin_deck
df = pd.get_dummies(df, columns=['Embarked_filled','Title','Cabin_deck','AgeBin'], drop_first=False)


In [16]:
# Example for Fare (you can do same for Age)
col = 'Fare'
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
print(f"{col} IQR bounds:", lower, upper)

# Flag outliers
df[f'{col}_is_outlier'] = ((df[col] < lower) | (df[col] > upper))

# Option: cap extreme values (winsorize)
df[f'{col}_capped'] = df[col].clip(lower=lower, upper=upper)


Fare IQR bounds: -26.724 65.6344


In [17]:
# Choose columns to scale
num_cols = ['Age_filled', 'Fare', 'FamilySize']  # add any numeric columns you want scaled

# Option A: StandardScaler (Z-score)
scaler = StandardScaler()
df[['Age_z','Fare_z','FamilySize_z']] = scaler.fit_transform(df[num_cols])

# Option B: MinMaxScaler (0-1)
mms = MinMaxScaler()
df[['Age_mm','Fare_mm','FamilySize_mm']] = mms.fit_transform(df[num_cols])


In [18]:
# Choose useful columns for modeling or analysis
keep_cols = [
    'PassengerId','Survived','Pclass','Sex_num','Age_filled','Fare',
    'FamilySize','IsAlone'
    # add any one-hot columns you created, e.g. 'Embarked_filled_C'...
]
# If you used get_dummies, you can show the columns containing Embarked:
[x for x in df.columns if 'Embarked_filled' in x][:10]  # view a few

# Create a final dataframe with needed columns
final_df = df.copy()  # or df[keep_cols + extra_dummy_cols]

# Save cleaned csv
final_df.to_csv("titanic_cleaned.csv", index=False)
print("Saved titanic_cleaned.csv, shape:", final_df.shape)


Saved titanic_cleaned.csv, shape: (891, 50)


In [20]:
# Survival rate by Sex
print("Survival by Sex_num:\n", df.groupby('Sex_num')['Survived'].mean())

# Survival rate by Pclass
print("\nSurvival by Pclass:\n", df.groupby('Pclass')['Survived'].mean())

# Survival by AgeBin (only if AgeBin exists)
if 'AgeBin' in df.columns:
    print("\nSurvival by Age group:\n")
    print(pd.crosstab(df['AgeBin'], df['Survived'], normalize='index'))
else:
    print("\nNo AgeBin column found. Run Cell 9 to create it.")


Survival by Sex_num:
 Sex_num
0    0.742038
1    0.188908
Name: Survived, dtype: float64

Survival by Pclass:
 Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

No AgeBin column found. Run Cell 9 to create it.


In [21]:
from google.colab import files
files.download('titanic_cleaned.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>