In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# 1. LOAD DATASET

path = "/home/sita/Downloads/titanic.csv"
df = pd.read_csv(path)

print("=== DATASET LOADED SUCCESSFULLY ===".center(80))
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
display(df.head())

# 2. DATA CLEANING

df_clean = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors='ignore')

print("\n=== MISSING VALUES BEFORE CLEANING ===".center(80))
print(df_clean.isnull().sum())

# Fill missing values
df_clean["Age"] = df_clean["Age"].fillna(df_clean["Age"].median())
df_clean["Embarked"] = df_clean["Embarked"].fillna(df_clean["Embarked"].mode()[0])

print("\n=== MISSING VALUES AFTER CLEANING ===".center(80))
print(df_clean.isnull().sum())


# 3. HANDLE OUTLIERS (IQR)

def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower, upper)
    return df

numeric_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    df_clean = handle_outliers(df_clean, col)

print("\n=== OUTLIERS HANDLED (IQR METHOD) ===".center(80))
display(df_clean[numeric_cols].describe())


# 4. NORMALIZATION (0–1)

scaler = MinMaxScaler()
df_normalized = df_clean.copy()
df_normalized[numeric_cols] = scaler.fit_transform(df_clean[numeric_cols])

print("\n=== NORMALIZED NUMERICAL DATA (0–1 RANGE) ===".center(80))
display(df_normalized[numeric_cols].head())


# 5. ENCODING CATEGORICAL DATA

df_prepared = pd.get_dummies(df_normalized, columns=['Sex', 'Embarked'], drop_first=True)

print("\n=== DATA AFTER ONE-HOT ENCODING ===".center(80))
display(df_prepared.head())


# 6. FINAL CHECK

print("\n=== FINAL DATASET INFO ===".center(80))
print(df_prepared.info())

print("\n=== FINAL DATASET SUMMARY ===".center(80))
display(df_prepared.describe())


# 7. SAVE OUTPUT (OPTIONAL)

output_path = "/home/sita/Downloads/titanic_prepared.csv"
df_prepared.to_csv(output_path, index=False)

print(f"\nDataset cleaned & prepared saved to: {output_path}")


                      === DATASET LOADED SUCCESSFULLY ===                       
Rows: 418, Columns: 12


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


                    
=== MISSING VALUES BEFORE CLEANING ===                     
Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64
                     
=== MISSING VALUES AFTER CLEANING ===                     
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        1
Embarked    0
dtype: int64
                     
=== OUTLIERS HANDLED (IQR METHOD) ===                     


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,417.0
mean,0.363636,2.26555,29.355263,0.395933,0.0,24.677709
std,0.481622,0.841838,11.768812,0.637033,0.0,21.230978
min,0.0,1.0,3.875,0.0,0.0,0.0
25%,0.0,1.0,23.0,0.0,0.0,7.8958
50%,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1.0,3.0,35.75,1.0,0.0,31.5
max,1.0,3.0,54.875,2.5,0.0,66.9063


                 
=== NORMALIZED NUMERICAL DATA (0–1 RANGE) ===                 


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0.0,1.0,0.60049,0.0,0.0,0.117017
1,1.0,1.0,0.845588,0.4,0.0,0.104624
2,0.0,0.5,1.0,0.0,0.0,0.144792
3,0.0,1.0,0.453431,0.0,0.0,0.129472
4,1.0,1.0,0.355392,0.4,0.0,0.183652


                      
=== DATA AFTER ONE-HOT ENCODING ===                      


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,1.0,0.60049,0.0,0.0,0.117017,True,True,False
1,1.0,1.0,0.845588,0.4,0.0,0.104624,False,False,True
2,0.0,0.5,1.0,0.0,0.0,0.144792,True,True,False
3,0.0,1.0,0.453431,0.0,0.0,0.129472,True,False,True
4,1.0,1.0,0.355392,0.4,0.0,0.183652,False,False,True


                          
=== FINAL DATASET INFO ===                           
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    418 non-null    float64
 1   Pclass      418 non-null    float64
 2   Age         418 non-null    float64
 3   SibSp       418 non-null    float64
 4   Parch       418 non-null    float64
 5   Fare        417 non-null    float64
 6   Sex_male    418 non-null    bool   
 7   Embarked_Q  418 non-null    bool   
 8   Embarked_S  418 non-null    bool   
dtypes: bool(3), float64(6)
memory usage: 20.9 KB
None
                         
=== FINAL DATASET SUMMARY ===                         


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,417.0
mean,0.363636,0.632775,0.499613,0.158373,0.0,0.36884
std,0.481622,0.420919,0.230761,0.254813,0.0,0.317324
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.375,0.0,0.0,0.118013
50%,0.0,1.0,0.453431,0.0,0.0,0.216036
75%,1.0,1.0,0.625,0.4,0.0,0.470808
max,1.0,1.0,1.0,1.0,0.0,1.0



Dataset cleaned & prepared saved to: /home/sita/Downloads/titanic_prepared.csv
