In [14]:
!pip install pandas numpy seaborn scikit-learn



In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer

In [16]:
df = sns.load_dataset('titanic')

In [17]:
print("Basic Dataset Information:\n")
print(df.info())
print("\n First 5 Rows:\n", df.head())
print("\n Summary Statistics:\n", df.describe(include='all'))

Basic Dataset Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None

 First 5 Rows:
    survived  pclass     sex  

### 3. Handling Null/Missing Values

In [18]:
print("\n🔍 Columns with Missing Values:\n", df.isnull().sum())
df_cleaned = df.drop(columns=['deck'])
df_cleaned['age'].fillna(df['age'].median(), inplace=True)
df_cleaned['embarked'].fillna(df['embarked'].mode()[0], inplace=True)
df_cleaned.dropna(inplace=True)
print("\n✅ After Handling Missing Values:\n", df_cleaned.isnull().sum())


🔍 Columns with Missing Values:
 survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

✅ After Handling Missing Values:
 survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['embarked'].fillna(df['embarked'].mode()[0], inplace=True)


###  4. Handling Continuous Values 

In [19]:
print("\n📊 Handling Continuous Variables:")

scaler = MinMaxScaler()
df_cleaned[['age_norm', 'fare_norm']] = scaler.fit_transform(df_cleaned[['age', 'fare']])
print("\nNormalized Age and Fare (0 to 1):\n", df_cleaned[['age_norm', 'fare_norm']].head())

kbin = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
df_cleaned['age_binned'] = kbin.fit_transform(df_cleaned[['age']])
print("\nBinned Age Categories:\n", df_cleaned[['age', 'age_binned']].head())


📊 Handling Continuous Variables:

Normalized Age and Fare (0 to 1):
    age_norm  fare_norm
0  0.271174   0.014151
1  0.472229   0.139136
2  0.321438   0.015469
3  0.434531   0.103644
4  0.434531   0.015713

Binned Age Categories:
     age  age_binned
0  22.0         1.0
1  38.0         1.0
2  26.0         1.0
3  35.0         1.0
4  35.0         1.0


### 5. Final Processed Dataset

In [20]:
print("\n✅ Final Cleaned Data Sample:\n", df_cleaned.head())


✅ Final Cleaned Data Sample:
    survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male  embark_town alive  alone  age_norm  fare_norm  \
0    man        True  Southampton    no  False  0.271174   0.014151   
1  woman       False    Cherbourg   yes  False  0.472229   0.139136   
2  woman       False  Southampton   yes   True  0.321438   0.015469   
3  woman       False  Southampton   yes  False  0.434531   0.103644   
4    man        True  Southampton    no   True  0.434531   0.015713   

   age_binned  
0         1.0  
1         1.0  
2         1.0  
3         1.0  
4    