In [8]:
import pandas as pd
import numpy as np

In [10]:
# Read the raw passenger CSV into `df` and preview its first rows.
# NOTE(review): the location is an absolute, machine-specific Windows path.
RAW_CSV_PATH = r"D:\Downloads\tested.csv"
df = pd.read_csv(RAW_CSV_PATH)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [12]:
# (rows, columns) of the loaded frame.
df.shape

(418, 12)

In [14]:
# Column labels of the loaded frame.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [16]:
# Per-column count of missing entries before any imputation.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [18]:
# Remove exact duplicate rows and report how many rows disappeared.
before = len(df)
df = df.drop_duplicates()
after = len(df)

print("Duplicates removed:", before - after)

Duplicates removed: 0


In [20]:
# Imputation rules, one per column: each callable receives the column and
# returns the value used to replace its missing entries.
#   Age, Fare -> column median
#   Embarked  -> most frequent value (first entry of .mode())
#   Cabin     -> the placeholder string "Unknown"
fill_value_for = {
    "Age": lambda column: column.median(),
    "Fare": lambda column: column.median(),
    "Embarked": lambda column: column.mode()[0],
    "Cabin": lambda column: "Unknown",
}

# Columns absent from the frame are skipped, so the cell tolerates a reduced schema.
for col_name, pick_fill in fill_value_for.items():
    if col_name in df.columns:
        df[col_name] = df[col_name].fillna(pick_fill(df[col_name]))

In [22]:
# Encode Sex as an integer flag: male -> 0, female -> 1.
# A plain dict-based .map() converts every value that is not a dict key to NaN,
# so running this cell a second time (when the column already holds 0/1) would
# wipe the whole column. Falling back to the current value keeps the cell safe
# to re-run and leaves unexpected labels visible instead of silently nulling them.
SEX_CODES = {"male": 0, "female": 1}
if "Sex" in df.columns:
    df["Sex"] = df["Sex"].map(lambda value: SEX_CODES.get(value, value))

In [24]:
# Store the Embarked column with the pandas categorical dtype (skipped when absent).
if "Embarked" in df.columns:
    df = df.astype({"Embarked": "category"})

In [26]:
# FamilySize = SibSp + Parch + 1, computed only when both source columns exist.
# (the +1 presumably counts the passenger themselves — confirm)
required_cols = {"SibSp", "Parch"}
if required_cols.issubset(df.columns):
    df["FamilySize"] = df["SibSp"].add(df["Parch"]).add(1)

In [28]:
# Bucket Age into labelled bands. Bin edges are right-inclusive:
#   (0, 12] Child, (12, 19] Teen, (19, 35] YoungAdult, (35, 60] Adult, (60, 100] Senior
# include_lowest=True closes the first band on the left, so an age of exactly 0
# is labelled "Child" instead of becoming NaN. Ages above 100 still fall outside
# every band and yield NaN.
AGE_BIN_EDGES = [0, 12, 19, 35, 60, 100]
AGE_BIN_LABELS = ["Child", "Teen", "YoungAdult", "Adult", "Senior"]
if "Age" in df.columns:
    df["AgeGroup"] = pd.cut(
        df["Age"],
        bins=AGE_BIN_EDGES,
        labels=AGE_BIN_LABELS,
        include_lowest=True,
    )

In [30]:
# Per-column count of missing entries after imputation and feature creation.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
FamilySize     0
AgeGroup       0
dtype: int64

In [32]:
# Preview the first rows of the frame in its current (cleaned) state.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,AgeGroup
0,892,0,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,Unknown,Q,1,YoungAdult
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,Unknown,S,2,Adult
2,894,0,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,Unknown,Q,1,Senior
3,895,0,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,Unknown,S,1,YoungAdult
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,Unknown,S,3,YoungAdult


In [34]:
# Write the frame to a CSV in the working directory (index column omitted),
# then print a confirmation line.
CLEANED_CSV_NAME = "cleaned_data.csv"
df.to_csv(CLEANED_CSV_NAME, index=False)
print("✅ cleaned_data.csv saved successfully!")

✅ cleaned_data.csv saved successfully!


# Cleaning Notes

1. Loaded Titanic dataset using pd.read_csv() from Downloads folder path.
2. Checked dataset structure using .head(), .shape, and .columns.
3. Identified missing values using df.isnull().sum().
4. Removed duplicate rows using .drop_duplicates().
5. Filled missing Age values with the median, which is less sensitive to outliers than the mean.
6. Filled missing Fare values using median.
7. Filled missing Embarked values using the mode, and missing Cabin values with the placeholder "Unknown".
8. Converted the categorical column Sex into numeric values (male = 0, female = 1).
9. Created a new feature FamilySize.
10. Created AgeGroup column using binning for better analysis.