In [1]:
pip install pandas numpy

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [2]:
import pandas as pd
import numpy as np

In [8]:
# Load the dataset
df = pd.read_csv("C:/Users/Admin/OneDrive/Documents/Titanic-Dataset.csv")

In [10]:
# Display first few rows
print("Initial Data Snapshot:")
print(df.head())

Initial Data Snapshot:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0

In [11]:
# 1. Check for missing values
# -----------------------------
print("\nMissing Values:")
print(df.isnull().sum())



Missing Values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [12]:
# 2. Fill missing values
# -----------------------------
# Fill 'Age' with mean
df['Age'].fillna(df['Age'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)


In [13]:
# Fill 'Embarked' with mode
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [14]:
# Drop 'Cabin' due to too many missing values
df.drop(columns=['Cabin'], inplace=True)


In [15]:
# 3. Remove duplicates
# -----------------------------
df.drop_duplicates(inplace=True)

In [16]:
# 4. Rename columns
# -----------------------------
df.rename(columns={'Fare': 'TicketFare'}, inplace=True)


In [17]:
# 5. Convert datatypes (if needed)
# -----------------------------
df['Pclass'] = df['Pclass'].astype(str)  # convert to string


In [18]:
# 6. Feature encoding
# -----------------------------
# Convert 'Sex' and 'Embarked' to numerical
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df = pd.get_dummies(df, columns=['Embarked'])

In [19]:
# 7. Final cleaned data
# -----------------------------
print("\nCleaned Data Snapshot:")
print(df.head())


Cleaned Data Snapshot:
   PassengerId  Survived Pclass  \
0            1         0      3   
1            2         1      1   
2            3         1      3   
3            4         1      1   
4            5         0      3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket  TicketFare  Embarked_C  Embarked_Q  Embarked_S  
0         A/5 21171      7.2500       False       False        True  
1          PC 17599     71.2833        True       False       False  
2  STON/O2. 3101282      7.9250       False       False        True 

In [20]:
# Save cleaned dataset
df.to_csv("Titanic-Dataset_cleaned.csv", index=False)


In [21]:
print("\nCleaned Data Preview:\n", df.head())
print("\nCleaned Data Summary:\n", df.info())


Cleaned Data Preview:
    PassengerId  Survived Pclass  \
0            1         0      3   
1            2         1      1   
2            3         1      3   
3            4         1      1   
4            5         0      3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket  TicketFare  Embarked_C  Embarked_Q  Embarked_S  
0         A/5 21171      7.2500       False       False        True  
1          PC 17599     71.2833        True       False       False  
2  STON/O2. 3101282      7.9250       False       False        True 