In [1]:
import pandas as pd

# Read the raw passenger records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='titanic.csv')


In [2]:
# Summarise the raw dataset: index range, column names, non-null counts, dtypes
print("Dataset information:")
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()


Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [3]:
# Report how many entries are missing in each column before any cleaning
print("\nHandling missing values:", "Number of missing values in each column:", sep="\n")
print(df.isna().sum())



Handling missing values:
Number of missing values in each column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
# Drop 'Cabin': it is missing for most passengers, so it is removed rather than imputed
df = df.drop(columns=['Cabin'])

# Impute missing 'Age' values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Age']: that chained in-place form operates on a
# temporary Series, triggers a FutureWarning in pandas 2.x, and leaves df
# unchanged once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Impute missing 'Embarked' values with the most common value.
# mode() returns a Series (there can be ties); [0] takes its first entry.
most_common_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_embarked)


In [5]:
# Remove every row that still has a missing value in any remaining column
df = df.dropna(axis=0, how='any')

In [6]:
# Show the dataset summary again to confirm the effect of the cleaning steps
print("\nUpdated dataset information:")
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()


Updated dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB
None


In [7]:
# Write the cleaned frame to a new CSV file; index=False leaves the row index out of the file
df.to_csv(path_or_buf='titanic_cleaned.csv', index=False)

# Confirm completion and name the output file
print("\nData cleaning completed. Cleaned dataset saved to 'titanic_cleaned.csv'.")



Data cleaning completed. Cleaned dataset saved to 'titanic_cleaned.csv'.
