In [2]:
import pandas as pd
import numpy as np

url = "https://raw.githubusercontent.com/softwareWCU/Data-Preprocessing-for-ML-using-Titanic-Dataset/main/titanic2.csv"
df = pd.read_csv(url)
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",Female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
print("shape:", df.shape)
display(df.head(8))
display(df.info())

shape: (909, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",Female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",MALE,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909 entries, 0 to 908
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  909 non-null    int64  
 1   Survived     909 non-null    int64  
 2   Pclass       909 non-null    object 
 3   Name         909 non-null    object 
 4   Sex          909 non-null    object 
 5   Age          710 non-null    float64
 6   SibSp        909 non-null    int64  
 7   Parch        909 non-null    int64  
 8   Ticket       909 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        208 non-null    object 
 11  Embarked     907 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 85.3+ KB


None

Explore missing values & basic stats


In [4]:
missing_count = df.isnull().sum()
missing_pct   = (df.isnull().mean() * 100).round(2)
pd.DataFrame({"missing_count": missing_count, "missing_pct": missing_pct}).sort_values("missing_pct", ascending=False)

Unnamed: 0,missing_count,missing_pct
Cabin,701,77.12
Age,199,21.89
Fare,18,1.98
Embarked,2,0.22
Name,0,0.0
Pclass,0,0.0
Survived,0,0.0
PassengerId,0,0.0
Parch,0,0.0
SibSp,0,0.0


Convert columns to the proper types (clean parsing)


In [5]:
# Convert 'Age' and 'Fare' columns to numbers (if not already numbers)
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')

# Convert 'Pclass' to numbers too
df['Pclass'] = pd.to_numeric(df['Pclass'], errors='coerce')

# Check the data types after conversion
print(df.dtypes)


PassengerId      int64
Survived         int64
Pclass         float64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


Handle missing values

In [6]:
# Check how many missing values in each column
print(df.isnull().sum())

# Example: Drop rows where 'Pclass' is missing (important column)
df = df.dropna(subset=['Pclass'])

# Fill missing Age with median
median_age = df['Age'].median()
df['Age'].fillna(median_age, inplace=True)

# Fill missing Fare with median
median_fare = df['Fare'].median()
df['Fare'].fillna(median_fare, inplace=True)

# Fill missing Embarked with the most common value (mode)
most_common_embarked = df['Embarked'].mode()[0]
df['Embarked'].fillna(most_common_embarked, inplace=True)

# Check again to make sure missing values are handled
print(df.isnull().sum())


PassengerId      0
Survived         0
Pclass          18
Name             0
Sex              0
Age            199
SibSp            0
Parch            0
Ticket           0
Fare            18
Cabin          701
Embarked         2
dtype: int64
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          685
Embarked         0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(median_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(median_fare, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

Handling Outliers


In [7]:

# A simple way to handle outliers is using the Interquartile Range (IQR) method.

# For Age column
Q1_age = df['Age'].quantile(0.25)
Q3_age = df['Age'].quantile(0.75)
IQR_age = Q3_age - Q1_age
lower_bound_age = Q1_age - 1.5 * IQR_age
upper_bound_age = Q3_age + 1.5 * IQR_age

# Keep only rows inside the range
df = df[(df['Age'] >= lower_bound_age) & (df['Age'] <= upper_bound_age)]

# For Fare column
Q1_fare = df['Fare'].quantile(0.25)
Q3_fare = df['Fare'].quantile(0.75)
IQR_fare = Q3_fare - Q1_fare
lower_bound_fare = Q1_fare - 1.5 * IQR_fare
upper_bound_fare = Q3_fare + 1.5 * IQR_fare

df = df[(df['Fare'] >= lower_bound_fare) & (df['Fare'] <= upper_bound_fare)]

print("✅ Outliers handled")


✅ Outliers handled


Handling Duplicate Values


In [8]:

print("Number of duplicate rows before:", df.duplicated().sum())

# Remove duplicate rows
df = df.drop_duplicates()

print("Number of duplicate rows after:", df.duplicated().sum())


Number of duplicate rows before: 9
Number of duplicate rows after: 0


Correcting Inconsistencies (Formatting)


In [9]:
# Convert Sex column to lowercase
df['Sex'] = df['Sex'].str.lower()

# Strip spaces and uppercase the first letter in Embarked column
df['Embarked'] = df['Embarked'].str.strip().str.upper()

print("✅ Inconsistencies corrected")

✅ Inconsistencies corrected


Correcting Inconsistencies (Data Value)


In [10]:


#  Example 1: Sex column might have inconsistent text like:
# 'male', 'Male ', ' FEMALE', 'FEMALE ', etc.
# Let's standardize them to only 'male' or 'female'.

df['Sex'] = df['Sex'].str.strip().str.lower()  # remove spaces and lowercase

# Replace any wrong values manually if found
df['Sex'] = df['Sex'].replace({
    'm': 'male',
    'f': 'female',
    'man': 'male',
    'woman': 'female'
})

#  Example 2: Embarked column — should only have 'C', 'Q', or 'S'
# If there are invalid entries, fix or set them to most common value
valid_embarked = ['C', 'Q', 'S']
most_common_embarked = df['Embarked'].mode()[0]

df['Embarked'] = df['Embarked'].apply(
    lambda x: x if x in valid_embarked else most_common_embarked
)

#  Example 3: Pclass should only have 1, 2, or 3
valid_pclass = [1, 2, 3]
df['Pclass'] = df['Pclass'].apply(
    lambda x: x if x in valid_pclass else 3  # if something else, set it to 3 (or drop)
)

print("✅ Data value inconsistencies corrected")


✅ Data value inconsistencies corrected


Final Check


In [11]:

print("Data types:\n", df.dtypes)
print("\nMissing values:\n", df.isnull().sum())
print("\nData shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())


Data types:
 PassengerId      int64
Survived         int64
Pclass         float64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Missing values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          614
Embarked         0
dtype: int64

Data shape: (713, 12)

First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0     3.0   
2            3         1     3.0   
3            4         1     1.0   
4            5         0     3.0   
5            6         0     3.0   

                                           Name     Sex   Age  SibSp  Parch  \
0                       Braund, Mr. Owen Harris    male  22.0      1      0   
2                        

**Data Spliting**

Select the columns we want to use for prediction (features & target)

In machine learning:

X → Features (independent variables)

y → Target (what we want to predict)

For Titanic, we often use:

Pclass, Sex, Age, Fare, SibSp, Parch as features

Survived as target

In [12]:
# Convert Sex to numeric because ML algorithms work with numbers
df['Sex'] = df['Sex'].str.strip().str.lower().map({'male': 0, 'female': 1})

# Select features and target
X = df[['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch']]   # features
y = df['Survived']                                          # target

print(X.head())
print(y.head())

   Pclass  Sex   Age     Fare  SibSp  Parch
0     3.0    0  22.0   7.2500      1      0
2     3.0    1  26.0   7.9250      0      0
3     1.0    1  35.0  53.1000      1      0
4     3.0    0  35.0   8.0500      0      0
5     3.0    0  28.0   8.4583      0      0
0    0
2    1
3    1
4    0
5    0
Name: Survived, dtype: int64


**Now X is our input data, and y is the output label.**

Split into Training and Testing sets

We use scikit-learn for splitting.

In [13]:
from sklearn.model_selection import train_test_split

# Split data into 80% train and 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

Training data shape: (570, 6)
Testing data shape: (143, 6)


train_test_split randomly splits the data.

test_size=0.2 means 20% of the data goes to testing.

random_state=42 is just to make the split the same each time (optional)

**Why this is important**

Training set → used to train our machine learning model.

Testing set → used to check how well our model works on new data.

This avoids overfitting and gives a fair evaluation of our model

In [14]:
print("X_train sample:")
print(X_train.head())

print("y_train sample:")
print(y_train.head())

print("X_test sample:")
print(X_test.head())

print("y_test sample:")
print(y_test.head())

X_train sample:
     Pclass  Sex   Age     Fare  SibSp  Parch
339     1.0    0  45.0  35.5000      0      0
70      2.0    0  32.0  10.5000      0      0
251     3.0    1  29.0  10.4625      1      1
723     2.0    0  50.0  13.0000      0      0
668     3.0    0  43.0   8.0500      0      0
y_train sample:
339    0
70     0
251    0
723    0
668    0
Name: Survived, dtype: int64
X_test sample:
     Pclass  Sex   Age    Fare  SibSp  Parch
149     2.0    0  42.0  13.000      0      0
418     2.0    0  30.0  13.000      0      0
49      3.0    1  18.0  17.800      1      0
374     3.0    1   3.0  21.075      3      1
828     3.0    0  28.0   7.750      0      0
y_test sample:
149    0
418    0
49     0
374    0
828    1
Name: Survived, dtype: int64


**Building a Simple Model (Beginner Friendly)**

**Step 1: Import the libraries**

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

**Step 2: Load the Titanic data again**

In [16]:
url = "https://raw.githubusercontent.com/softwareWCU/Data-Preprocessing-for-ML-using-Titanic-Dataset/main/titanic2.csv"
df = pd.read_csv(url)

# Clean the 'Sex' column and convert to numbers (0 = male, 1 = female)
df['Sex'] = df['Sex'].str.strip().str.lower().map({'male': 0, 'female': 1})

# Fill missing Age and Fare values with median (basic handling)
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

**Step 3: Select features (X) and target (y)**

In [17]:
X = df[['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch']]
y = df['Survived']

**Step 4: Split the data (Train & Test)**

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

Training data shape: (727, 6)
Testing data shape: (182, 6)


**Step 5: Build and train the model**

Clean and convert Pclass column

In [19]:
# Clean Pclass column
df['Pclass'] = df['Pclass'].astype(str).str.strip().str.replace('st', '', regex=False)
df['Pclass'] = df['Pclass'].str.replace('nd', '', regex=False)
df['Pclass'] = df['Pclass'].str.replace('rd', '', regex=False)
df['Pclass'] = df['Pclass'].astype(int)

**What this does:**

Converts values like '3rd ' or '1st ' to '3' or '1'

Removes the letters “st”, “nd”, “rd”

Turns the column into an integer type for ML.

**Feature Engineering**

Feature engineering means creating or transforming columns to help the model understand patterns better.

 Example features:


FamilySize = number of siblings/spouses + parents/children + 1 (the passenger themself)

IsAlone = whether the passenger is alone or not

Title = extracted from the passenger’s name (e.g., Mr, Mrs, Miss)

Step 1: Load Required  Libraries

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


**Step 2: Load and Clean the Titanic Data**

In [23]:
# Load dataset
url = "https://raw.githubusercontent.com/softwareWCU/Data-Preprocessing-for-ML-using-Titanic-Dataset/main/titanic2.csv"
df = pd.read_csv(url)

# Clean Pclass column
df['Pclass'] = df['Pclass'].astype(str).str.strip()
df['Pclass'] = df['Pclass'].str.replace('st', '', regex=False)
df['Pclass'] = df['Pclass'].str.replace('nd', '', regex=False)
df['Pclass'] = df['Pclass'].str.replace('rd', '', regex=False)
df['Pclass'] = df['Pclass'].astype(int)

# Clean Sex column (male=0, female=1)
df['Sex'] = df['Sex'].str.strip().str.lower().map({'male': 0, 'female': 1})

# Handle missing values
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])


In [24]:
# Create FamilySize
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Create IsAlone feature
df['IsAlone'] = 0
df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

# Extract Title from Name
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Simplify rare titles
df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',
                                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
df['Title'] = df['Title'].replace('Mme', 'Mrs')


  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


**Let’s check what titles we have:**

In [25]:
print(df['Title'].value_counts())


Title
Mr        525
Miss      194
Mrs       126
Master     41
Rare       23
Name: count, dtype: int64


**Step 4: Encode Categorical Features**

We must convert Title and Embarked into numbers so KNN can understand them.

In [26]:
label_encoder = LabelEncoder()
df['Title'] = label_encoder.fit_transform(df['Title'])
df['Embarked'] = label_encoder.fit_transform(df['Embarked'])


**Step 5: Select Features and Target**

We’ll include our new features (FamilySize, IsAlone, Title).

In [27]:
X = df[['Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'IsAlone', 'Title', 'Embarked']]
y = df['Survived']


**Step 6: Scale Features**

Scaling is essential for KNN because it uses distance between points.

In [28]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


**Step 7: Split the Data**

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


**Step 8: Train the KNN Model**

In [30]:
# Create KNN model
knn = KNeighborsClassifier(n_neighbors=5)

# Train (fit) the model
knn.fit(X_train, y_train)

# Predict on test data
y_pred = knn.predict(X_test)


**Step 9: Evaluate the Model**

In [31]:
print(" Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


 Model Accuracy: 0.7527472527472527

Confusion Matrix:
[[81 18]
 [27 56]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.82      0.78        99
           1       0.76      0.67      0.71        83

    accuracy                           0.75       182
   macro avg       0.75      0.75      0.75       182
weighted avg       0.75      0.75      0.75       182

