<a href="https://colab.research.google.com/github/sharna33/CSE4120_Data_Mining_Sessional/blob/main/CSE_4120_2003009_Lab3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform, jaccard
from scipy.stats import pearsonr

## **Load Dataset**

In [None]:
# Source CSV: the Titanic passenger data hosted in the course repository.
url = 'https://raw.githubusercontent.com/sharna33/CSE4120_Data_Mining_Sessional/refs/heads/main/Lab2_titanic.csv'
titanic = pd.read_csv(filepath_or_buffer=url)

# Quick sanity check: overall dimensions followed by a preview of the top rows.
print("Dataset shape:", titanic.shape)
print("\nFirst 5 rows:")
preview = titanic.head(n=5)
display(preview)

Dataset shape: (891, 12)

First 5 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## **Handling Missing Values**

In [None]:
# Report how many values are missing in each column before imputation.
print("\nMissing values before handling:")
print(titanic.isnull().sum())

# Fill missing values.
# Each filled Series is assigned back to its column instead of calling
# `titanic[col].fillna(..., inplace=True)`: that chained in-place form operates
# on an intermediate object, triggers a pandas FutureWarning, and will no
# longer modify `titanic` at all from pandas 3.0 onwards.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())                  # numeric -> median
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])  # categorical -> most frequent value
titanic['Cabin'] = titanic['Cabin'].fillna('Unknown')                            # placeholder label for missing cabins

# Verify missing values handled
print("\nMissing values after handling:")
print(titanic.isnull().sum())


Missing values before handling:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing values after handling:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['Age'].fillna(titanic['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediat

## **Categorical Data Encoding**

In [None]:
# Label Encoding: map each distinct value of 'Sex' to an integer code.
label_encoder = LabelEncoder()
titanic['Sex_encoded'] = label_encoder.fit_transform(titanic['Sex'])

# One-Hot Encoding: one indicator column per distinct 'Embarked' value.
onehot_encoder = OneHotEncoder()
embarked_encoded = onehot_encoder.fit_transform(titanic[['Embarked']])
embarked_df = pd.DataFrame(
    embarked_encoded.toarray(),
    columns=onehot_encoder.get_feature_names_out(['Embarked']),
    # Reuse titanic's index so the column-wise concat below lines rows up
    # correctly even if titanic no longer has a default 0..n-1 index.
    index=titanic.index,
)

# Drop any one-hot columns left over from a previous run of this cell before
# appending the fresh ones; otherwise re-running the cell would add duplicate
# Embarked_* columns.
titanic = pd.concat(
    [titanic.drop(columns=embarked_df.columns, errors='ignore'), embarked_df],
    axis=1,
)

print("\nEncoded columns:")
print(titanic[['Sex', 'Sex_encoded', 'Embarked', 'Embarked_C', 'Embarked_Q', 'Embarked_S']].head())


Encoded columns:
      Sex  Sex_encoded Embarked  Embarked_C  Embarked_Q  Embarked_S
0    male            1        S         0.0         0.0         1.0
1  female            0        C         1.0         0.0         0.0
2  female            0        S         0.0         0.0         1.0
3  female            0        S         0.0         0.0         1.0
4    male            1        S         0.0         0.0         1.0


## **Feature Scaling**

In [None]:
# Columns to rescale; every derived column name below is built from this list.
numerical_features = ['Age', 'Fare']
scaler_minmax = MinMaxScaler()
scaler_standard = StandardScaler()

# Apply scaling: each scaler is fitted on the raw columns and its output is
# stored next to the originals as <column>_minmax / <column>_zscore.
titanic[[f'{col}_minmax' for col in numerical_features]] = scaler_minmax.fit_transform(
    titanic[numerical_features]
)
titanic[[f'{col}_zscore' for col in numerical_features]] = scaler_standard.fit_transform(
    titanic[numerical_features]
)

# Show each raw column side by side with its two scaled versions.
print("\nScaling results:")
display(
    titanic[
        [f'{col}{suffix}' for col in numerical_features for suffix in ('', '_minmax', '_zscore')]
    ].head()
)


Scaling results:


Unnamed: 0,Age,Age_minmax,Age_zscore,Fare,Fare_minmax,Fare_zscore
0,22.0,0.271174,-0.565736,7.25,0.014151,-0.502445
1,38.0,0.472229,0.663861,71.2833,0.139136,0.786845
2,26.0,0.321438,-0.258337,7.925,0.015469,-0.488854
3,35.0,0.434531,0.433312,53.1,0.103644,0.42073
4,35.0,0.434531,0.433312,8.05,0.015713,-0.486337


## **Similarity and Dissimilarity Measures**

In [None]:
# Pick two passengers (fixed seed so the draw is repeatable), described by Age and Fare only.
sample = titanic[['Age', 'Fare']].sample(2, random_state=42)
first_row, second_row = sample.iloc[0], sample.iloc[1]

# Pearson's Correlation
# NOTE: each vector has just two components (Age, Fare), so the coefficient can
# only come out as +1 or -1 (or be undefined when a vector is constant).
pearson_corr, _ = pearsonr(first_row, second_row)
print(f"\nPearson's Correlation: {pearson_corr:.4f}")

# Cosine Similarity (sklearn expects 2-D inputs, hence the one-row frames)
cosine_sim = cosine_similarity(sample.iloc[[0]], sample.iloc[[1]])[0, 0]
print(f"Cosine Similarity: {cosine_sim:.4f}")

# Jaccard Similarity (Convert to binary data)
# A cell becomes True when it exceeds the median of its column *within this
# two-row sample*; scipy returns a dissimilarity, so it is subtracted from 1.
binary_sample = sample.gt(sample.median(), axis='columns')
jaccard_dissimilarity = jaccard(binary_sample.iloc[0], binary_sample.iloc[1])
jaccard_sim = 1 - jaccard_dissimilarity
print(f"Jaccard Similarity: {jaccard_sim:.4f}")

# Euclidean Distance
row_difference = first_row - second_row
euclidean_dist = np.linalg.norm(row_difference)
print(f"Euclidean Distance: {euclidean_dist:.4f}")

# Display sample used
print("\nSample data used for similarity calculations:")
display(sample)


Pearson's Correlation: 1.0000
Cosine Similarity: 0.9852
Jaccard Similarity: 0.0000
Euclidean Distance: 5.6145

Sample data used for similarity calculations:


Unnamed: 0,Age,Fare
709,28.0,15.2458
439,31.0,10.5


In [None]:
# Same two passengers as the library-based cell (identical seed), so the
# hand-computed values below can be compared against it.
sample = titanic[['Age', 'Fare']].sample(2, random_state=42)

# Extract rows as plain Python lists
x, y = (sample.iloc[pos].tolist() for pos in (0, 1))

# 1. Pearson's Correlation manually
mean_x = sum(x) / len(x)
mean_y = sum(y) / len(y)

# Deviations from the mean feed both the covariance numerator and the variances.
dev_x = [xi - mean_x for xi in x]
dev_y = [yi - mean_y for yi in y]
cov_num = sum(dx * dy for dx, dy in zip(dev_x, dev_y))
var_x = sum(dx**2 for dx in dev_x)
var_y = sum(dy**2 for dy in dev_y)

pearson_corr_manual = cov_num / np.sqrt(var_x * var_y)
print(f"Pearson's Correlation (manual): {pearson_corr_manual:.4f}")

# 2. Cosine Similarity manually: dot product divided by the product of the norms
dot_prod = sum(xi * yi for xi, yi in zip(x, y))
norm_x = np.sqrt(sum(xi**2 for xi in x))
norm_y = np.sqrt(sum(yi**2 for yi in y))

cosine_sim_manual = dot_prod / (norm_x * norm_y)
print(f"Cosine Similarity (manual): {cosine_sim_manual:.4f}")

# 3. Jaccard Similarity manually (binary threshold at median)
# The medians are per column of the sample and are paired with the row values by position.
medians = sample.median()
thresholds = [medians[col] for col in medians.index]
x_bin = [int(xi > limit) for xi, limit in zip(x, thresholds)]
y_bin = [int(yi > limit) for yi, limit in zip(y, thresholds)]

intersection = sum(xb & yb for xb, yb in zip(x_bin, y_bin))
union = sum(xb | yb for xb, yb in zip(x_bin, y_bin))

# With no set bits in either vector the ratio is undefined; report 0.0 in that case.
if union != 0:
    jaccard_sim_manual = intersection / union
else:
    jaccard_sim_manual = 0.0
print(f"Jaccard Similarity (manual): {jaccard_sim_manual:.4f}")

# 4. Euclidean Distance manually: square root of the summed squared differences
squared_diffs = [(xi - yi)**2 for xi, yi in zip(x, y)]
euclidean_dist_manual = np.sqrt(sum(squared_diffs))
print(f"Euclidean Distance (manual): {euclidean_dist_manual:.4f}")

print("\nSample data used for similarity calculations:")
print(sample)

Pearson's Correlation (manual): 1.0000
Cosine Similarity (manual): 0.9852
Jaccard Similarity (manual): 0.0000
Euclidean Distance (manual): 5.6145

Sample data used for similarity calculations:
      Age     Fare
709  28.0  15.2458
439  31.0  10.5000
