In [None]:
!pip install pyspark

In [None]:
import os

from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("TitanicKNN").getOrCreate()

# Location of the Titanic CSV. The default is the original author's local path;
# set the TITANIC_CSV_PATH environment variable to run this cell elsewhere.
TITANIC_CSV_PATH = os.environ.get(
    "TITANIC_CSV_PATH",
    "/Users/saimilind/Desktop/MLAssignments/titanicdataset.csv",
)

# Load data (first row is the header; column types are inferred by Spark)
train_data = spark.read.csv(TITANIC_CSV_PATH, header=True, inferSchema=True)

# Inspect the inferred schema and the first five rows
train_data.printSchema()
train_data.show(5)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.datasets import fetch_openml

# plot_confusion_matrix was removed in scikit-learn 1.2. Importing it
# unconditionally makes this whole cell fail on current releases, so keep the
# name available where it exists and fall back to None otherwise.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

# Load the Titanic dataset from OpenML as a pandas DataFrame
titanic = fetch_openml(name='titanic', version=1, as_frame=True)
df = titanic.frame

# Preview the first rows instead of dumping the whole frame
df.head()

In [None]:

# Keep only the rows whose survival label is present
df = df[df["survived"].notna()]

# Predictor columns and the target column
feature_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare']
X = df.loc[:, feature_cols]
y = df.loc[:, 'survived']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:

# Write standard scaler from scratch
def custom_standard_scaler(X, mean=None, std=None):
    """Standardize the columns of ``X`` to zero mean and unit variance.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        Samples in rows, features in columns.
    mean, std : numpy.ndarray or pandas.Series, optional
        Per-column statistics to scale with. When omitted they are computed
        from ``X`` itself (the original behaviour). Pass the statistics of the
        training split to transform a test split consistently.

    Returns
    -------
    The element-wise result of ``(X - mean) / std``.
    """
    if mean is None:
        mean = np.mean(X, axis=0)
    if std is None:
        std = np.std(X, axis=0)

    # A constant column has std == 0 and would produce inf/NaN on division.
    # Adding the boolean mask turns every zero into 1 (so the column scales to
    # all zeros) and leaves non-zero entries and any index labels untouched.
    std = std + (std == 0)

    scaled_X = (X - mean) / std
    return scaled_X

# Learn all preprocessing statistics from the training split only and apply
# them to both splits, so no information from the test split leaks in.
train_feature_means = X_train.mean()

# Fill missing feature values with the training means before scaling: the
# cross-validation in the following cell needs complete feature rows.
X_train_filled = X_train.fillna(train_feature_means)
X_test_filled = X_test.fillna(train_feature_means)

# Scale the data using the custom standard scaler
X_train_scaled = custom_standard_scaler(X_train_filled)

# The test split is standardized with the training mean/std, not its own, so
# both splits live in the same feature space. ddof=0 matches the population
# standard deviation that np.std computes inside custom_standard_scaler.
X_test_scaled = (X_test_filled - X_train_filled.mean()) / X_train_filled.std(ddof=0)


In [None]:
# Determine the K value by 5-fold cross-validated accuracy on the training split
k_values = range(1, 21)
accuracy_scores = []

# The scaled training features may still contain NaN at this point, and the
# KNN fits inside cross_val_score fail on NaN. Cross-validate on a mean-imputed
# copy: this is a no-op when the data is already complete and leaves
# X_train_scaled itself untouched.
X_train_cv = X_train_scaled.fillna(X_train_scaled.mean())

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_cv, y_train, cv=5, scoring='accuracy')
    accuracy_scores.append(np.mean(scores))

# Report the best K value (first K reaching the highest mean accuracy)
best_k = k_values[int(np.argmax(accuracy_scores))]
print(f"The best K value is: {best_k}")

In [None]:
# Plot mean cross-validated accuracy for every candidate K
fig, ax = plt.subplots()
ax.plot(k_values, accuracy_scores, marker='o')
ax.set_xlabel('K Value')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs. K Value')
plt.show()

In [None]:

import pandas as pd

# Columns of the scaled training features that may contain missing values
columns_to_fill = ['age', 'fare']

# Replace missing entries with the mean of the same column
# (pandas' mean() ignores NaN by default)
X_train_scaled[columns_to_fill] = X_train_scaled[columns_to_fill].fillna(
    X_train_scaled[columns_to_fill].mean()
)

# Fail loudly if anything is still missing in the imputed columns
assert not X_train_scaled[columns_to_fill].isna().any().any(), \
    "NaN values remain in the training features after imputation"

# Show a short preview once instead of printing and re-displaying the full frame
X_train_scaled.head()


In [None]:
# Fit a KNN classifier on the scaled training data using the selected K;
# fit() returns the estimator itself, so construction and training are chained
final_knn_model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_scaled, y_train)
final_knn_model

In [None]:
# Columns of the scaled test features that may contain missing values
columns_to_fill = ['age', 'fare']

# Replace missing entries with the column means of the *training* features, so
# the test split is completed only with information learned from training data
X_test_scaled[columns_to_fill] = X_test_scaled[columns_to_fill].fillna(
    X_train_scaled[columns_to_fill].mean()
)

# Fail loudly if anything is still missing in the imputed columns
assert not X_test_scaled[columns_to_fill].isna().any().any(), \
    "NaN values remain in the test features after imputation"

# Show a short preview once instead of printing and re-displaying the full frame
X_test_scaled.head()


In [None]:
# Predict on the held-out split and tabulate predictions against the true labels
y_pred = final_knn_model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")