In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
import pandas as pd
import numpy as np


In [38]:
#loading titanic data set
data = pd.read_csv("/content/drive/MyDrive/Dataset/Titanic-Dataset.csv")

In [39]:
#drop all categorical columns except survived
categorical_column = data.select_dtypes(include = ["object"]).columns
# print(categorical_column)
data = data.drop(columns=[col for col in categorical_column if col != "Survived"])
print(data)

     PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare
0              1         0       3  22.0      1      0   7.2500
1              2         1       1  38.0      1      0  71.2833
2              3         1       3  26.0      0      0   7.9250
3              4         1       1  35.0      1      0  53.1000
4              5         0       3  35.0      0      0   8.0500
..           ...       ...     ...   ...    ...    ...      ...
886          887         0       2  27.0      0      0  13.0000
887          888         1       1  19.0      0      0  30.0000
888          889         0       3   NaN      1      2  23.4500
889          890         1       1  26.0      0      0  30.0000
890          891         0       3  32.0      0      0   7.7500

[891 rows x 7 columns]


In [40]:
#check for missing values
missing_info = data.isnull().sum() /len(data) *100
print(missing_info)

PassengerId     0.00000
Survived        0.00000
Pclass          0.00000
Age            19.86532
SibSp           0.00000
Parch           0.00000
Fare            0.00000
dtype: float64


In [41]:
#handeling missing values
for column in data.columns:
  if missing_info[column] > 10:
    data[column].fillna(data[column].mean, inplace=True)
  else:
    data.dropna(subset=[column],inplace=True)

1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64>' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  data[column].fillna(data[column].mean, inplace=True)


In [42]:
#displayed cleaned data
print("Data after processing:\n",data.head())
print("\nMissing values after processing:\n",data.isnull().sum())

Data after processing:
    PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare
0            1         0       3  22.0      1      0   7.2500
1            2         1       1  38.0      1      0  71.2833
2            3         1       3  26.0      0      0   7.9250
3            4         1       1  35.0      1      0  53.1000
4            5         0       3  35.0      0      0   8.0500

Missing values after processing:
 PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
dtype: int64


In [43]:
#seperate features X and Y
X = data.drop(columns=["Survived"]).values
Y=data["Survived"].values

In [44]:
#define a function for train-test split from scratch
def train_test_split_scratch(X,Y, test_size=0.3,random_seed=42):
  np.random.seed(random_seed)
  indices = np.arange(X.shape[0])
  np.random.shuffle(indices)
  test_split_size = int(len(X)*test_size)
  test_indices = indices[:test_split_size]
  train_indices = indices[test_split_size:]
  X_train, X_test = X[train_indices], X[test_indices]
  Y_train, Y_test = Y[train_indices], Y[test_indices]
  return X_train,X_test,Y_train,Y_test

In [45]:
X_train, X_test, Y_train, Y_test = train_test_split_scratch(X,Y, test_size=0.3)


In [46]:
print("Shape of X_train:",X_train.shape)
print("Shape of Y_train:",Y_train.shape)
print("Shape of X_test:",X_test.shape)
print("Shape of Y_test:",Y_test.shape)

Shape of X_train: (624, 6)
Shape of Y_train: (624,)
Shape of X_test: (267, 6)
Shape of Y_test: (267,)


In [47]:
def euclidean_distance(point1,point2):
  if point1.shape != point2.shape:
    raise ValueError("Point must have ethe same dimensions to calculate Euclidean distance.")
  distance = np.sqrt(np.sum((point1-point2) ** 2))
  return distance

In [48]:
try:
  point1 = np.array([3,4])
  point2 = np.array([0,0])
  result = euclidean_distance(point1, point2)
  expected_result = 5.0
  assert np.isclose(result,expected_result),f"Expected {expected_result}, but got {result} "
  print("test passed sucessfully")
except ValueError as ve:
  print(f"ValueError: {ve}")
except  AssertionError as ae:
  print(f"AssertionError: {ae}")
except Exception as e:
  print(f"Exception: {e}")


test passed sucessfully


In [49]:
def knn_predict_single(query, X_train, Y_train, k=3):
  distance = [euclidean_distance (query,x) for x in X_train]
  sorted_indices = np.argsort(distance)
  nearest_indices=sorted_indices[:k]
  nearest_labels = Y_train[nearest_indices]
  prediction = np.bincount(nearest_labels).argmax()
  return prediction

In [50]:
def knn_predict(X_test, X_train, Y_train, k =3):
  prediction = [knn_predict_single(x,X_train,Y_train,k) for x in X_test]
  return np.array(prediction)

In [51]:
try:
# Define the test set for the test case
  X_test_sample = X_test[:5] # Taking a small subset for testing
  y_test_sample = Y_test[:5] # Corresponding labels for the subset
  # Make predictions
  predictions = knn_predict(X_test_sample, X_train, Y_train, k=3)
  # Print test results
  print("Predictions:", predictions)
  print("Actual labels:", y_test_sample)
  # Check if predictions match expected format
  assert predictions.shape == y_test_sample.shape, "The shape of predictions does not match the shape of the actual labels."
  print("Test case passed successfully!")
except AssertionError as ae:
  print(f"AssertionError: {ae}")
except Exception as e:
  print(f"An unexpected error occurred: {e}")


An unexpected error occurred: unsupported operand type(s) for -: 'method' and 'float'


In [52]:
def compute_accuracy(y_true, y_pred):

  correct_predictions = np.sum(y_true == y_pred)
  total_predictions = len(y_true)
  accuracy = (correct_predictions / total_predictions) * 100
  return accuracy

In [53]:
try:
  # Make predictions on the entire test set
  predictions = knn_predict(X_test, X_train, Y_train, k=3)
  # Compute the accuracy
  accuracy = compute_accuracy(Y_test, predictions)
  # Print the accuracy
  print(f"Accuracy of the KNN model on the test set: {accuracy:.2f}%")
except Exception as e:
  print(f"An unexpected error occurred during prediction or accuracy computation: {e}")

An unexpected error occurred during prediction or accuracy computation: unsupported operand type(s) for -: 'method' and 'float'


In [54]:
import matplotlib.pyplot as plt
def experiment_knn_k_values(X_train, y_train, X_test, y_test, k_values):
  accuracies = {}
  for k in k_values:
    # Make predictions using the current value of k
    predictions = knn_predict(X_test, X_train, Y_train, k=k)
    # Compute the accuracy
    accuracy = compute_accuracy(y_test, predictions)
    accuracies[k] = accuracy
    print(f"Accuracy for k={k}: {accuracy:.2f}%")
    # Plot the accuracies
  plt.figure(figsize=(10, 5))
  plt.plot(k_values, list(accuracies.values()), marker="o")
  plt.xlabel("k (Number of Neighbors)")
  plt.ylabel("Accuracy (%)")
  plt.title("Accuracy of KNN with Different Values of k")
  plt.grid(True)
  plt.show()
  return accuracies

In [55]:
k_values = range(1,20) # You can adjust this range as needed
# Run the experiment
try:
  accuracies = experiment_knn_k_values(X_train, Y_train, X_test, Y_test, k_values)
  print("Experiment completed. Check the plot for the accuracy trend.")
except Exception as e:
  print(f"An unexpected error occurred during the experiment: {e}")

An unexpected error occurred during the experiment: unsupported operand type(s) for -: 'method' and 'float'
