In [90]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [91]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [92]:
total_NaN=data.isna().sum().sum()
total_NaN

866

In [93]:
percent_NaN = (total_NaN/(data.shape[0]*data.shape[1]))*100
percent_NaN

8.099513655069211

In [94]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [95]:
data = data.dropna()
data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [96]:
X = pd.DataFrame(data, columns=('Age', 'Fare', 'SibSp', 'Parch'))
y = pd.Series(data['Fare'])
print(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

      Age     Fare  SibSp  Parch
1    38.0  71.2833      1      0
3    35.0  53.1000      1      0
6    54.0  51.8625      0      0
10    4.0  16.7000      1      1
11   58.0  26.5500      0      0
..    ...      ...    ...    ...
871  47.0  52.5542      1      1
872  33.0   5.0000      0      0
879  56.0  83.1583      0      1
887  19.0  30.0000      0      0
889  26.0  30.0000      0      0

[183 rows x 4 columns]


In [97]:
#=====================================
# K Nearest Neighbour Without MSE
#=====================================

KNN = KNeighborsRegressor()
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
mse_lr = mse(y_test, y_pred)
print(f"MSE for K Nearest Neighbour: {mse_lr:.4f}")

MSE for K Nearest Neighbour: 44.5402


In [98]:
#====================================
# Apply different scaling techniques and compare
#====================================

scalers = {
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

for scaler_name, scaler in scalers.items():
    # Apply scaler to both training and test sets
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # k-NN with scaling
    model_knn_scaled = KNeighborsRegressor(n_neighbors=5)
    model_knn_scaled.fit(X_train_scaled, y_train)
    y_pred_knn_scaled = model_knn_scaled.predict(X_test_scaled)
    mse_knn_scaled = mse(y_test, y_pred_knn_scaled)

    # Output results for both models
    print(f"\n{scaler_name}:")
    print(f"  k-NN MSE with scaling: {mse_knn_scaled:.4f}")


MinMaxScaler:
  k-NN MSE with scaling: 558.2691

StandardScaler:
  k-NN MSE with scaling: 372.3214

RobustScaler:
  k-NN MSE with scaling: 142.4257
