In [1]:
%load_ext autoreload
%autoreload 2

#!pip install openml --quiet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import openml
from gradient_descent import GDRegressor
from knn import KNN
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import time

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
#Importing dataset from openml
diamonds = openml.datasets.get_dataset(42225)

  diamonds = openml.datasets.get_dataset(42225)


In [5]:
X, y, categorical_indicator, attribute_names = diamonds.get_data(
    target=diamonds.default_target_attribute,
    dataset_format='dataframe'
)

X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [6]:
#Checking for missing values
X.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
dtype: int64

In [7]:
#Checking the distribution of the target variable
y.value_counts()

price
605      132
802      127
625      126
828      125
776      124
        ... 
8816       1
14704      1
14699      1
14698      1
9793       1
Name: count, Length: 11602, dtype: int64

In [8]:
if 'price' in X.columns:
    X = X.drop('price', axis=1)

#Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
le = preprocessing.LabelEncoder()
for column in ['cut', 'color', 'clarity']:
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
start_time = time.time()
model = GDRegressor(learning_rate=0.1, max_iter=1000)
model.fit(X_train_scaled, y_train)
end_time = time.time()
print(f"Custom Model: Execution Time = {end_time - start_time} seconds")
predictions = model.predict(X_test_scaled)

#Plotting cost history
model.plot_cost_history()

In [None]:
plt.figure(figsize=(12, 6))

#Scatter plot for Gradient Descent predictions
plt.subplot(1, 2, 1)
plt.scatter(y_test, predictions)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('Gradient Descent: True vs Predicted Values')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red')
plt.tight_layout()
plt.show()

In [None]:
#SGDRegressor
start_time = time.time()
sgd_model = SGDRegressor()
sgd_model.fit(X_train_scaled, y_train)
end_time = time.time()
print(f"SGDRegressor: Execution Time = {end_time - start_time} seconds")
sgd_predictions = sgd_model.predict(X_test_scaled)

In [None]:
#RandomForestRegressor
start_time = time.time()
rf_model = RandomForestRegressor()
rf_model.fit(X_train_scaled, y_train)
end_time = time.time()
print(f"RandomForestRegressor: Execution Time = {end_time - start_time} seconds")
rf_predictions = rf_model.predict(X_test_scaled)

In [None]:
#Computing metrics
custom_mse = mean_squared_error(y_test, predictions)
sgd_mse = mean_squared_error(y_test, sgd_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)

custom_r2 = r2_score(y_test, predictions)
sgd_r2 = r2_score(y_test, sgd_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print(f"Custom Model: MSE = {custom_mse}, R2 = {custom_r2}")
print(f"SGDRegressor: MSE = {sgd_mse}, R2 = {sgd_r2}")
print(f"RandomForestRegressor: MSE = {rf_mse}, R2 = {rf_r2}")

In [None]:
#KNN

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

start_time = time.time()
knn = KNN(k=3)
knn.fit(X_train_scaled, y_train)
end_time = time.time()
print(f"Custom KNN Fit: Execution Time = {end_time - start_time} seconds")
knn_predictions = knn.predict(X_test_scaled)

#Calculating performance metrics
knn_mse = mean_squared_error(y_test, knn_predictions)
knn_r2 = r2_score(y_test, knn_predictions)
print(f"KNN: MSE = {knn_mse}, R2 = {knn_r2}")

In [None]:
#Scatter plot for KNN predictions
plt.subplot(1, 2, 2)
plt.scatter(y_test, knn_predictions)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('KNN: True vs Predicted Values')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red')
plt.tight_layout()
plt.show()

In [None]:
#KNeighborsRegressor

knn_sklearn = KNeighborsRegressor(n_neighbors=3)

start_time = time.time()
knn_sklearn.fit(X_train_scaled, y_train)
end_time = time.time()
print(f"KNeighborsRegressor Fit: Execution Time = {end_time - start_time} seconds")

knn_sklearn_predictions = knn_sklearn.predict(X_test_scaled)

#Calculating performance metrics
knn_sklearn_mse = mean_squared_error(y_test, knn_sklearn_predictions)
knn_sklearn_r2 = r2_score(y_test, knn_sklearn_predictions)
print(f"KNeighborsRegressor: MSE = {knn_sklearn_mse}, R2 = {knn_sklearn_r2}")

In [41]:
import sys
sys.path.append('../')

from janick.knn import KNNRegressor
import scipy.spatial

print(np.argsort(y_train))

start_time = time.time()
model = KNNRegressor(n_neighbors=20)

model.fit(X_train_scaled,y_train)
end_time = time.time()
print(f"Custom KNN Fit: Execution Time = {end_time - start_time} seconds")

pred = model.predict(X_test_scaled)
print(pred)

#Calculating performance metrics
knn_mse = mean_squared_error(y_test, pred, squared=False)
knn_r2 = r2_score(y_test, pred)
print(f"KNN: MSE = {knn_mse}, R2 = {knn_r2}")

26546     2711
9159      7077
14131    32785
15757    20998
24632    28411
         ...  
11284    31149
44732    25481
38158    31157
860      38945
15795    15304
Name: price, Length: 43152, dtype: int64
<class 'numpy.ndarray'>
Custom KNN Fit: Execution Time = 0.04031968116760254 seconds
[ 685.65 2310.95 1171.5  ...  940.85 9785.05 3871.4 ]
KNN: MSE = 850.6871428829035, R2 = 0.9544771502098657


In [None]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

X_normalized = X_train_scaled
y = y_train

X_test_normalized = X_test_scaled

# Initialize a dictionary to store training times
training_times = {}
models = {
    'KNN ours': KNNRegressor(n_neighbors=20),
    'GD ours': GDRegressor(0.1, 1000),
    'KNN': KNeighborsRegressor(n_neighbors=20),
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'SVR': SVR(),
    'SGD': SGDRegressor()
}

# models = {
#     'KNN ours': KNNRegressor(n_neighbors=17, strategy='distance'),
#     'KNN': KNeighborsRegressor(n_neighbors=17, weights='distance')
# }

# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model in models.items():
    mse_scores = []
    
    for train_index, val_index in kf.split(X_normalized):
        X_train, X_val = X_normalized.iloc[train_index], X_normalized.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        start_time = time.time()
        model.fit(X_train, y_train)
        end_time = time.time()
        
        training_times[model_name] = end_time - start_time
        
        y_val_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_val_pred, squared=False)
        mse_scores.append(mse)
    
    # Output the mean and standard deviation of the MSE scores
    mean_mse = np.mean(mse_scores)
    std_mse = np.std(mse_scores)
    
    print(f'{model_name} Mean MSE: {mean_mse}')
    print(f'{model_name} Standard Deviation of MSE: {std_mse}')
    
    # Train final model on full training data and evaluate on the test set
    model.fit(X_normalized, y)
    y_test_pred = model.predict(X_test_normalized)
    test_mse = mean_squared_error(y_test, y_test_pred, squared=False)
    test_r2 = r2_score(y_test, y_test_pred)
    
    print(f'{model_name} Test MSE: {test_mse}')
    print(f'{model_name} Test R^2: {test_r2}')
    print(f'{model_name} training time: {training_times[model_name]:.4f} seconds')
    print("-" * 50)

# Print training times
for model_name, train_time in training_times.items():
    print(f"{model_name} training time: {train_time:.4f} seconds")

