## Import Libraries

In [135]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance

## Load Dataset

In [136]:
data = load_wine().data
target = load_wine().target

f_names = load_wine().feature_names
f_targets = load_wine().target_names

In [137]:
df = pd.DataFrame(data, columns=f_names)

## Preprocessing

In [138]:
# train_test split
y = df['color_intensity']
X = df.drop(['color_intensity'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=42,
                                                    )

In [139]:
# scale the features
mms = MinMaxScaler()
mms.fit(X_train)
X_train = mms.transform(X_train)
X_test = mms.transform(X_test)

### Outlier Detection and Removal with IsolationForest

> IsolationForest: This is an ensemble method used for anomaly detection (outlier detection). It works by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature. The key idea is that outliers are "few and different," so they can be isolated quickly. The more isolation (splits) required to separate a data point, the more likely it is an inlier.

In [140]:
# Detect outliers
iso = IsolationForest(contamination=0.05, random_state=42)  # contamination specifies the proportion of the dataset that is expected to be outliers
yhat = iso.fit_predict(X_train)
# -1: Indicates that the data point is considered an outlier (anomalous)
mask = yhat != -1
# X_train[mask, :]: Filters the X_train array, keeping only the rows (data points) where the mask is True (inliers)
X_train, y_train = X_train[mask, :], y_train[mask]

### Polynomial Features


> PolynomialFeatures: This is a utility provided by scikit-learn that generates new features by taking existing features and creating interactions between them, as well as raising them to a certain power (degree). It's a common technique in machine learning to allow models to capture non-linear relationships in the data.



> degree=2: This parameter specifies the degree of the polynomial features. A degree of 2 means that it will create:

*   Original features (degree 1)
*   Squared features (degree 2)
*   Interaction features (product of pairs of features).

In [141]:
# Polynomial Features (Adjust degree or features as necessary)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

### Dimensionality Reduction

In [142]:
# Dimensionality Reduction with PCA
pca = PCA(n_components=0.97)  # Retain 97% of variance
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

### Hyperparameter Tuning

In [143]:
# Hyperparameter Tuning with Grid Search
param_grid = {
    'n_neighbors': range(3, 11),
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # Only relevant for Minkowski metric
}

grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=10, scoring='r2', verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters and score
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

Fitting 10 folds for each of 96 candidates, totalling 960 fits
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 8, 'p': 1, 'weights': 'distance'}


## Model Training

In [144]:
# Train the model
knn_model = KNeighborsRegressor(n_neighbors=best_params['n_neighbors'],
                                metric=best_params['metric'],
                                weights=best_params['weights'],
                                p=best_params.get('p', 2))

knn_model.fit(X_train, y_train)

In [145]:
# Predict and evaluate
y_pred = knn_model.predict(X_test)

# evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'R2: {r2}, MSE: {mse}, MAE: {mae}')

R2: 0.6098865986616724, MSE: 2.006912280727789, MAE: 1.065897805687784


## Visualization

In [146]:
# 1. Actual vs. Predicted Values Plot (Plotly)
fig_actual_vs_pred = go.Figure()

# Scatter plot of actual vs predicted values
fig_actual_vs_pred.add_trace(go.Scatter(
    x=y_test,
    y=y_pred,
    mode='markers',
    marker=dict(color='blue', opacity=0.6),
    name='Predicted vs Actual'
))

# Add a diagonal line representing perfect predictions
fig_actual_vs_pred.add_trace(go.Scatter(
    x=[min(y_test), max(y_test)],
    y=[min(y_test), max(y_test)],
    mode='lines',
    line=dict(color='red', dash='dash'),
    name='Perfect Prediction'
))

fig_actual_vs_pred.update_layout(
    title='Actual vs. Predicted Values',
    xaxis_title='Actual Values',
    yaxis_title='Predicted Values',
    showlegend=True
)

fig_actual_vs_pred.show()



> A Residual Plot is crucial for validating the assumptions of a regression model and ensuring that the model is performing well. In a well-fitted model, residuals should be randomly scattered around zero, indicating that the model captures the underlying data patterns without systematic bias.

*Mathematically:  Residual = Actual Value − Predicted Value*

Residuals measure the error in your predictions. A residual of 0 means the prediction was perfect for that data point.

In [147]:
# 2. Residual Plot (Plotly)
residuals = y_test - y_pred

fig_residuals = go.Figure()

# Scatter plot of residuals vs predicted values
fig_residuals.add_trace(go.Scatter(
    x=y_pred,
    y=residuals,
    mode='markers',
    marker=dict(color='purple', opacity=0.6),
    name='Residuals'
))

# Add a horizontal line at zero
fig_residuals.add_trace(go.Scatter(
    x=[min(y_pred), max(y_pred)],
    y=[0, 0],
    mode='lines',
    line=dict(color='red', dash='dash'),
    name='Zero Error Line'
))

fig_residuals.update_layout(
    title='Residual Plot',
    xaxis_title='Predicted Values',
    yaxis_title='Residuals',
    showlegend=True
)

fig_residuals.show()

In [148]:
# 3. Distribution of Residuals (Plotly)
fig_residual_dist = ff.create_distplot([residuals], ['Residuals'], bin_size=0.1, colors=['purple'])
fig_residual_dist.update_layout(title='Distribution of Residuals')
fig_residual_dist.show()

In [149]:
# 4. Feature Importance (Approximation using Permutation Importance)
perm_importance = permutation_importance(knn_model, X_test, y_test, n_repeats=10, random_state=42)
feature_importance_values = perm_importance.importances_mean

fig_feature_importance = go.Figure()

fig_feature_importance.add_trace(go.Bar(
    x=X.columns,
    y=feature_importance_values,
    marker=dict(color='green')
))

fig_feature_importance.update_layout(
    title='Feature Importance (Approximation)',
    xaxis_title='Features',
    yaxis_title='Importance'
)

fig_feature_importance.show()

In [150]:
# 5. 3D Scatter Plot (Using three features)

# Create the 3D scatter plot
fig_3d = go.Figure()

fig_3d.add_trace(go.Scatter3d(
    x=X_test[:, 0],  # Alcohol feature (Index 0)
    y=X_test[:, 9],  # Color Intensity feature (Index 9)
    z=X_test[:, 1],  # Malic Acid feature (Index 1)
    mode='markers',
    marker=dict(size=5, color=y_pred, colorscale='Viridis', opacity=0.8),
    text=y_test,
    name='3D Scatter'
))

fig_3d.update_layout(
    title='3D Scatter Plot of Predictions',
    scene=dict(
        xaxis_title='Alcohol',
        yaxis_title='Color Intensity',
        zaxis_title='Malic Acid'
    )
)

fig_3d.show()