In [1]:
import os
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
os.chdir('../../')

In [3]:
selected_data = pd.read_csv('data/selected/data.csv')

In [4]:
X, y = selected_data.drop('Ilg', axis=1), selected_data['Ilg']

In [5]:
reg = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', KNeighborsRegressor())
])

In [6]:
params = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'estimator__n_neighbors': [1, 3, 5, 7, 9, 11, 15],
    'estimator__weights': ['uniform', 'distance'],
    'estimator__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'estimator__leaf_size': [10, 20, 30, 40, 50],
    'estimator__metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
    'estimator__p': [1, 2],
    'estimator__metric_params': [None, {'p': 2}, {'p': 1}] 
}

In [8]:
with mlflow.start_run():
    
    grid = GridSearchCV(estimator=reg, param_grid=params, scoring='neg_mean_squared_error', cv=5)
    grid.fit(X, y)

    mlflow.log_param('model_type', 'regression')
    for param, value in grid.best_params_.items():
        mlflow.log_param(param, value)
    
    mlflow.log_metric("best_score", np.sqrt(-grid.best_score_))
    mlflow.sklearn.log_model(grid.best_estimator_, 'KNeighborsRegressor')

    print('Best score: ', np.sqrt(-grid.best_score_))
    print('Best params: ', grid.best_params_)

    # Dado que KNeighborsRegressor no tiene coeficientes, se puede graficar la importancia de las características
    # utilizando la media de las distancias a los vecinos más cercanos.
    # Aquí se puede calcular la importancia de las características de otra manera, por ejemplo, usando la media de las distancias.
    
    distances, indices = grid.best_estimator_.named_steps['estimator'].kneighbors(X)
    mean_distances = np.mean(distances, axis=1)

    feature_importance = np.mean(np.abs(X), axis=0)
    
    plt.figure(figsize=(10, 8))
    sns.barplot(x=feature_importance, y=X.columns, palette='viridis') 
    plt.title('Feature Importance', fontsize=16)
    plt.xlabel('Mean', fontsize=14)
    plt.ylabel('Features', fontsize=14)
    plt.axvline(0, color='grey', linestyle='--')

    mlflow.log_figure(plt.gcf(), 'feature_importance.png')
    plt.show()

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


KeyboardInterrupt: 