In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#SK-Learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
#Scalers
#https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
from sklearn.preprocessing import StandardScaler,MinMaxScaler
#Modelos Lineales
# https://scikit-learn.org/stable/modules/linear_model.html
from sklearn.linear_model import LinearRegression , Ridge
from sklearn.svm import SVR
#Metricas
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [8]:
data = pd.read_csv(r'C:\Users\SANTI\Desktop\UTN\Quinto Año\Ciencia de datos\Ejercicio Martin 17-10\Ejercicio-modelos-de-regresion-main\Xy_train.csv', delimiter=",")

In [9]:
data

Unnamed: 0,X,y
0,2.273360,6.054685
1,3.167583,4.581428
2,7.973655,5.392507
3,6.762547,3.108068
4,3.911096,4.225744
5,3.328139,5.404029
6,5.983088,3.804639
7,1.867342,5.753330
8,6.727560,4.649223
9,9.418029,11.237918


In [11]:
x = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

In [12]:
print("Los primeros 5 valores del array x: \n", x[:5])
print("\n")
print("Los primeros 5 valores del array y: \n", y[:5])

Los primeros 5 valores del array x: 
 [[2.27336022]
 [3.1675834 ]
 [7.97365457]
 [6.76254671]
 [3.91109551]]


Los primeros 5 valores del array y: 
 [6.05468511 4.58142822 5.39250705 3.10806751 4.22574359]


In [13]:
np.shape(x)

(100, 1)

In [14]:
np.shape(y)

(100,)

In [15]:
dataT = pd.read_csv(r'C:\Users\SANTI\Desktop\UTN\Quinto Año\Ciencia de datos\Ejercicio Martin 17-10\Ejercicio-modelos-de-regresion-main\X_test.csv', delimiter=",")

In [16]:
dataT.head(5)

Unnamed: 0,X
0,6.1708
1,6.302022
2,8.689293
3,2.376897
4,2.70732


In [19]:
x_pred = dataT.iloc[:,:,].values

In [20]:
print("Los primeros 5 valores del array x train: \n", x_pred[:5])

Los primeros 5 valores del array x train: 
 [[6.17080018]
 [6.30202215]
 [8.68929334]
 [2.37689724]
 [2.70731977]]


In [21]:
np.shape(x_pred)

(20, 1)

In [22]:
# Separamos set de entrenamiento y testeo
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.45, random_state=1)

In [23]:
# Definimos el escalador 
scaler = StandardScaler()
# Fiteamos para los x_train
scaler = scaler.fit(x_train)

In [24]:
# Transformamos los x_train
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [25]:
# Vamos a crear un dataframe para guardar los resultados de cada estimador
results_df = pd.DataFrame(columns=['Model','Features','R2','MSE','MAE'])
results_df

Unnamed: 0,Model,Features,R2,MSE,MAE


# Regresión Lineal

In [26]:
#Definimos el estimador
est = LinearRegression()
#Definimos diccionario con hyperparametros
parameters = {'fit_intercept' : [False, True]}
#Definimos la cantidad de folds para validar
n_folds = 5
#Definimos el objeto grid search con los parametros anteriores
gs = GridSearchCV(est, param_grid=parameters, cv=n_folds, refit=True, scoring="neg_mean_squared_error")

In [27]:
#Entrenamos
gs.fit(x_train_scaled, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'fit_intercept': [False, True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [28]:
# Mostramos el mejor estimador, sus hyperparametros seleccionados y su resultado
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")
print(gs.best_score_, "\n")

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False) 

{'fit_intercept': True} 

-3.5312076237260905 



In [29]:
# Usamos x_test para predecir con el mejor estimador
linear_prediction = gs.best_estimator_.predict(x_test_scaled)
#Calculamos las metricas R2, MSE y MAE
linear_r2 = r2_score(y_true=y_test, y_pred=linear_prediction)
linear_mse = mean_squared_error(y_true=y_test, y_pred=linear_prediction)
linear_mae = mean_absolute_error(y_true=y_test, y_pred=linear_prediction)

In [30]:
print(f'R2 score: {linear_r2:.6f}')
print(f'MAE: {linear_mae:.6f}')
print(f'MSE: {linear_mse:.6f}')

R2 score: 0.411095
MAE: 1.488374
MSE: 3.841104


In [31]:
#Guardamos los resultados
results_df = results_df.append({'Model':'Linear',
                                'Features':'Lineal',
                                'R2':linear_r2,
                                'MSE':linear_mse,
                                'MAE':linear_mae},ignore_index=True)
results_df

Unnamed: 0,Model,Features,R2,MSE,MAE
0,Linear,Lineal,0.411095,3.841104,1.488374


# Regresion Ridge

In [32]:
est = Ridge()
# Lista del parametro lambda (llamado 'alpha' en el GridSearch )
lambdas = [0.001,0.005,0.01,0.02,0.05,0.1,0.2,0.3,0.4,0.5,1]
# Juntamos el diccionario de parametros
parameters = {'alpha': lambdas}
# Definimos nuevamente el objeto GS con los parametros previamente definidos
gs = GridSearchCV(est, param_grid=parameters,refit=True, cv=n_folds, scoring="neg_mean_squared_error")

In [33]:
# Entrenamos
gs.fit(x_train_scaled, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [34]:
# Mostramos el mejor estimador, sus hyperparametros seleccionados y su resultado
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")
print(gs.best_score_, "\n")

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001) 

{'alpha': 1} 

-3.5264614355737947 



In [35]:
#Predicciones + Metricas
ridge_prediction = gs.best_estimator_.predict(x_test_scaled)
ridge_r2 = r2_score(y_true=y_test, y_pred=ridge_prediction)
ridge_mse = mean_squared_error(y_true=y_test, y_pred=ridge_prediction)
ridge_mae = mean_absolute_error(y_true=y_test, y_pred=ridge_prediction)

In [36]:
print(f'R2 score: {ridge_r2:.6f}')
print(f'MAE: {ridge_mse:.6f}')
print(f'MSE: {ridge_mae:.6f}')

R2 score: 0.408755
MAE: 3.856371
MSE: 1.490024


In [37]:
#Guardamos resultados
results_df = results_df.append({'Model':'Ridge',
                                'Features':'Lineal',
                                'R2':ridge_r2,
                                'MSE':ridge_mse,
                                'MAE':ridge_mae},ignore_index=True)

In [38]:
results_df.drop(results_df.index[2:5],0,inplace=True)

In [39]:
results_df

Unnamed: 0,Model,Features,R2,MSE,MAE
0,Linear,Lineal,0.411095,3.841104,1.488374
1,Ridge,Lineal,0.408755,3.856371,1.490024


# Regresion Support Vector

In [40]:
est = SVR(max_iter=25000)
# Lista del parametro lambda (parametro 'alpha')
parameters = {'C' : [1000,1500,2000,3000,5000],
              'epsilon' : [0.001, 0.01, 0.1,1, 10,100],
             'gamma':[0.001, 0.01, 0.1,1, 10,100]}
# Definimos nuevamente el objeto GS con los parametros previamente definidos
gs = GridSearchCV(est, param_grid=parameters,refit=True,
                  cv=n_folds, scoring="neg_mean_squared_error",
                  verbose=3, n_jobs=3)

In [41]:
# Entrenamos
gs.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  29 tasks      | elapsed:    2.3s
[Parallel(n_jobs=3)]: Done 900 out of 900 | elapsed:    4.9s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=25000, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'C': [1000, 1500, 2000, 3000, 5000], 'epsilon': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=3)

In [42]:
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")
print(gs.best_score_, "\n")

SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
  kernel='rbf', max_iter=25000, shrinking=True, tol=0.001, verbose=False) 

{'C': 1000, 'epsilon': 0.1, 'gamma': 0.1} 

-0.6561644034079556 



In [43]:
# Prediccion + Metricas
svr_prediction = gs.best_estimator_.predict(x_test_scaled)
svr_r2 = r2_score(y_true=y_test, y_pred=svr_prediction)
svr_mse = mean_squared_error(y_true=y_test, y_pred=svr_prediction)
svr_mae = mean_absolute_error(y_true=y_test, y_pred=svr_prediction)

In [44]:
print(f'R2 score: {svr_r2:.6f}')
print(f'MAE: {svr_mae:.6f}')
print(f'MSE: {svr_mse:.6f}')

R2 score: 0.912160
MAE: 0.607194
MSE: 0.572930


In [45]:
results_df = results_df.append({'Model':'SVR',
                                'Features':'Linear',
                                'R2':svr_r2,
                                'MSE':svr_mse,
                                'MAE':svr_mae},ignore_index=True)

In [46]:
results_df

Unnamed: 0,Model,Features,R2,MSE,MAE
0,Linear,Lineal,0.411095,3.841104,1.488374
1,Ridge,Lineal,0.408755,3.856371,1.490024
2,SVR,Linear,0.91216,0.57293,0.607194


In [48]:
results_df

Unnamed: 0,Model,Features,R2,MSE,MAE
0,Linear,Lineal,0.411095,3.841104,1.488374
1,Ridge,Lineal,0.408755,3.856371,1.490024
2,SVR,Linear,0.91216,0.57293,0.607194


In [49]:
#### Elijo SVR para predecir los resultados de x_pred ####

In [50]:
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")

SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
  kernel='rbf', max_iter=25000, shrinking=True, tol=0.001, verbose=False) 

{'C': 1000, 'epsilon': 0.1, 'gamma': 0.1} 



In [51]:
svr_prediction = gs.best_estimator_.predict(x_pred)

In [52]:
svr_prediction

array([26.0626574 , 24.75332454, 10.64254123, 19.93175723, 25.20231974,
       22.32180796, 11.10656014,  4.00796065,  9.89952789, 24.69890603,
       32.86873747, 23.63968284,  9.85025029,  3.55528453, 38.79743123,
       12.29593666, 37.27164459, 27.65681668, 17.42496266, 24.97247053])