In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for file_path in (os.path.join(dirname, name) for name in filenames):
        print(file_path)

In [None]:
# Goal: predict the price of used Audi cars.
# Start by loading the data.
audi_csv_path = '/kaggle/input/used-car-dataset-ford-and-mercedes/audi.csv'
audi_data = pd.read_csv(audi_csv_path)
audi_data.head()

In [None]:
# Column names, dtypes and non-null counts.
audi_data.info()

In [None]:
# Summary statistics of the numeric columns.
audi_data.describe()

In [None]:
# Look at the distribution of every numeric column (one histogram per column).
audi_data.hist(bins = 50, figsize=(20,15))

In [None]:
# Number of rows per car model.
audi_data['model'].value_counts()

In [None]:
# Number of rows per transmission type.
audi_data['transmission'].value_counts()

In [None]:
# Number of rows per fuel type.
audi_data['fuelType'].value_counts() 

In [None]:
# Split the data into a training set (80%) and a test set (20%).
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    audi_data,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)
len(train_set), len(test_set)

In [None]:
# Inspect the correlations between the numeric columns.
# Only numeric columns are passed to corr(): `model`, `transmission` and `fuelType`
# hold strings, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# (older versions silently dropped those columns).
corr_matrix = audi_data.select_dtypes(include='number').corr()
corr_matrix['price']  # some features show an interesting correlation with the price

In [None]:
# Pairwise scatter plots of the columns of the data set.
from pandas.plotting import scatter_matrix
scatter_matrix(audi_data, figsize = (20,12))

In [None]:
# Data cleaning: there are no missing values, so the only step is to separate
# the predictor variables from the labels (the price).
audi_labels = train_set['price'].copy()
audi_predictors = train_set.drop(columns='price')

In [None]:
# Separate the categorical variables and encode them with a one-hot encoder.
from sklearn.preprocessing import OneHotEncoder

cat_columns = ['model', 'transmission', 'fuelType']
audi_cat = audi_predictors[cat_columns]
audi_predictors_num = audi_predictors.drop(columns=cat_columns)

cat_encoder = OneHotEncoder()
audi_cat_1hot = cat_encoder.fit_transform(audi_cat)
# Categories learned for each of the three columns.
cat_encoder.categories_

In [None]:
# Feature scaling: run the whole preprocessing through a single pipeline.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_attribs = list(audi_predictors_num)
cat_attribs = list(audi_cat)

# Numeric columns are standardized; categorical columns are one-hot encoded
# (categories not seen during fit are ignored at transform time).
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

# The transformed output is converted to a dense array with .toarray().
audi_prepared = full_pipeline.fit_transform(audi_predictors).toarray()
audi_prepared

In [None]:
# Baseline model: a Random Forest evaluated with 10-fold cross-validation.
# The hyperparameters are tuned afterwards with a randomized search.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Fixed seed so the cross-validation scores are reproducible from run to run
# (same seed as the forest used in the randomized search).
rf_reg = RandomForestRegressor(random_state = 42)
scores = cross_val_score(rf_reg, audi_prepared, audi_labels, scoring = 'neg_mean_squared_error', cv = 10)
# The scorer returns the negated MSE (higher is better): flip the sign before taking the root.
rf_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    Args:
        scores: object exposing ``mean()`` and ``std()`` methods
            (called here with a NumPy array of per-fold RMSE values).
    """
    print("Scores: ", scores)
    # Each statistic is computed right before it is printed, one line per statistic.
    for label, method_name in (("Mean: ", "mean"), ("Std: ", "std")):
        print(label, getattr(scores, method_name)())

# Report the per-fold RMSE values of the cross-validation with their mean and standard deviation.
display_scores(rf_rmse_scores)

In [None]:
# Randomized search over the Random Forest hyperparameters.
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Integer distributions to sample the candidate values from.
params_distrib = dict(
    n_estimators=randint(low=1, high=200),
    max_features=randint(low=1, high=8),
)
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(
    forest_reg,
    param_distributions=params_distrib,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
)

rnd_search.fit(audi_prepared, audi_labels)

In [None]:
# Hyperparameter combination that obtained the best cross-validation score.
rnd_search.best_params_

In [None]:
# Estimator built with the best hyperparameters found by the search.
rnd_search.best_estimator_

In [None]:
# Importance the best estimator assigns to each column of the prepared training matrix.
feature_importances = rnd_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Pair each importance with the name of the feature it belongs to.
# The ColumnTransformer outputs the numeric attributes first, followed by the one-hot
# columns of each categorical attribute, so the names are rebuilt in that same order.
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_)  # one array of categories per categorical column
atrributes = num_attribs + [category for categories in cat_one_hot_attribs for category in categories]
# zip() silently truncates on a length mismatch, which would mislabel the importances.
assert len(atrributes) == len(feature_importances), "feature names do not match the number of importances"
# Sorted from the most to the least important feature.
importancia = sorted(zip(feature_importances, atrributes), reverse = True)
importancia

In [None]:
import matplotlib.pyplot as plt

# Split the sorted (importance, name) pairs into bar heights and bar labels.
x_val = [importance for importance, _name in importancia]
y_val = [name for _importance, name in importancia]

plt.figure(figsize=(20, 7))
plt.bar(y_val, x_val)
plt.title('Importancia de las variables para la predicción')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()

In [None]:
# Evaluate the best estimator of the search on the held-out test set.
from sklearn.metrics import mean_squared_error

final_model = rnd_search.best_estimator_

y_test = test_set['price'].copy()
X_test = test_set.drop(columns='price')
# Only transform (never fit) the test data with the already fitted pipeline.
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

In [None]:
# 95% confidence interval for the test RMSE.
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
# t-interval around the mean squared error; the square root puts the bounds on the RMSE scale.
mse_interval = stats.t.interval(
    confidence,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_interval)

In [None]:
# Plot the first decision tree of the forest.
# Only the top levels are drawn: the forest is built without a depth limit, so rendering
# a whole tree is very slow and produces an unreadable figure.
from sklearn import tree
plt.figure(figsize=(20, 10))
tree.plot_tree(final_model.estimators_[0], max_depth=3, filled=True, fontsize=10)
plt.show()

In [None]:
# Example: a few predictions on the test set.
# First 10 test observations, before preprocessing.
obs_10 = X_test.head(10)
obs_10

In [None]:
# NOTE(review): this rebinds obs_10 (previously the raw rows) to the first 10 rows of the
# transformed test matrix, which is the form the model expects.
obs_10 = X_test_prepared[:10]
obs_10

In [None]:
# Predictions of the final model for the observations held in obs_10.
predicciones = final_model.predict(obs_10)
predicciones

In [None]:
# Actual prices of the first 10 test observations, as a plain array.
reales = y_test.iloc[:10].values
reales

In [None]:
# Real vs. predicted price for the first 10 test observations.
positions = range(0, 10)
plt.figure(figsize=(12, 5))
for series in (reales, predicciones):
    plt.plot(positions, series)
plt.xticks(positions)
plt.legend(['real', 'predecido'])
plt.show()

In [None]:
# Real vs. predicted price for the first 25 test observations.
realestot = y_test.iloc[:25].values
obs_tot = X_test_prepared[:25]
predtot = final_model.predict(obs_tot)

x_positions = range(0, 25)
plt.figure(figsize=(12, 5))
plt.plot(x_positions, realestot)
plt.plot(x_positions, predtot)
plt.xticks(x_positions)
plt.legend(['real', 'predecido'])
plt.show()

In [None]:
# Coefficient of determination (R^2) of the final model's predictions on the test set.
from sklearn.metrics import r2_score
r2_score(y_test, final_predictions)

In [None]:
# Now try a Support Vector Machine regressor, tuned with an exhaustive grid search.
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

svr_reg = SVR()
# Two grids: a linear kernel (C only) and an RBF kernel (C and gamma).
params = [{'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]}]

# 50 candidates x 5 folds = 250 SVR fits: run them on all available cores.
# The fits are independent, so parallelism does not change the results.
grid_search = GridSearchCV(svr_reg, params, cv = 5,
                            scoring = 'neg_mean_squared_error',
                            verbose=2, n_jobs=-1)
grid_search.fit(audi_prepared,audi_labels)

In [None]:
# Hyperparameter combination that obtained the best cross-validation score in the grid search.
grid_search.best_params_

In [None]:
# SVR built with the best hyperparameters found by the grid search.
grid_search.best_estimator_

In [None]:
# Test-set RMSE of the best SVM found by the grid search.
final_model_svm = grid_search.best_estimator_
# The prepared test matrix is converted to a dense array before predicting.
X_test_dense = X_test_prepared.toarray()
final_prediction_svm = final_model_svm.predict(X_test_dense)
final_mse_svm = mean_squared_error(y_test, final_prediction_svm)
final_rmse_svm = np.sqrt(final_mse_svm)
final_rmse_svm

In [None]:
# Real vs. SVM-predicted price for the first 10 test observations.
plt.figure(figsize = (12,5))
predict = final_model_svm.predict(X_test_prepared.toarray()[:10])
real = y_test.iloc[:10]
plt.plot(range(1,11), real)
plt.plot(range(1,11), predict)
# Ticks and legend make the two lines distinguishable, like the Random Forest plots;
# plt.show() renders the figure instead of echoing the Line2D repr.
plt.xticks(range(1,11))
plt.legend(['real', 'predecido'])
plt.show()

In [None]:
# 95% confidence interval for the test RMSE of the SVM.
from scipy import stats

confidence = 0.95
squared_errors_svm = (final_prediction_svm - y_test) ** 2
# Interval for the mean squared error first, then its square root to express it as an RMSE.
degrees_of_freedom = len(squared_errors_svm) - 1
mse_interval_svm = stats.t.interval(
    confidence,
    degrees_of_freedom,
    loc=squared_errors_svm.mean(),
    scale=stats.sem(squared_errors_svm),
)
np.sqrt(mse_interval_svm)

In [None]:
# Randomized search over the SVR hyperparameters.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# see https://docs.scipy.org/doc/scipy/reference/stats.html
# for `expon()` and `reciprocal()` documentation and more probability distribution functions.

# Note: gamma is ignored when kernel is "linear"
param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }

svm_reg = SVR()
# NOTE(review): this rebinds `rnd_search`, which held the Random Forest search created earlier;
# re-running the Random Forest cells after this one would read the SVR search instead.
# 50 candidates x 5 folds = 250 SVR fits: run them on all available cores.
# The sampled candidates are fixed by random_state, so parallelism does not change the results.
rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                verbose=2, random_state=42, n_jobs=-1)
rnd_search.fit(audi_prepared, audi_labels)

In [None]:
# Cross-validated RMSE of the best candidate: the search was scored with the negated MSE,
# so the sign is flipped before taking the square root.
negative_mse = rnd_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse