In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import warnings
# Suppress every warning so cell outputs stay uncluttered.
warnings.filterwarnings(action="ignore")


def _three_decimals(value):
    """Format a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Have pandas render floats through the formatter above.
pd.set_option('display.float_format', _three_decimals)

In [None]:
# Load the dataset; the first CSV column is used as the index.
df = pd.read_csv(
    '../data/2008_small.zip',
    index_col=0,
)
# Preview the first rows.
df.head(n=5)

In [None]:
# Number of rows whose ArrDelay value is missing.
df['ArrDelay'].isnull().sum()

In [None]:
# Keep only the rows that have an ArrDelay value.
has_arr_delay = df['ArrDelay'].notna()
df = df.loc[has_arr_delay]
# Remaining missing values, per column.
df.isnull().sum()

In [None]:
# The target is ArrDelay; the features are every other column.
target = df['ArrDelay']
data = df.drop(columns=['ArrDelay'])

In [None]:
# Missing values per feature column.
data.isnull().sum()

In [None]:
# Select the feature columns that contain at least one missing value.
has_missing = data.isna().any()
nan_data = data.loc[:, has_missing]

# Names of those columns.
nan_data.columns

In [None]:
# Remove every feature column that contains missing values.
data = data.drop(columns=nan_data.columns)

In [None]:
# Re-check the per-column missing-value counts.
data.isnull().sum()

In [None]:
# Show the (rows, columns) dimensions of the feature table.
data.shape

In [None]:
# Target overview: summary statistics, number of missing values, and shape.
(
    target.describe(),
    target.isnull().sum(),
    target.shape,
)

In [None]:
from sklearn.compose import make_column_selector as selector

# Split the feature columns by dtype: object columns are treated as
# categorical, every other dtype as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

categorical_columns = categorical_columns_selector(data)
numerical_columns = numerical_columns_selector(data)

# Optional experiment: remove DepDelay to check its impact.
# del numerical_columns[numerical_columns.index('DepDelay')]

In [None]:
# Display the names of the columns selected as numerical (non-object dtype).
numerical_columns

In [None]:
# Display the names of the columns selected as categorical (object dtype).
categorical_columns

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numerical columns are standardized.
numerical_preprocessor = StandardScaler()
# Categorical columns are one-hot encoded; handle_unknown="ignore" keeps
# transform from raising on categories that were not seen during fit.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [None]:
# Build the transformer and bind each preprocessor to its own set of columns.
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('one-hot-encoder', categorical_preprocessor, categorical_columns),
        ('standard_scaler', numerical_preprocessor, numerical_columns),
    ]
)

# Training error vs testing error

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Chain the column preprocessing with an ordinary linear regression.
model = make_pipeline(
    preprocessor,
    LinearRegression(),
)
# Display the assembled pipeline.
model

In [None]:
# Split the dataset into training and test partitions.

from sklearn.model_selection import train_test_split

# random_state is fixed so the split is reproducible.
(data_train, data_test,
 target_train, target_test) = train_test_split(data, target, random_state=0)

In [None]:
# Fit the whole pipeline (preprocessing + regression) on the training split.

model.fit(X=data_train, y=target_train)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Training metrics: predict on the same rows the model was fitted on.

target_predicted = model.predict(data_train)
score = mean_absolute_error(y_true=target_train, y_pred=target_predicted)
r2 = r2_score(y_true=target_train, y_pred=target_predicted)
print(f"Los errores de entrenamiento son {score:.6f} y {r2:.6f}")

In [None]:
# Residuals: true training values minus the current predictions.
target_train.to_numpy() - target_predicted

In [None]:
# Test metrics: predict on the held-out rows.

target_predicted = model.predict(data_test)
score = mean_absolute_error(y_true=target_test, y_pred=target_predicted)
r2 = r2_score(y_true=target_test, y_pred=target_predicted)
print(f"Los errores de prueba son {score:.6f} y {r2:.6f}")


In [None]:
# Compare the true test-set targets with the test-set predictions.
# target_predicted holds model.predict(data_test) at this point, so it must be
# plotted against target_test (same rows, same length), not the full target.
plt.scatter(range(len(target_test)), target_test, color='blue', label='target')
plt.scatter(range(len(target_predicted)), target_predicted, color='red',
            label='predicted')
plt.title('Target vs predicted')
plt.xlabel('Test sample (position)')
plt.ylabel('ArrDelay')
plt.legend()
plt.show()

## Cross validation

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate

# 10 random 70/30 splits with a fixed seed; each split is scored with the
# negated mean absolute error.
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
cv_results = cross_validate(
    model,
    data,
    target,
    cv=cv,
    scoring="neg_mean_absolute_error",
)

In [None]:
import pandas as pd

# Wrap the cross-validation output in a DataFrame for easier inspection.
cv_results = pd.DataFrame(data=cv_results)
cv_results.head(n=5)

In [None]:
# The scorer returns negated errors; flip the sign to get positive errors.
negated_scores = cv_results["test_score"]
cv_results["test_error"] = -negated_scores

# Inspect the result.
cv_results.head(n=10)

In [None]:
# Number of rows in the cross-validation results.
cv_results.shape[0]

In [None]:
# cv_results has one row per cross-validation split (ShuffleSplit was created
# with n_splits=10), so the histogram of the per-split test error shows its
# distribution and gives an estimate of its variability.

import matplotlib.pyplot as plt

cv_results["test_error"].plot.hist(bins=10, edgecolor="black")
# The error is expressed in the target's units, not in k$.
# NOTE(review): ArrDelay is presumably in minutes - confirm against the dataset docs.
plt.xlabel("Mean absolute error (min)")
_ = plt.title("Test error distribution")

In [None]:
# Average test error over all cross-validation splits.
cv_error_mean = cv_results['test_error'].mean()
print(f"La media del cross-validated testing error es: "
      f"{cv_error_mean:.2f}")

In [None]:
# Spread of the test error across the cross-validation splits.
cv_error_std = cv_results['test_error'].std()
print(f"La desviación estandar del testing error es: "
      f"{cv_error_std:.2f}")

In [None]:
# Plot the distribution of the target variable.

target.plot.hist(edgecolor="black")
# The target is the raw ArrDelay column, not a median.
# NOTE(review): the unit is presumably minutes - confirm against the dataset docs.
plt.xlabel("ArrDelay (min)")
_ = plt.title("Target distribution")

In [None]:
# Standard deviation of the target, for comparison with the error figures.
target_std = target.std()
print(f"La desviación estándar del target es: {target_std:.2f}")

In [None]:
# When only the test score is of interest, scikit-learn provides
# cross_val_score. It is equivalent to calling cross_validate and keeping only
# test_score.
# No cv or scoring argument is passed here, so scikit-learn's defaults apply
# and the values are not directly comparable with the ShuffleSplit / MAE
# results computed earlier.

from sklearn.model_selection import cross_val_score

# The pipeline built in this notebook is named `model`; `regressor` was never
# defined and raised a NameError.
scores = cross_val_score(model, data, target)
scores