## CASO 1: AUTOS

In [1]:
# Importar bibliotecas necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

### Cargar los datos

In [5]:
# Load the raw dataset from the CSV file in the working directory.
auto_data = pd.read_csv("auto.csv", sep=',')

# Data exploration.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare; wrapping it in print() emits a stray "None" line.
auto_data.info()
print(auto_data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cylinders     392 non-null    int64  
 1   displacement  392 non-null    float64
 2   horsepower    392 non-null    float64
 3   weight        392 non-null    float64
 4   acceleration  392 non-null    float64
 5   model_year    392 non-null    int64  
 6   origin        392 non-null    int64  
 7   mpg           392 non-null    float64
dtypes: float64(5), int64(3)
memory usage: 24.6 KB
None
        cylinders  displacement  horsepower       weight  acceleration  \
count  392.000000    392.000000  392.000000   392.000000    392.000000   
mean     5.471939    194.411990  104.469388  2977.584184     15.541327   
std      1.705783    104.644004   38.491160   849.402560      2.758864   
min      3.000000     68.000000   46.000000  1613.000000      8.000000   
25%      4.000000    105.000000   75.000

In [3]:
# Preprocessing of the auto_data input.
# Unit conversion from mpg to l/100 km. 235.214 is the factor for US gallons:
# 100 * 3.785411784 (litres per gallon) / 1.609344 (km per mile).
MPG_TO_L_PER_100KM = 235.214
auto_data['consumo_l_100km'] = MPG_TO_L_PER_100KM / auto_data['mpg']

# Split the features (X) from the target variable (y).
# The target is mpg. The converted consumption column is a deterministic
# function of mpg, so it is dropped from X as well to avoid leaking the
# target into the features.
X = auto_data.drop(columns=['mpg', 'consumo_l_100km'])
y = auto_data['mpg']


# Identify categorical and numeric variables.
cat_features = ['origin']
# Preserve the DataFrame's column order. Building this list from a set
# difference yields an arbitrary order that can change between kernel
# sessions (string hashing is randomised), which changes the feature order
# fed to the models and makes results non-reproducible despite random_state.
num_features = [col for col in X.columns if col not in cat_features]


def make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    Numeric columns (``num_features``) are standardised with StandardScaler;
    categorical columns (``cat_features``) are one-hot encoded. Categories
    not seen during fit are encoded as all zeros instead of raising.
    """
    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ]
    )


# Preprocessing step used by the linear model's pipeline.
preprocessor = make_preprocessor()


# Split the data into training and test sets (80% / 20%).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build one pipeline per model.
# Each pipeline gets its own transformer instance: Pipeline keeps the step
# objects it is given without copying them, so sharing a single instance
# would make fitting one pipeline refit the transformer inside the other.
linear_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

rf_model = Pipeline(steps=[
    ('preprocessor', make_preprocessor()),
    ('regressor', RandomForestRegressor(random_state=42))
])


In [4]:
# Fit every pipeline on the training split, then report R2 and RMSE
# computed on the held-out test split.
models = {'Linear Regression': linear_model, 'Random Forest': rf_model}

for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute both metrics first so the report lines stay short.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"{name}:")
    print(f"R2 Score: {r2}")
    print(f"RMSE: {rmse}\n")

Linear Regression:
R2 Score: 0.7922774714022582
RMSE: 3.2561140968473996

Random Forest:
R2 Score: 0.8876268222741278
RMSE: 2.3949097126291776

