# CARGA DE LIBRERÍAS

In [49]:
import pandas as pd
import numpy as np
import seaborn as sns

# Preprocesados

from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Imputadores

from sklearn.impute import SimpleImputer

# Regresion y Clasificacion

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Metricas

from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error, accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

#Pipelines

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib


## CARGA DE DATASET

In [50]:
# Diamonds dataset, read straight from the seaborn-data repository (needs network access).
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/diamonds.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [51]:
# 'price' is the regression target; every other column is used as a feature.
y = df['price']
X = df.drop(columns='price')

In [52]:
# Quick structural check: column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


## Columnas numéricas y categóricas

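No hago train/test split porque voy a entrenar con todos los datos; por eso el R2 que se muestra más abajo se calcula sobre los mismos datos de entrenamiento y no mide la capacidad de generalización del modelo.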

In [53]:
# Split the feature columns by dtype: numeric columns go through the scaling
# branch of the preprocessing, everything else through the encoding branch.
numeric_features = X.select_dtypes(include='number')
non_numeric_features = X.select_dtypes(exclude='number')

numerical_col = list(numeric_features.columns)
categorical_col = list(non_numeric_features.columns)

print('Columnas numericas', numerical_col)
print('Columnas categoricas', categorical_col)

Columnas numericas ['carat', 'depth', 'table', 'x', 'y', 'z']
Columnas categoricas ['cut', 'color', 'clarity']


## CREACIÓN DEL PIPELINE

In [54]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets prediction proceed when a
# category was not seen during fit instead of raising an error.
pipeline_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill missing values with the constant 0, then standardize.
# NOTE(review): the training data shown by df.info() has no nulls, so the
# imputer only matters for incomplete rows at prediction time; confirm that 0
# is a sensible fallback for features such as carat or depth.
pipeline_numerical = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Route each group of columns (selected by name) to its own preprocessing branch.
pipeline_all = ColumnTransformer([
    ('numeric', pipeline_numerical, numerical_col),
    ('categorical', pipeline_categorical, categorical_col),
])

# Full estimator: preprocessing followed by the regressor.
# random_state is fixed so that re-running the notebook yields the same forest,
# the same saved artifact and the same predictions.
pipeline = Pipeline([
    ('pipeline', pipeline_all),
    ('modelo', RandomForestRegressor(random_state=42)),
])

In [55]:
# Fit on the full dataset. The score below is computed on the same rows used
# for fitting, so it is a training score, not an estimate of generalization.
pipeline.fit(X, y)
r2_train = pipeline.score(X, y)
print('R2 en train', r2_train)

# Persist the fitted pipeline (preprocessing + model) for later reuse.
joblib.dump(pipeline, 'pipeline_regresion.joblib')

R2 en train 0.9974295298710973


['pipeline_regresion.joblib']

In [56]:
# Single example diamond used as a smoke test of the fitted pipeline.
# Column names must match the feature columns used during fit.
new_diamond = {
    'carat': 0.23,
    'cut': 'Ideal',
    'color': 'E',
    'clarity': 'SI2',
    'depth': 61.5,
    'table': 55,
    'x': 3.95,
    'y': 3.98,
    'z': 2.43,
}
X_new = pd.DataFrame([new_diamond])

y_pred = pipeline.predict(X_new)

y_pred

array([392.62])