In [1]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score

In [2]:
# Paths of the artifacts persisted with joblib after training.
# The existence of MODEL_FILE is what later decides between training and inference.
MODEL_FILE = 'model.pkl'
# Fitted preprocessing pipeline, saved alongside the model and reloaded for inference.
PIPELINE_FILE = 'pipeline.pkl'

In [3]:
def build_pipeline(num_attribs, cat_attribs):
  """Assemble the (unfitted) preprocessing transformer for the feature frame.

  Args:
    num_attribs: column names routed through median imputation followed by
      standard scaling.
    cat_attribs: column names routed through one-hot encoding; categories not
      seen during fit are ignored at transform time.

  Returns:
    A ColumnTransformer with a 'num' branch and a 'cat' branch.
  """
  numeric_steps = [
      ('impute', SimpleImputer(strategy='median')),
      ('scaler', StandardScaler()),
  ]
  categorical_steps = [
      ('onehot', OneHotEncoder(handle_unknown='ignore')),
  ]

  return ColumnTransformer([
      ('num', Pipeline(numeric_steps), num_attribs),
      ('cat', Pipeline(categorical_steps), cat_attribs),
  ])

In [5]:
# Train-once / infer-afterwards driver.
# Inference needs BOTH persisted artifacts, so training is (re)run whenever
# either the model or the preprocessing pipeline is missing.
if not (os.path.exists(MODEL_FILE) and os.path.exists(PIPELINE_FILE)):
  # TRAINING PHASE
  df = pd.read_csv('housing.csv')

  # Bucket median_income so the train/test split can be stratified on it.
  df['income_cat'] = pd.cut(
      df["median_income"],
      bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
      labels=[1, 2, 3, 4, 5],
  )

  split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
  # n_splits=1, so take the single split directly instead of looping.
  train_index, test_index = next(split.split(df, df['income_cat']))

  # The splitter yields positional indices, hence .iloc rather than .loc.
  df_train = df.iloc[train_index].drop(columns=['income_cat'])
  # The held-out rows become the input file consumed by the inference phase.
  df.iloc[test_index].drop(columns=['income_cat']).to_csv('input.csv', index=False)

  df_label = df_train["median_house_value"]
  df_features = df_train.drop(columns="median_house_value")

  # Classify columns by "is numeric" instead of comparing against 'object':
  # text columns may be stored with a dedicated string dtype, which would
  # otherwise be routed into the numeric (median-impute/scale) branch.
  num_attribs = [
      feature for feature in df_features.columns
      if pd.api.types.is_numeric_dtype(df_features[feature])
  ]
  cat_attribs = [
      feature for feature in df_features.columns
      if not pd.api.types.is_numeric_dtype(df_features[feature])
  ]

  pipeline = build_pipeline(num_attribs, cat_attribs)
  df_prepared = pipeline.fit_transform(df_features)

  model = RandomForestRegressor(random_state=42)
  model.fit(df_prepared, df_label)

  # Save model and pipeline
  joblib.dump(model, MODEL_FILE)
  joblib.dump(pipeline, PIPELINE_FILE)

else:
  # INFERENCE PHASE

  # NOTE: joblib.load unpickles arbitrary objects; only load artifacts that
  # were produced by the training branch above (or another trusted source).
  model = joblib.load(MODEL_FILE)
  pipeline = joblib.load(PIPELINE_FILE)

  input_data = pd.read_csv('input.csv')
  # The target column is optional at inference time: drop it when present,
  # but do not fail on unlabeled input.
  inference_features = input_data.drop(columns='median_house_value', errors='ignore')
  transformed_input = pipeline.transform(inference_features)
  predictions = model.predict(transformed_input)
  # Store the predictions in (or overwrite) the target column.
  input_data['median_house_value'] = predictions

  input_data.to_csv('output.csv', index=False)
  print("Inference complete. Results saved to output.csv")

Inference complete. Results saved to output.csv
