In [17]:
import pandas as pd

In [18]:
import numpy as np

In [19]:
from sklearn.model_selection import StratifiedShuffleSplit

In [20]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [21]:
from sklearn.ensemble import RandomForestRegressor

In [22]:
from sklearn.impute import SimpleImputer

In [23]:
from sklearn.pipeline import Pipeline

In [24]:
from sklearn.compose import ColumnTransformer

In [25]:
import os

In [26]:
import joblib

In [27]:
# Path of the joblib file the trained model is dumped to and loaded from.
model_file = "model.pkl"

In [28]:
# Path of the joblib file the fitted preprocessing pipeline is dumped to and loaded from.
pipeline_file = "pipeline.pkl"

In [29]:
def build_pipeline(num_attr, cat_attr):
    """Assemble the (unfitted) preprocessing ColumnTransformer.

    Parameters
    ----------
    num_attr
        Columns routed to the numeric branch: median imputation followed
        by standard scaling.
    cat_attr
        Columns routed to the categorical branch: one-hot encoding with
        ``handle_unknown='ignore'``.

    Returns
    -------
    ColumnTransformer
        Transformer combining both branches under the names ``"num"``
        and ``"cat"``.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("standard", StandardScaler()),
    ]
    categorical_steps = [
        ("OneHotEnc", OneHotEncoder(handle_unknown='ignore')),
    ]
    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), num_attr),
            ("cat", Pipeline(steps=categorical_steps), cat_attr),
        ]
    )

In [52]:
# Train-or-infer switch.
# With no saved artifacts on disk this cell trains a RandomForestRegressor on a
# stratified 80% sample of the housing data and persists the model together
# with its fitted preprocessing pipeline. Otherwise it loads both artifacts,
# scores input.csv and writes the predictions to output.csv.
#
# Inference needs BOTH artifacts, so retrain when either one is missing
# (checking only model_file would crash on load if pipeline_file were absent).
if not (os.path.exists(model_file) and os.path.exists(pipeline_file)):
    # NOTE(review): absolute, machine-specific path — consider a configurable
    # data directory so the notebook runs on other machines.
    df = pd.read_csv(r"C:\Users\sarth\Downloads\AI\housing.csv")

    # Bucket median_income into five categories; used only to stratify the split.
    df['income_cat'] = pd.cut(
        df['median_income'],
        bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
        labels=[1, 2, 3, 4, 5]
    )

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # n_splits=1 produces a single (train, test) pair; the test indices are
    # not used in this cell.
    train_index, _ = next(split.split(df, df['income_cat']))
    # split() yields positional indices, so select rows by position (iloc)
    # rather than by label.
    housing = df.iloc[train_index].drop("income_cat", axis=1)

    # Separate the target column from the features.
    housing_label = housing['median_house_value'].copy()
    housing_features = housing.drop("median_house_value", axis=1)

    # ocean_proximity is the only categorical column; every other feature
    # column is treated as numeric.
    cat_attr = ['ocean_proximity']
    num_attr = housing_features.drop("ocean_proximity", axis=1).columns.tolist()

    pipeline = build_pipeline(num_attr, cat_attr)
    housing_prepared = pipeline.fit_transform(housing_features)

    model = RandomForestRegressor(random_state=42)
    model.fit(housing_prepared, housing_label)

    # Persist the model and the fitted pipeline as a pair.
    joblib.dump(model, model_file)
    joblib.dump(pipeline, pipeline_file)
    print("Model Trained and Saved")
else:
    # NOTE: joblib.load unpickles arbitrary objects; only load artifact files
    # produced by this notebook or another trusted source.
    model = joblib.load(model_file)
    pipeline = joblib.load(pipeline_file)

    # Named input_df (not `input`) so the builtin input() is not shadowed for
    # the rest of the kernel session.
    input_df = pd.read_csv('input.csv')
    transformed_input = pipeline.transform(input_df)
    predictions = model.predict(transformed_input)

    # Append the predictions as a new column and write the result out.
    input_df["median_house_value"] = predictions
    input_df.to_csv("output.csv", index=False)
    print("Inference complete. Results saved to output.csv")

Inference complete. Results saved to output.csv
