In [1]:
!pip -q install google-cloud-storage joblib scikit-learn pandas


In [3]:
import pandas as pd
from google.colab import files
import io

# 1. Upload data
print("Please upload the Housing.csv file.")
uploaded = files.upload()

# Nothing uploaded (e.g. the dialog was dismissed) would otherwise surface as
# an opaque IndexError on the lookup below; fail with a clear message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select Housing.csv.")

# Use the first uploaded entry (expected to be Housing.csv). `uploaded` is
# indexed by file name and holds the raw bytes, so wrap them in an in-memory
# buffer for pandas to parse.
file_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# 2. Look at the first few rows (kept as the last expression so it is displayed)
df.head()

Please upload the Housing.csv file.


Saving Housing.csv to Housing.csv


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
import pandas as pd
import numpy as np # Import numpy for square root calculation

TARGET = "price"

# 1. Pull the label column out and keep every other column as a feature
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Columns holding text / categorical values; these need encoding before fitting
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# One-hot encode exactly those columns (drop_first=True omits each column's
# first level to avoid a redundant indicator)
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# 2. Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Configure the random forest
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestRegressor(**rf_params)

# 4. Fit on the training portion only
model.fit(X_train, y_train)

# 5. Score on the held-out rows: RMSE is the square root of the MSE
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

# 6. Persist the fitted estimator to the working directory
joblib.dump(model, "model.joblib")
print("Model saved as model.joblib")

RMSE: 1398856.7956786193
Model saved as model.joblib


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
import numpy as np
import pandas as pd

TARGET = "price"

# 1. Drop rows with missing values.
# The cleaned frame gets its own name instead of overwriting `df`, so the raw
# upload stays available and cells that already displayed `df` are not
# silently invalidated when this cell runs.
df_clean = df.dropna()

# 2. Separate features and target
X = df_clean.drop(columns=[TARGET])
y = df_clean[TARGET]

# 3. Convert categorical columns to numeric (the model cannot consume strings).
# With no `columns=` argument, get_dummies picks the object/string/category
# columns itself; drop_first=True omits the first level of each.
X = pd.get_dummies(X, drop_first=True)

# 4. Train-test split: 20% held out, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Train model
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# 6. Evaluate on the held-out rows (RMSE = sqrt of mean squared error)
pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
print("RMSE:", rmse)

# 7. Save model (overwrites any existing model.joblib in the working directory)
joblib.dump(model, "model.joblib")
print("Model saved as model.joblib")


RMSE: 1398856.7956786193
Model saved as model.joblib


In [8]:
# Authenticate the current Colab user with Google.
# NOTE(review): presumably this is what lets the later storage.Client() call
# find credentials — confirm. Run this cell before any cell that talks to
# Google Cloud.
from google.colab import auth
auth.authenticate_user()

In [14]:
from google.cloud import storage

# Where the model lands inside the bucket, and which bucket to use.
# The bucket must already exist in your Google Cloud project.
DEST_PATH = "models/model.joblib"
BUCKET_NAME = "housing-ml-project-id-ml-bucket" # <--- PLEASE UPDATE THIS WITH YOUR ACTUAL BUCKET NAME

# Local artifact written by the training cell
local_model_file = "model.joblib"

# Resolve client -> bucket -> blob handles, then push the local file
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(DEST_PATH)
blob.upload_from_filename(local_model_file)

print(f"Uploaded to: gs://{BUCKET_NAME}/{DEST_PATH}")

Uploaded to: gs://housing-ml-project-id-ml-bucket/models/model.joblib


In [15]:
# Smoke test: take the first held-out row (kept as a one-row DataFrame so the
# column layout matches what the model was fitted on) and predict its price.
sample = X_test.head(1)
sample_records = sample.to_dict(orient="records")
print("Sample input:", sample_records)
sample_prediction = model.predict(sample)
print("Prediction:", sample_prediction)


Sample input: [{'area': 5900, 'bedrooms': 4, 'bathrooms': 2, 'stories': 2, 'parking': 1, 'mainroad_yes': False, 'guestroom_yes': False, 'basement_yes': True, 'hotwaterheating_yes': False, 'airconditioning_yes': False, 'prefarea_yes': False, 'furnishingstatus_semi-furnished': False, 'furnishingstatus_unfurnished': True}]
Prediction: [5456764.20138889]
