In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
import pandas as pd
import numpy as np
import joblib

In [None]:
def format_two_decimals(value):
    """Render a number as a string with exactly two digits after the decimal point."""
    return '%.2f' % value


# Show floats in DataFrame/Series output with two decimals instead of pandas' default repr.
pd.set_option('display.float_format', format_two_decimals)

In [None]:
# Short column names used throughout the rest of the notebook.
column_renames = {
    'Avg. Area Income': 'Area Income',
    'Avg. Area House Age': 'House Age',
    'Avg. Area Number of Rooms': 'Number of Rooms',
    'Avg. Area Number of Bedrooms': 'Number of Bedrooms',
}

# Load the raw data, discard the 'Address' column, and apply the shorter column names.
housing = (
    pd.read_csv('housing.csv')
    .drop('Address', axis=1)
    .rename(columns=column_renames)
)

In [None]:
# Cleaning: cast the numeric feature columns to integer dtype
# (any fractional part is dropped by the cast).
for column in ('Area Income', 'House Age', 'Number of Rooms',
               'Number of Bedrooms', 'Area Population'):
    housing[column] = housing[column].astype('int')

# Feature engineering: bucket selected columns into three quantile-based bins.
# Maps new column name -> (source column, labels ordered from lowest to highest bin).
tercile_features = {
    'Area Income Category': ('Area Income', ['Low', 'Medium', 'High']),
    'Area Population Category': ('Area Population', ['Low', 'Medium', 'High']),
    'House Size': ('Number of Rooms', ['Small', 'Medium', 'Large']),
}
for new_column, (source_column, bin_labels) in tercile_features.items():
    housing[new_column] = pd.qcut(housing[source_column], 3, labels=bin_labels)


In [None]:
# Fit a simple regression of Price on Area Income.
# Divide the data set into an 80% training set and a 20% test set.
# A fixed random_state makes the shuffle (and therefore every metric computed
# below) reproducible across "Restart Kernel -> Run All".
RANDOM_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    housing['Area Income'],
    housing['Price'],
    test_size=0.2,
    random_state=RANDOM_SEED,
)
# x_* hold the Area Income values (the single feature);
# y_* hold the corresponding Price values (the target).

In [None]:
# Initialize the model.
model = LinearRegression()

# scikit-learn estimators expect a 2-D feature matrix of shape (n_samples, n_features).
# x_train is a 1-D pandas Series, so convert it to a NumPy array and reshape it
# into a single column with reshape(-1, 1). The second argument is the training prices.
train_features = np.array(x_train).reshape(-1, 1)
model.fit(train_features, y_train)

# Check the accuracy of the model and its error range

In [None]:
from sklearn.metrics import r2_score

# Make predictions on the TEST set (data the model has not seen during fitting).
# The Series is reshaped into a single-column 2-D array, matching the shape used in fit().
y_pred = model.predict(np.array(x_test).reshape(-1, 1))

# Compare predictions against the actual prices using the coefficient of determination.
score = r2_score(y_test, y_pred)

# The label previously contained a mis-decoded superscript two ("RÂ²"); emit the intended "R²".
print(f"Model Accuracy (R²): {score:.2f}")

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error: the average absolute difference between the
# predicted and the actual prices on the test set.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

# Example output: "Average Error: $45,000.00"
# (i.e. predictions are off by about $45k on average)
print("Average Error: ${:,.2f}".format(mae))

In [None]:
# Predict the price for the first test sample.
# The model was trained on Area Income, so the input must come from x_test
# (the feature), not from y_test (the price being predicted).
sample_income = x_test.iloc[0]
preds = model.predict([[sample_income]])  # 2-D input: one sample, one feature
print(preds)

# Export the trained model with joblib so it can be reused later

In [None]:
# Persist the fitted model to disk so it can be loaded later without retraining.
model_path = 'housing_model.pkl'
joblib.dump(model, model_path)
print(f"Model trained and saved as {model_path}")