# Workshop 3
## Analyzing the Housing data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the housing dataset; the path is relative to the notebook's working directory.
housing_csv = 'Housing.csv'
df = pd.read_csv(housing_csv)

# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
# Discard every row that has at least one missing value (the dropna defaults, spelled out).
df = df.dropna(axis=0, how='any')

In [4]:
from sklearn.preprocessing import LabelEncoder

# Text-valued columns that must become integers before fitting a linear model.
columns_to_encode = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

# Keep one fitted encoder per column. Re-fitting a single shared encoder would
# overwrite its learned classes on every iteration, leaving only the last
# column's mapping available. With a dict, each column's category -> integer
# mapping stays inspectable (label_encoders[col].classes_), invertible
# (inverse_transform) and reusable on new raw data.
label_encoders = {}
for column in columns_to_encode:
    # LabelEncoder numbers the sorted unique values 0..n-1 (e.g. no -> 0, yes -> 1).
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on the last column,
# which matches the state the single shared encoder used to end up in.

In [5]:
# Re-check the first five rows now that the categorical columns hold integer codes.
df.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0


In [21]:
# Prepare the data for a Lasso model that predicts price.
from sklearn.linear_model import Lasso
from sklearn.model_selection import RepeatedKFold, cross_val_score, train_test_split

# Features are every column except the target; the target is the price column.
X = df.drop('price', axis=1)
y = df['price']

# Hold out 30% of the rows as a test set. random_state pins the shuffle:
# without it the split differs on every run, so the reported metrics, the
# fitted coefficients and the saved model would not be reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [22]:
# Splitter for cross-validation: 10 folds, repeated 3 times, with a fixed seed.
cv = RepeatedKFold(n_repeats=3, n_splits=10, random_state=1)

# Build the Lasso regressor (L1 penalty strength alpha=1) and fit it on the
# training split; fit() returns the estimator itself, so the name is bound to
# the fitted model.
lasso_model = Lasso(alpha=1).fit(X_train, y_train)
lasso_model

In [23]:
# Cross-validated error of the Lasso configuration on the full dataset.
# scikit-learn reports MAE as a negated score (higher is better), so take the
# absolute value to get the error magnitude back.
neg_mae = cross_val_score(
    lasso_model,
    X,
    y,
    cv=cv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
scores = np.abs(neg_mae)

# Report the mean and the spread across all folds and repeats.
mae_mean, mae_std = scores.mean(), scores.std()
print('Mean MAE: %.3f (%.3f)' % (mae_mean, mae_std))


Mean MAE: 798953.515 (94229.453)


In [24]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict prices for the held-out test rows.
y_pred = lasso_model.predict(X_test)

# Test-set fit quality, followed by the fitted parameters (.intercept_ and
# .coef_) so they can be inspected. What can you say about them?
report = (
    ("Mean squared error : ", mean_squared_error(y_test, y_pred)),
    ("R2 score : ", r2_score(y_test, y_pred)),
    ("Intercept : ", lasso_model.intercept_),
    ("Coefficients : ", lasso_model.coef_),
)
for label, value in report:
    print(label, value)

Mean squared error :  1356253300445.087
R2 score :  0.6792966438087151
Intercept :  255842.44306381233
Coefficients :  [ 2.47758224e+02  6.54662800e+04  9.11170860e+05  4.83887978e+05
  4.57472700e+05  4.20292259e+05  4.87342908e+05  1.12663983e+06
  8.74189741e+05  2.37743557e+05  4.83374082e+05 -2.11672208e+05]


In [25]:
import pickle

# Serialize the fitted Lasso model to disk so it can be loaded later without retraining.
with open("Lasso_model.pkl", mode="wb") as model_file:
    pickle.dump(lasso_model, model_file)

# Reached only if the dump above completed without raising.
print("Model trained and saved successfully.")

Model trained and saved successfully.
