<a href="https://colab.research.google.com/github/skyanalyst/Machine-Learning-Projects/blob/main/house_price_prediction_with_linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install opendatasets --quiet
!pip install scikit-learn --quiet

In [31]:
import opendatasets as od 
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder 

In [32]:
od.download('https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data')

Skipping, found downloaded files in "./house-prices-advanced-regression-techniques" (use force=True to force download)


In [33]:
raw_df =pd.read_csv('/content/house-prices-advanced-regression-techniques/train.csv')


In [35]:
# Create inputs and Targets
input_cols = list(raw_df.columns)[1:-1]
target_col = 'SalePrice'
inputs_df = raw_df[input_cols].copy()
targets = raw_df[target_col].copy()

In [36]:
# Identify Numeric and Categorical column stack
numeric_cols = inputs_df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = inputs_df.select_dtypes('object').columns.tolist()

In [37]:
# MIssing_counts
missing_counts = inputs_df[numeric_cols].isna().sum().sort_values(ascending=False)
missing_counts[missing_counts>0]

LotFrontage    259
GarageYrBlt     81
MasVnrArea       8
dtype: int64

In [38]:
# Impute Numerical Data 
imputer = SimpleImputer(strategy = 'mean').fit(raw_df[numeric_cols])
inputs_df[numeric_cols]= imputer.transform(inputs_df[numeric_cols])

In [41]:
# Scale Numeric Value
inputs_df[numeric_cols].describe().loc[['min', 'max']]

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
# Scaling Numeric_values
scaler = MinMaxScaler().fit(raw_df[numeric_cols])
inputs_df[numeric_cols]= scaler.transform(inputs_df[numeric_cols])

In [45]:
# Encode Categorical_columns
inputs_df[categorical_cols].nunique().sort_values(ascending=False)

Neighborhood     25
Exterior2nd      16
Exterior1st      15
SaleType          9
Condition1        9
Condition2        8
HouseStyle        8
RoofMatl          8
Functional        7
BsmtFinType2      6
Heating           6
RoofStyle         6
SaleCondition     6
BsmtFinType1      6
GarageType        6
Foundation        6
Electrical        5
FireplaceQu       5
HeatingQC         5
GarageQual        5
GarageCond        5
MSZoning          5
LotConfig         5
ExterCond         5
BldgType          5
BsmtExposure      4
MiscFeature       4
Fence             4
LotShape          4
LandContour       4
BsmtCond          4
KitchenQual       4
MasVnrType        4
ExterQual         4
BsmtQual          4
LandSlope         3
GarageFinish      3
PavedDrive        3
PoolQC            3
Utilities         2
CentralAir        2
Street            2
Alley             2
dtype: int64

In [46]:
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(raw_df[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
inputs_df[encoded_cols]= encoder.transform(inputs_df[categorical_cols])

  self[col] = igetitem(value, i)


In [49]:
from scipy.sparse import random
# Training and Validation set
from sklearn.model_selection import train_test_split

train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs_df[numeric_cols + encoded_cols],
                                                                        targets,
                                                                        test_size=0.25,
                                                                        random_state=42)

In [53]:
# Train a Linear Regression Model
from sklearn.linear_model import Ridge
model = Ridge(solver='svd')
model.fit(train_inputs, train_targets)

Ridge(solver='svd')

In [54]:
# Make Prediction and Evaluate your Model
from sklearn.metrics import mean_squared_error

train_preds =model.predict(train_inputs)
train_preds

array([172549.49239604, 176648.3841514 , 104461.18939205, ...,
       121549.23101908, 173504.31921626, 190778.41334452])

In [59]:
train_rmse = mean_squared_error(train_targets, train_preds, squared=False)
train_rmse

21877.850450615537

In [61]:
val_preds = model.predict(val_inputs)
val_rmse = mean_squared_error(val_targets, val_preds, squared=False)
val_rmse

29009.302517872115

In [64]:
weights = model.coef_
weights_df = pd.DataFrame({
    'columns': train_inputs.columns,
    'weight' : weights
}).sort_values('weight', ascending=False)
weights_df

Unnamed: 0,columns,weight
275,PoolQC_Ex,77483.612175
15,GrLivArea,74388.503609
13,2ndFlrSF,62433.996447
3,OverallQual,62055.123269
12,1stFlrSF,60726.614752
...,...,...
266,GarageCond_Ex,-21028.480031
21,KitchenAbvGr,-26474.353422
277,PoolQC_Gd,-72681.458501
102,Condition2_PosN,-84704.714012


In [65]:
# Making Prediction
def predict_input(single_input):
    input_df = pd.DataFrame([single_input])
    input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])
    input_df[encoded_cols] = encoder.transform(input_df[categorical_cols].values)
    X_input = input_df[numeric_cols + encoded_cols]
    return model.predict(X_input)[0]

In [66]:
sample_input = { 'MSSubClass': 20, 'MSZoning': 'RL', 'LotFrontage': 77.0, 'LotArea': 9320,
 'Street': 'Pave', 'Alley': None, 'LotShape': 'IR1', 'LandContour': 'Lvl', 'Utilities': 'AllPub',
 'LotConfig': 'Inside', 'LandSlope': 'Gtl', 'Neighborhood': 'NAmes', 'Condition1': 'Norm', 'Condition2': 'Norm',
 'BldgType': '1Fam', 'HouseStyle': '1Story', 'OverallQual': 4, 'OverallCond': 5, 'YearBuilt': 1959,
 'YearRemodAdd': 1959, 'RoofStyle': 'Gable', 'RoofMatl': 'CompShg', 'Exterior1st': 'Plywood',
 'Exterior2nd': 'Plywood', 'MasVnrType': 'None','MasVnrArea': 0.0,'ExterQual': 'TA','ExterCond': 'TA',
 'Foundation': 'CBlock','BsmtQual': 'TA','BsmtCond': 'TA','BsmtExposure': 'No','BsmtFinType1': 'ALQ',
 'BsmtFinSF1': 569,'BsmtFinType2': 'Unf','BsmtFinSF2': 0,'BsmtUnfSF': 381,
 'TotalBsmtSF': 950,'Heating': 'GasA','HeatingQC': 'Fa','CentralAir': 'Y','Electrical': 'SBrkr', '1stFlrSF': 1225,
 '2ndFlrSF': 0, 'LowQualFinSF': 0, 'GrLivArea': 1225, 'BsmtFullBath': 1, 'BsmtHalfBath': 0, 'FullBath': 1,
 'HalfBath': 1, 'BedroomAbvGr': 3, 'KitchenAbvGr': 1,'KitchenQual': 'TA','TotRmsAbvGrd': 6,'Functional': 'Typ',
 'Fireplaces': 0,'FireplaceQu': np.nan,'GarageType': np.nan,'GarageYrBlt': np.nan,'GarageFinish': np.nan,'GarageCars': 0,
 'GarageArea': 0,'GarageQual': np.nan,'GarageCond': np.nan,'PavedDrive': 'Y', 'WoodDeckSF': 352, 'OpenPorchSF': 0,
 'EnclosedPorch': 0,'3SsnPorch': 0, 'ScreenPorch': 0, 'PoolArea': 0, 'PoolQC': np.nan, 'Fence': np.nan, 'MiscFeature': 'Shed',
 'MiscVal': 400, 'MoSold': 1, 'YrSold': 2010, 'SaleType': 'WD', 'SaleCondition': 'Normal'}

In [67]:
predicted_price = predict_input(sample_input)
predicted_price

  self[col] = igetitem(value, i)


123884.48449778001

In [68]:
import joblib

In [69]:
house_price_predictor = {
    'model': model,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'input_cols': input_cols,
    'target_col': target_col,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols,
    'encoded_cols': encoded_cols
}

In [70]:
joblib.dump(house_price_predictor, 'house_price_predictor.joblib')

['house_price_predictor.joblib']