# House Price Prediction with Linear Regression


In [None]:
!pip install numpy pandas matplotlib seaborn plotly opendatasets --quiet

## Exploring the Data



In [None]:
# URL of the zipped dataset (house-prices-advanced-regression-techniques.zip).
dataset_url = 'https://github.com/JovianML/opendatasets/raw/master/data/house-prices-advanced-regression-techniques.zip'

In [None]:
from urllib.request import urlretrieve

In [None]:
# Download the archive to 'house-prices.zip' in the current working directory.
urlretrieve(dataset_url, 'house-prices.zip')

In [None]:
from zipfile import ZipFile

In [None]:
# Unpack every member of the downloaded archive into the 'house-prices' folder.
with ZipFile('house-prices.zip') as archive:
    archive.extractall(path='house-prices')

In [None]:
import os

In [None]:
# Folder that holds the dataset files used below.
data_dir = 'house-prices'

In [None]:
# Show which files are present in the data folder.
os.listdir(data_dir)

In [None]:
import pandas as pd
# Let pandas show up to 200 columns and 200 rows before truncating output.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [None]:
# Build the path with os.path.join instead of string concatenation so the
# separator is always correct for the platform the notebook runs on.
train_csv_path = os.path.join(data_dir, 'train.csv')
train_csv_path

In [None]:
# Read the training CSV into a DataFrame.
prices_df = pd.read_csv(train_csv_path)

In [None]:
prices_df

In [None]:
# Summary of columns, non-null counts and dtypes.
prices_df.info()

In [None]:
n_rows = prices_df.shape[0]
n_rows

In [None]:
n_cols = prices_df.shape[1]
n_cols

In [None]:
print('The dataset contains {} rows and {} columns.'.format(n_rows, n_cols))

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# sns.distplot is deprecated; histplot with a density-normalised histogram and a
# KDE overlay (cut=3 matches distplot's curve extent) draws the equivalent figure.
sns.histplot(prices_df['SalePrice'], kde=True, stat='density', kde_kws=dict(cut=3));


In [None]:
fig = px.scatter(prices_df, x ='YearBuilt', y='SalePrice', title='YearBuilt vs SalePrice' )
fig.update_traces(marker_size=6)
fig.show()

In [None]:
fig = px.scatter(prices_df, x ='LotArea', y='SalePrice', title='LotArea vs SalePrice' )
fig.update_traces(marker_size=6)
fig.show()

In [None]:
# Correlation is only defined for numeric columns. pandas >= 2.0 raises on the
# object-typed columns instead of silently dropping them, so select the numeric
# columns explicitly before calling corr().
fig, ax = plt.subplots(figsize=(30,30))
numeric_corr = prices_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='Reds', annot=True, ax=ax)

ax.set_title('Correlation Matrix')
plt.show()

In [None]:
prices_df.hist(figsize=(30,30), bins=35)

In [None]:
prices_df

In [None]:
# Identify the input columns (a list of column names)
input_cols = prices_df.columns[1:-1].tolist()
input_cols

In [None]:
# Identify the name of the target column (a single string, not a list)
target_col = prices_df.columns[-1]
target_col

In [None]:
print(list(input_cols))

In [None]:
len(input_cols)

In [None]:
print(target_col)

In [None]:
inputs_df = prices_df[input_cols].copy()

In [None]:
targets = prices_df[target_col]

In [None]:
inputs_df

In [None]:
targets

###  Numeric and Categorical Data



In [None]:
prices_df.info()

In [None]:
import numpy as np

In [None]:
numeric_cols = inputs_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [None]:
categorical_cols = inputs_df.select_dtypes(include='object').columns.tolist()

In [None]:
print(list(numeric_cols))

In [None]:
print(list(categorical_cols))

In [None]:
print(len(numeric_cols))
print(len(categorical_cols))

### Impute Numerical Data



In [None]:
# Missing values per numeric column, largest first; display only columns that have any.
missing_counts = (
    inputs_df[numeric_cols]
    .isna()
    .sum()
    .sort_values(ascending=False)
)
missing_counts[missing_counts > 0]

In [None]:
#fig = px.scatter(prices_df, x ='MasVnrArea', y='SalePrice' )
#fig.update_traces(marker_size=6)
#fig.show()

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# 1. Create the imputer (missing values are replaced using the 'mean' strategy)
imputer = SimpleImputer(strategy='mean')

In [None]:
# 2. Fit the imputer to the numeric columns
imputer.fit(inputs_df[numeric_cols])

In [None]:
# The fitted fill value for each numeric column, in numeric_cols order
list(imputer.statistics_)

In [None]:
# 3. Transform and replace the numeric columns (overwrites inputs_df in place)
inputs_df[numeric_cols] = imputer.transform(inputs_df[numeric_cols])

In [None]:
# Recount missing values after imputation
missing_counts = inputs_df[numeric_cols].isna().sum().sort_values(ascending=False)
missing_counts[missing_counts > 0] # should be an empty Series

### Scale Numerical Values



In [None]:
# Range of every numeric column before scaling
inputs_df[numeric_cols].describe().loc[['min', 'max']]

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Create the scaler
scaler = MinMaxScaler()

In [None]:
# Fit the scaler to the numeric columns
scaler.fit(inputs_df[numeric_cols])

In [None]:
# Transform and replace the numeric columns (overwrites inputs_df in place)
inputs_df[numeric_cols] = scaler.transform(inputs_df[numeric_cols])

In [None]:
# Range of every numeric column after scaling
inputs_df[numeric_cols].describe().loc[['min', 'max']]

### Encode Categorical Columns



In [None]:
# Number of distinct values in each categorical column, largest first
inputs_df[categorical_cols].nunique().sort_values(ascending=False)

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# 1. Create the encoder
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
# 2. Fit the encoder to the categorical colums
encoder.fit(inputs_df[categorical_cols])

In [None]:
# 3. Generate column names for each category
encoded_cols = list(encoder.get_feature_names(categorical_cols))
len(encoded_cols)

In [None]:
# 4. Transform and add new one-hot category columns
inputs_df[encoded_cols] = encoder.transform(inputs_df[categorical_cols])

In [None]:
encoder.categories_

In [None]:
inputs_df

### Training and Validation Set



In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs_df[numeric_cols + encoded_cols], 
                                                                        targets, 
                                                                        test_size=0.25, 
                                                                        random_state=42)

In [None]:
train_inputs

In [None]:
train_targets

In [None]:
val_inputs

In [None]:
val_targets

## Train a Linear Regression Model (Ridge)



In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Create the model (Ridge with its default settings)
model = Ridge()

In [None]:
# Fit the model using the training inputs and targets
model.fit(train_inputs, train_targets)

## Predictions 


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
train_preds = model.predict(train_inputs)

In [None]:
train_preds

In [None]:
train_rmse = mean_squared_error(train_targets, train_preds, squared=False)

In [None]:
print('The RMSE loss for the training set is $ {}.'.format(train_rmse))

In [None]:
train_targets.std()

In [None]:
val_preds = model.predict(val_inputs)

In [None]:
val_preds

In [None]:
val_rmse = mean_squared_error(val_targets, val_preds, squared=False)

In [None]:
print('The RMSE loss for the validation set is $ {}.'.format(val_rmse))

In [None]:
val_targets.std()

### Feature Importance



In [None]:
weights = model.coef_

In [None]:
weights_df = pd.DataFrame({
    'columns': train_inputs.columns,
    'weight': weights
}).sort_values('weight', ascending=False)

In [None]:
weights_df

### Making Predictions



In [None]:
def predict_input(single_input):
    """Predict the target value for a single record given as a dict.

    Applies the notebook-level fitted ``imputer``, ``scaler`` and ``encoder`` to
    the record, using the column lists ``numeric_cols``, ``categorical_cols`` and
    ``encoded_cols``, and then calls ``model.predict`` on the prepared row.

    Args:
        single_input: Mapping from column name to value. It must contain every
            name in ``numeric_cols`` and ``categorical_cols``. Missing values may
            be given as either ``None`` or ``np.nan``.

    Returns:
        The first (and only) element of ``model.predict`` for the prepared row.

    Raises:
        KeyError: If a required column is absent from ``single_input``.
    """
    input_df = pd.DataFrame([single_input])
    input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])

    # Work on an object-typed copy so every categorical column has the same dtype,
    # even when a one-row frame inferred float for a NaN-only column.
    categorical_values = input_df[categorical_cols].to_numpy(dtype=object, copy=True)
    # Normalise None to NaN: the encoder treats them as different values, so a
    # None would be ignored as an unknown category instead of being encoded the
    # same way as a NaN.
    categorical_values[pd.isna(categorical_values)] = np.nan
    # Pass a labelled frame so the encoder receives the same column names that
    # are selected via categorical_cols.
    categorical_df = pd.DataFrame(
        categorical_values, columns=categorical_cols, index=input_df.index
    )
    input_df[encoded_cols] = encoder.transform(categorical_df)

    X_input = input_df[numeric_cols + encoded_cols]
    return model.predict(X_input)[0]

In [None]:
# One example record, keyed by column name, used below to try out predict_input.
# Missing values are written as None or np.nan.
sample_input = { 'MSSubClass': 20, 'MSZoning': 'RL', 'LotFrontage': 77.0, 'LotArea': 9320,
 'Street': 'Pave', 'Alley': None, 'LotShape': 'IR1', 'LandContour': 'Lvl', 'Utilities': 'AllPub',
 'LotConfig': 'Inside', 'LandSlope': 'Gtl', 'Neighborhood': 'NAmes', 'Condition1': 'Norm', 'Condition2': 'Norm',
 'BldgType': '1Fam', 'HouseStyle': '1Story', 'OverallQual': 4, 'OverallCond': 5, 'YearBuilt': 1959,
 'YearRemodAdd': 1959, 'RoofStyle': 'Gable', 'RoofMatl': 'CompShg', 'Exterior1st': 'Plywood',
 'Exterior2nd': 'Plywood', 'MasVnrType': 'None','MasVnrArea': 0.0,'ExterQual': 'TA','ExterCond': 'TA',
 'Foundation': 'CBlock','BsmtQual': 'TA','BsmtCond': 'TA','BsmtExposure': 'No','BsmtFinType1': 'ALQ',
 'BsmtFinSF1': 569,'BsmtFinType2': 'Unf','BsmtFinSF2': 0,'BsmtUnfSF': 381,
 'TotalBsmtSF': 950,'Heating': 'GasA','HeatingQC': 'Fa','CentralAir': 'Y','Electrical': 'SBrkr', '1stFlrSF': 1225,
 '2ndFlrSF': 0, 'LowQualFinSF': 0, 'GrLivArea': 1225, 'BsmtFullBath': 1, 'BsmtHalfBath': 0, 'FullBath': 1,
 'HalfBath': 1, 'BedroomAbvGr': 3, 'KitchenAbvGr': 1,'KitchenQual': 'TA','TotRmsAbvGrd': 6,'Functional': 'Typ',
 'Fireplaces': 0,'FireplaceQu': np.nan,'GarageType': np.nan,'GarageYrBlt': np.nan,'GarageFinish': np.nan,'GarageCars': 0,
 'GarageArea': 0,'GarageQual': np.nan,'GarageCond': np.nan,'PavedDrive': 'Y', 'WoodDeckSF': 352, 'OpenPorchSF': 0,
 'EnclosedPorch': 0,'3SsnPorch': 0, 'ScreenPorch': 0, 'PoolArea': 0, 'PoolQC':'Ex' , 'Fence': np.nan, 'MiscFeature': 'Shed',
 'MiscVal': 400, 'MoSold': 1, 'YrSold': 2010, 'SaleType': 'WD', 'SaleCondition': 'Normal'}

In [None]:
prices_df.PoolQC.value_counts()

In [None]:
predicted_price = predict_input(sample_input)

In [None]:
print('The predicted sale price of the house is ${}'.format(predicted_price))

### Saving the model



In [None]:
import joblib

In [None]:
house_price_predictor = {
    'model': model,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'input_cols': input_cols,
    'target_col': target_col,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols,
    'encoded_cols': encoded_cols
}

In [None]:
joblib.dump(house_price_predictor, 'house_price_predictor.joblib')