## Perform all Regression Algorithms

#### Import all Libraries

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

#### Load the dataset

In [3]:
# Read the monthly feature table; the CSV's 'date' column becomes the row index.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that exact location exists — consider a configurable data dir.
x_monthly = pd.read_csv(
    r'C:\House_price\notebook\data\x_monthly_data.csv',
    index_col='date',
)

# Parse the index labels as datetimes, then reset every timestamp to midnight.
parsed_dates = pd.to_datetime(x_monthly.index)
x_monthly.index = parsed_dates.normalize()
x_monthly.head()

Unnamed: 0_level_0,spend,permits,permit_val,starts,completions,manufactured,new_for_sale,months_supply,emratio,pop_level,...,pm_save,consump_durable,new_sold,hp_idx,rent_vacancy,owner_vacancy,week_earning,delinquent_rate,hor,hp_idx_qtr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,353065,1277,9982.312,1268,1262.0,304,310,4.3,64.6,211410,...,358.9,908.6,873,100.551,7.9,1.6,603.0,1.95,67.1,101.338667
2000-02-01,351933,1241,11363.143,1255,1326.0,291,305,4.3,64.6,211576,...,324.3,930.7,856,101.339,7.933333,1.566667,604.0,1.96,67.133333,102.117222
2000-03-01,353452,1253,15373.755,1313,1312.0,287,310,4.3,64.6,211772,...,311.8,923.3,900,102.126,7.966667,1.533333,605.0,1.97,67.166667,102.895778
2000-04-01,356188,1192,14128.141,1275,1307.0,271,299,4.4,64.7,212018,...,347.8,900.6,841,102.922,8.0,1.5,606.0,1.98,67.2,103.674333
2000-05-01,349907,1182,15639.629,1230,1334.0,265,302,4.4,64.4,212242,...,351.1,907.0,857,103.677,8.066667,1.533333,607.666667,2.016667,67.366667,104.378333


In [4]:
# Restrict the sample to rows labelled up to 2019-12-01 (label slice on the
# datetime index), then move the index back into an ordinary 'date' column.
# NOTE(review): the shapes recorded further down add up to 252 rows, while
# 2000-01 through 2019-12 would be 240 monthly rows — confirm the cutoff is
# applied as intended.
x_monthly = x_monthly[:'2019-12-1'].reset_index()

# The regression target is the 'hp_idx' column.
y_monthly = x_monthly['hp_idx']

# Keep only the predictors: remove the date column and both target series
# ('hp_idx' and 'hp_idx_qtr').
x_monthly = x_monthly.drop(columns=['date', 'hp_idx', 'hp_idx_qtr'])

# Preview the resulting feature matrix.
x_monthly.head()

Unnamed: 0,spend,permits,permit_val,starts,completions,manufactured,new_for_sale,months_supply,emratio,pop_level,...,fed_fund_rate,disp_income,pm_save,consump_durable,new_sold,rent_vacancy,owner_vacancy,week_earning,delinquent_rate,hor
0,353065,1277,9982.312,1268,1262.0,304,310,4.3,64.6,211410,...,3.99,9309.1,358.9,908.6,873,7.9,1.6,603.0,1.95,67.1
1,351933,1241,11363.143,1255,1326.0,291,305,4.3,64.6,211576,...,5.79,9345.2,324.3,930.7,856,7.933333,1.566667,604.0,1.96,67.133333
2,353452,1253,15373.755,1313,1312.0,287,310,4.3,64.6,211772,...,5.78,9370.3,311.8,923.3,900,7.966667,1.533333,605.0,1.97,67.166667
3,356188,1192,14128.141,1275,1307.0,271,299,4.4,64.7,212018,...,6.17,9418.3,347.8,900.6,841,8.0,1.5,606.0,1.98,67.2
4,349907,1182,15639.629,1230,1334.0,265,302,4.4,64.4,212242,...,6.17,9457.3,351.1,907.0,857,8.066667,1.533333,607.666667,2.016667,67.366667


#### Splitting the dataset

In [5]:
# Hold out 20% of the rows for testing and train on the other 80%.
# shuffle=False keeps the rows in their existing (chronological) order instead of
# shuffling them before the split; stratify=None requests no stratification.
# NOTE(review): random_state should have no effect while shuffle=False — confirm
# against the scikit-learn documentation before relying on it.
xm_train, xm_test, ym_train, ym_test = train_test_split(
    x_monthly,
    y_monthly,
    test_size=0.20,
    shuffle=False,
    stratify=None,
    random_state=22,
)

#### Shapes of the split data

In [6]:
# Report the dimensions of each split, in the order:
# train features, test features, train target, test target.
for split_part in (xm_train, xm_test, ym_train, ym_test):
    print(split_part.shape)

(201, 22)
(51, 22)
(201,)
(51,)


#### Scaling all input features

In [7]:
## scale data

# The scaler is fitted on the training split only; that same fitted scaler is
# then applied to both the training and the test features.
scaler = StandardScaler().fit(xm_train)

xm_train_sc = scaler.transform(xm_train)
xm_test_sc = scaler.transform(xm_test)

In [8]:
# Shapes of the scaled train and test feature arrays.
for scaled_part in (xm_train_sc, xm_test_sc):
    print(scaled_part.shape)

(201, 22)
(51, 22)


#### Convert the scaled NumPy arrays back to DataFrames

In [9]:
# Wrap the scaled arrays back into DataFrames so the feature names are restored.
# The row index of the unscaled split is passed along too: without it each frame
# would get a fresh 0..n-1 index, and the test rows would no longer carry the
# same labels as ym_test.
xm_train_df = pd.DataFrame(xm_train_sc, columns=xm_train.columns, index=xm_train.index)

xm_test_df = pd.DataFrame(xm_test_sc, columns=xm_test.columns, index=xm_test.index)

#### Evaluating the Model

In [10]:
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Observed target values.
        predicted: Model predictions for the same observations as ``true``.

    Returns:
        Tuple ``(mae, mse, rmse, r2_square)``: mean absolute error, mean
        squared error, root mean squared error and R^2 score.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # RMSE is the square root of the MSE already computed above, so the
    # squared error does not need to be scored a second time.
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

#### Define all algorithms in a dict, then train and evaluate each one

In [11]:
# Candidate regressors, keyed by the display name used in the report below.
# The estimators that use randomness get a fixed random_state (the same seed
# value passed to train_test_split) so that re-running this cell reproduces
# the same scores.
# NOTE(review): CatBoost is left on its own default seed setting — confirm that
# is deterministic enough for this comparison.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=22),
    "Random Forest Regressor": RandomForestRegressor(random_state=22),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=22)
}

# Parallel lists filled in loop order: model names and their test-set R2 scores.
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(xm_train_df, ym_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(xm_train_df)
    y_test_pred = model.predict(xm_test_df)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(ym_train, y_train_pred)

    model_test_mae, model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(ym_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Squared Error: {:.4f}".format(model_train_mse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Squared Error: {:.4f}".format(model_test_mse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 1.3506
- Mean Squared Error: 1.8241
- Mean Absolute Error: 1.0694
- R2 Score: 0.9967
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4.4768
- Mean Squared Error: 20.0415
- Mean Absolute Error: 4.0844
- R2 Score: 0.8581


Lasso
Model performance for Training set
- Root Mean Squared Error: 3.3720
- Mean Squared Error: 11.3707
- Mean Absolute Error: 2.8058
- R2 Score: 0.9795
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 8.1884
- Mean Squared Error: 67.0501
- Mean Absolute Error: 6.3337
- R2 Score: 0.5253


Ridge
Model performance for Training set
- Root Mean Squared Error: 1.5652
- Mean Squared Error: 2.4498
- Mean Absolute Error: 1.2526
- R2 Score: 0.9956
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.0535
- Mean Squared Error: 25.5384
- Mean Absolute Error: 4.6123
- R2 S

K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 1.0836
- Mean Squared Error: 1.1743
- Mean Absolute Error: 0.6988
- R2 Score: 0.9979
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 25.6052
- Mean Squared Error: 655.6267
- Mean Absolute Error: 22.7805
- R2 Score: -3.6420


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 29.1826
- Mean Squared Error: 851.6221
- Mean Absolute Error: 27.7199
- R2 Score: -5.0297


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 0.5694
- Mean Squared Error: 0.3243
- Mean Absolute Error: 0.3734
- R2 Score: 0.9994
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 29.5944
- Mean Squared Error: 875.82

Insights
* Of all the algorithms above, only Linear Regression achieves a high R2 score on the test set.
- Training set R2 Score: 0.9967
- Test set R2 Score: 0.8581
- To improve the test set R2 score, I will remove some unimportant features.

#### Remove less useful features

In [12]:
# Function to drop specified columns from a DataFrame
def drop_cols(cols, df):
    """Remove the given columns from ``df`` in place.

    Args:
        cols: Column label, or list of column labels, to remove.
        df: DataFrame to modify. It is changed directly; nothing is returned.
    """
    df.drop(columns=cols, inplace=True)

In [13]:
# Features to remove; the same list is applied to the scaled train and test
# frames so that both keep identical columns.
# NOTE(review): drop_cols mutates the frames in place, so running this cell a
# second time fails because the columns are already gone.
cols_to_drop = [
    'completions',
    'manufactured',
    'rent_vacancy',
    'permit_val',
    'months_supply',
    'new_sold',
    'mortgage_rate',
]
for scaled_frame in (xm_train_df, xm_test_df):
    drop_cols(cols_to_drop, scaled_frame)

In [14]:
# NOTE(review): a function with this name is already defined in an earlier
# cell of this notebook; this cell re-defines it.
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Observed target values.
        predicted: Model predictions for the same observations as ``true``.

    Returns:
        Tuple ``(mae, mse, rmse, r2_square)``: mean absolute error, mean
        squared error, root mean squared error and R^2 score.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # RMSE is the square root of the MSE already computed above, so the
    # squared error does not need to be scored a second time.
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [15]:
# Fresh, unfitted instances of every candidate regressor, to be trained on the
# feature frames as they stand when this cell runs.
# The estimators that use randomness get a fixed random_state (the same seed
# value passed to train_test_split) so that re-running this cell reproduces
# the same scores.
# NOTE(review): CatBoost is left on its own default seed setting — confirm that
# is deterministic enough for this comparison.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=22),
    "Random Forest Regressor": RandomForestRegressor(random_state=22),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=22)
}

# Parallel lists filled in loop order: model names and their test-set R2 scores.
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(xm_train_df, ym_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(xm_train_df)
    y_test_pred = model.predict(xm_test_df)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(ym_train, y_train_pred)

    model_test_mae, model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(ym_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Squared Error: {:.4f}".format(model_train_mse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Squared Error: {:.4f}".format(model_test_mse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 1.3764
- Mean Squared Error: 1.8945
- Mean Absolute Error: 1.1056
- R2 Score: 0.9966
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4.1116
- Mean Squared Error: 16.9056
- Mean Absolute Error: 3.9787
- R2 Score: 0.8803


Lasso
Model performance for Training set
- Root Mean Squared Error: 3.3413
- Mean Squared Error: 11.1643
- Mean Absolute Error: 2.8005
- R2 Score: 0.9799
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 7.4870
- Mean Squared Error: 56.0548
- Mean Absolute Error: 5.5721
- R2 Score: 0.6031


Ridge
Model performance for Training set
- Root Mean Squared Error: 1.6040
- Mean Squared Error: 2.5728
- Mean Absolute Error: 1.2728
- R2 Score: 0.9954
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.2111
- Mean Squared Error: 27.1558
- Mean Absolute Error: 4.8330
- R2 S

K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.9117
- Mean Squared Error: 0.8312
- Mean Absolute Error: 0.5930
- R2 Score: 0.9985
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 25.5554
- Mean Squared Error: 653.0780
- Mean Absolute Error: 22.6239
- R2 Score: -3.6239


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 29.0161
- Mean Squared Error: 841.9315
- Mean Absolute Error: 27.5764
- R2 Score: -4.9610


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 0.5607
- Mean Squared Error: 0.3144
- Mean Absolute Error: 0.3737
- R2 Score: 0.9994
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 29.7282
- Mean Squared Error: 883.76

Insights
- Before removing any features, there was a gap of about 14 percentage points between the Linear Regression training and test R2 scores.
- After removing some unwanted or low-value features, the test set R2 score increased by about 2 percentage points.
- Training set R2 Score: 0.9966
- Test set R2 Score: 0.8803