Model Training


In [72]:
import pandas as pd

In [73]:
# Load the property-listing CSV from the local data directory and preview the first rows.
df = pd.read_csv('./data/test.csv')
df.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0


In [74]:
# Drop the Parking, Locality and Type columns because they are of little use here.
# (Status is kept: it is ordinal-encoded further down.)
df=df.drop(labels=['Parking','Locality','Type'],axis=1)

In [75]:
## Split the frame into the feature matrix (X) and the regression target (Y)
# Features: every column except the target.
X = df.drop(columns=['Price'])
# Target: selected with a list so it stays a one-column DataFrame rather than a Series.
Y = df[['Price']]

In [76]:
# Display the target frame (single 'Price' column).
Y

Unnamed: 0,Price
0,6500000
1,5000000
2,15500000
3,4200000
4,6200000
...,...
1254,55000000
1255,12500000
1256,17500000
1257,11500000


In [77]:
# Define which columns should be ordinal-encoded and which should be scaled.
# Columns with object dtype are treated as categorical; every other column is treated as numerical.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [78]:
# Define the custom ranking for each ordinal variable.
# Each list is ordered from lowest to highest rank; the position in the list is the
# integer code the OrdinalEncoder assigns to that label.
# These lists are passed to the encoder in the order Furnishing, Status, Transaction,
# which must match the column order of categorical_cols.
Furnishing_categories = ['Unfurnished', 'Semi-Furnished', 'Furnished']
Status_categories = ['Almost_ready', 'Ready_to_move']
Transaction_categories = ['Resale','New_Property']

In [79]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

In [80]:
## Numeric features: fill gaps with the column median, then standardize
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

## Categorical features: fill gaps with the most frequent label, map each label
## to its custom rank, then standardize the resulting integer codes
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(
        categories=[Furnishing_categories, Status_categories, Transaction_categories],
    )),
    ('scaler', StandardScaler()),
])

## Route each column group through its own pipeline (numeric group listed first)
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols),
])


In [81]:
## Train test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=30
)

In [82]:
# Fit the preprocessor on the training split only, then apply the same fitted
# transform to the test split so no test-set statistics are used during fitting.
# NOTE(review): both names are overwritten with frames that carry default integer
# column labels, so this cell presumably cannot be re-run on its own without first
# re-running the train/test split cell — confirm.
X_train=pd.DataFrame(preprocessor.fit_transform(X_train))
X_test=pd.DataFrame(preprocessor.transform(X_test))

In [83]:
# Preview the preprocessed training features (columns are now positional integers).
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6
0,-0.604399,-0.835421,-1.51714,-0.426795,-1.300568,0.242681,-0.792758
1,-0.496998,-0.835421,-0.541519,2.64936,0.213039,0.242681,-0.792758
2,1.749709,1.285633,1.409722,4.17143,0.213039,0.242681,-0.792758
3,0.12999,0.225106,0.434101,-0.188501,-1.300568,0.242681,-0.792758
4,0.124185,0.225106,0.434101,-0.188501,0.213039,0.242681,-0.792758


In [84]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [85]:
# Baseline: ordinary least-squares regression fitted on the preprocessed training split.
regression=LinearRegression()
regression.fit(X_train,y_train)

LinearRegression()

In [86]:
# Learned coefficients of the baseline model, one per preprocessed feature column.
regression.coef_

array([[ 4627765.05378046,   968848.03183872, 13659507.76004423,
         5095111.62505713,  -961033.47147693, -1478234.4688135 ,
         1645128.11575182]])

In [87]:
# Fitted intercept of the baseline model.
regression.intercept_

array([21256980.70374575])

In [88]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and the R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it instead of recomputing it.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [89]:
## Train multiple models

# Candidate linear models, all constructed with their default hyperparameters.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]  # fitted estimators, in the same order as model_list
model_list=[]          # model names, in evaluation order
r2_list=[]             # R2 score of each model on the test split

# Iterate over name/estimator pairs directly instead of indexing into
# freshly built key/value lists on every access.
for model_name, model in models.items():
    model.fit(X_train,y_train)
    trained_model_list.append(model)

    # Make predictions on the held-out test split
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE: despite the heading text, the metrics printed below are computed
    # on the test split (y_test vs. predictions for X_test).
    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 17915833.648291383
MAE: 9867362.456601398
R2 score 56.09084472905928


Lasso
Model Training Performance
RMSE: 17915833.632627387
MAE: 9867362.215662863
R2 score 56.09084480583972


Ridge
Model Training Performance
RMSE: 17916676.420736738
MAE: 9865287.08065664
R2 score 56.086713601558635


Elasticnet
Model Training Performance
RMSE: 18607476.914484568
MAE: 9615336.964577595
R2 score 52.63516624645534




In [90]:
# Names of the evaluated models, in the order they were trained.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']