# Model creation and training

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import joblib

In [2]:
data=pd.read_csv('archive/train_norm.csv',index_col=0)
#data=pd.read_csv('archive/train_cleaned.csv',index_col=0)

In [3]:
data.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,0.0,0,0,0.052632,0.006533,1,1,0.495849,0.671148,55.0
1,0.5,0,0,0.052632,0.006406,1,1,0.489136,0.667939,51.0
2,0.0,0,0,0.052632,0.004685,1,1,0.493996,0.671263,43.0
3,0.0,0,1,0.052632,0.004668,1,1,0.647126,0.670295,62.5
4,0.5,1,0,0.052632,0.005016,0,1,0.588728,0.707792,60.5


In [4]:
y=data['TARGET(PRICE_IN_LACS)']
X=data.drop(['TARGET(PRICE_IN_LACS)'],axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
score_dict={}
def score_dict_add(score_dict,model,mse,mae,r2,y_pred):
    score_dict[model]={"R2 Score":r2,
                       "Mean Squared Error": mse,
                       "Mean Absolute Error": mae,
                       "y_pred":y_pred}
    return score_dict

## Random Forest Regressor

In [None]:
# Creating the Random Forest regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Training the model
rf_regressor.fit(X_train, y_train)

# Making predictions on the test set
y_pred = rf_regressor.predict(X_test)

# Evaluating the model's performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

score_dict=score_dict_add(score_dict,"Random Forest Regression",mse,mae,r2,y_pred)

## XGB Regressor

In [None]:
# Creating the XGBoost regressor
xgb_regressor = xgb.XGBRegressor(n_estimators=100, random_state=42)

# Training the model
xgb_regressor.fit(X_train, y_train)

# Making predictions on the test set
y_pred = xgb_regressor.predict(X_test)

# Evaluating the model's performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

score_dict=score_dict_add(score_dict,"XGBoost Regression",mse,mae,r2,y_pred)


## Support Vector Regression

In [None]:
# Creating the SVR model
svr_regressor = SVR()

# Training the model
svr_regressor.fit(X_train, y_train)

# Making predictions on the test set
y_pred = svr_regressor.predict(X_test)

# Evaluating the model's performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

score_dict=score_dict_add(score_dict,"Support Vector Regression",mse,mae,r2,y_pred)

## Linear Regression

In [None]:
# Creating the Linear Regression model
linear_regressor = LinearRegression()

# Training the model
linear_regressor.fit(X_train, y_train)

# Making predictions on the test set
y_pred = linear_regressor.predict(X_test)

# Evaluating the model's performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

score_dict=score_dict_add(score_dict,"Linear Regression",mse,mae,r2,y_pred)

## Ridge Regression

In [None]:
# Creating the Ridge Regression model
ridge_regressor = Ridge(alpha=0.5)

# Training the model
ridge_regressor.fit(X_train, y_train)

# Making predictions on the test set
y_pred = ridge_regressor.predict(X_test)

# Evaluating the model's performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

score_dict=score_dict_add(score_dict,"Ridge Regression",mse,mae,r2,y_pred)

## KNN regressor

In [None]:
# Creating the KNN regressor
k = 5  # Number of neighbors to consider
knn_regressor = KNeighborsRegressor(n_neighbors=k)

# Training the model
knn_regressor.fit(X_train, y_train)

# Making predictions on the test set
y_pred = knn_regressor.predict(X_test)

# Evaluating the model's performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

score_dict=score_dict_add(score_dict,"KNN Regression",mse,mae,r2,y_pred)

# Auto ML - Hyperparameter tuning

## Random Forest Regressor

In [5]:
# Create the RandomForestRegressor model
rf_regressor = RandomForestRegressor(random_state=42)

# Define the hyperparameters and their ranges to tune
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at each split
    'max_depth': [None, 5, 10, 20],  # Maximum depth of the tree
}

# Perform grid search using cross-validation
grid_search = GridSearchCV(rf_regressor, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)  # X and y are your training data and labels

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': [None, 5, 10, 20],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [50, 100, 150]},
             scoring='neg_mean_squared_error')

In [7]:
# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Train the model with the best hyperparameters
best_model.fit(X_train, y_train)  # X and y are your training data and labels

# Save the best model to a file
joblib.dump(best_model, 'models/RFR.pkl')

['models/RFR.pkl']

## MLP Regressor

In [None]:
# Create the MLPRegressor model
model = MLPRegressor(random_state=42)

param_grid = {
    'hidden_layer_sizes': [(np.random.randint(64, 100),), 
                           (np.random.randint(64, 100),np.random.randint(64, 100)), 
                           (np.random.randint(64, 100),np.random.randint(64, 100),np.random.randint(64, 100))],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'lbfgs'],
    'learning_rate':['constant', 'adaptive'],
    'max_iter': [10000],
    'batch_size': [512]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X, y)

In [None]:
# Get the best hyperparameters and model
best_params_nn = grid_search.best_params_
best_model_nn = grid_search.best_estimator_

# Train the model with the best hyperparameters
best_model_nn.fit(X_train, y_train)  # X and y are your training data and labels

# Save the best model to a file
joblib.dump(best_model_nn, 'models/NN.pkl')

## Make Prediction

In [None]:
# Make predictions on the test data
RFR = joblib.load('models/RFR.pkl')
y_pred = RFR.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

score_dict=score_dict_add(score_dict,"RFR_hyperparam",mse,mae,r2,y_pred)

In [None]:
# Make predictions on the test data
NN = joblib.load('models/NN.pkl')
y_pred = NN.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

score_dict=score_dict_add(score_dict,"MLPRegressor",mse,mae,r2,y_pred)

# Results

In [None]:
score_pd=pd.DataFrame(score_dict).transpose().sort_values('Mean Absolute Error')
score_pd.iloc[:,0:3]
score_pd.to_csv('results.csv')

In [None]:
import seaborn as sns 
from matplotlib import pyplot as plt

fig,ax=plt.subplots(3,1,figsize=(18,12))
sns.lineplot(data=score_pd.iloc[:,[0]],markers=True,ax=ax[0])
sns.lineplot(data=score_pd.iloc[:,[2]],markers=True,ax=ax[2])
sns.lineplot(data=score_pd.iloc[:,[1]],markers=True,ax=ax[1])
plt.tight_layout()
plt.show()

In [None]:
y_pred=score_dict['Random Forest Regression']['y_pred']

# Create the figure and axes objects
fig, ax = plt.subplots(figsize=(24, 10))

# Plot the original prices and predictions
sns.lineplot(y=y_test.values.ravel(), x=y_test.index, legend=False, ax=ax)
sns.lineplot(y=y_pred, x=y_test.index, legend=False, ax=ax)

# Set the title and legend
ax.set_title('Original Price vs Prediction')
ax.legend(['Original', 'Prediction'])

# Display the plot
plt.tight_layout()
plt.show()


In [None]:
"""model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1)
])

# Compiling the model
model.compile(optimizer='adam', loss='mean_squared_error')"""

In [None]:
"""# Training the model
model.fit(X_train, y_train, epochs=100, batch_size=32)"""

In [None]:
"""# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model's performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Neural Network Regression:")
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)"""