In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import math
import plotly.express as px

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# All eight city files sit in the same Kaggle input folder and follow the
# "<City>_rent.csv" naming scheme, so each path is built from its city name
# instead of repeating the full literal eight times.
RENT_DATA_DIR = '../input/house-rent-prices-of-metropolitan-cities-in-india/'
RENT_CITY_FILES = ('Ahmedabad', 'Bangalore', 'Chennai', 'Delhi',
                   'Hyderabad', 'Kolkata', 'Mumbai', 'Pune')

# df1..df8 keep their original names because later cells refer to them.
df1, df2, df3, df4, df5, df6, df7, df8 = (
    pd.read_csv(RENT_DATA_DIR + city_name + '_rent.csv')
    for city_name in RENT_CITY_FILES
)

In [None]:
# City labels, listed in the same order as the frames df1..df8.
cities = ['Ahmedabad', 'Bangalore', 'Chennai', 'Delhi',
          'Hyderabad', 'Kolkata', 'Mumbai', 'Pune']
dfs = [df1, df2, df3, df4, df5, df6, df7, df8]

# Stamp every frame (in place) with its city and report its dimensions.
for city, df in zip(cities, dfs):
    df['city'] = city
    print(f"{city} {df.shape}")

In [None]:
# Stack the per-city frames into one. Every frame was read with its own
# default 0..n-1 index, so rebuild a single unique index rather than carrying
# repeated row labels into the merged frame.
merged = pd.concat(dfs, ignore_index=True)
merged.head()

In [None]:
# Summary of the merged frame: column dtypes and non-null counts.
merged.info()

In [None]:
# Treat empty strings as missing values, then keep only complete rows.
merged = merged.replace('', np.nan).dropna()
merged.head()

In [None]:
# Frequency of each distinct value in the bathroom column.
merged.bathroom.value_counts()

In [None]:
# Keep only the rows whose bathroom text contains 'bathrooms'.
has_bathroom_label = merged['bathroom'].str.contains('bathrooms')
merged = merged[has_bathroom_label]

In [None]:
# Record how each price was written: '0' when the text contains a comma,
# '1' otherwise.
merged['check'] = ['0' if ',' in price_text else '1'
                   for price_text in merged['price']]
merged.head(3)

In [None]:
# Remove every character that is not a digit or a dot from both text columns.
for text_col in ('price', 'bathroom'):
    merged[text_col] = merged[text_col].str.replace(r'[^\d.]+', '', regex=True)
merged.head()

In [None]:
# Empty strings count as missing; drop every row that has a missing value.
merged = merged.replace('', np.nan).dropna()
merged.shape

In [None]:
# Parse both columns as floats; a value that cannot be converted raises.
merged = merged.astype({'bathroom': 'float64', 'price': 'float64'}, errors='raise')

In [None]:
# Rows flagged '1' (price text had no comma) are multiplied by 100000
# -- presumably those prices were quoted in lakhs; confirm against the raw data.
no_comma_rows = merged['check'].str.contains('1')
merged.loc[no_comma_rows, 'price'] = merged.loc[no_comma_rows, 'price'] * 100000.0

# The helper flag is no longer needed.
merged = merged.drop(columns='check')
merged.head()

In [None]:
# Summary statistics of the numeric columns.
merged.describe()

In [None]:
# Frequency of each bathroom count.
merged.bathroom.value_counts()

In [None]:
# Repeat the missing-value sweep: blank strings become NaN, incomplete rows go.
merged = merged.replace('', np.nan).dropna()
merged.shape

In [None]:
# Keep listings whose area is strictly greater than 200.
merged = merged.loc[merged['area'].gt(200)]

In [None]:
# blr = merged[merged['city'] == "Bangalore"]

In [None]:
# Removing the outliers using Interquartile Range for all columns

def removeOutliers(data, col):
    """Return the rows of ``data`` whose ``col`` value is not an IQR outlier.

    A value is kept when it lies strictly between ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles of
    ``data[col]``. The IQR is printed for reference.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column to screen.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that passed the filter.

    Notes
    -----
    For callers that relied on the earlier behaviour, the result is also
    published through the module-level globals ``filtered_data`` (the filtered
    frame) and ``outlier_free_list`` (the kept values of ``col``).
    """
    global outlier_free_list
    global filtered_data

    Q1, Q3 = np.quantile(data[col], [0.25, 0.75])
    IQR = Q3 - Q1
    print("IQR value for column %s is: %s" % (col, IQR))

    lower_range = Q1 - 1.5 * IQR
    upper_range = Q3 + 1.5 * IQR

    # A vectorised mask selects the same rows as building a Python list of
    # in-range values and testing membership, without the per-element loop.
    in_range = (data[col] > lower_range) & (data[col] < upper_range)
    filtered_data = data.loc[in_range]
    outlier_free_list = filtered_data[col].tolist()
    return filtered_data

# Columns screened for outliers (iterating a DataFrame yields its column names).
out_columns = merged[['price', 'area']]
for i in out_columns:
    removeOutliers(merged, i)
    # removeOutliers publishes its result in the global `filtered_data`.
    # Carry it forward on every pass so the filters accumulate; otherwise each
    # pass would start again from the unfiltered frame and only the last
    # column's filter would survive.
    merged = filtered_data

print("Shape of data after outlier removal is: ", merged.shape)

In [None]:
# Summary statistics of the numeric columns.
merged.describe()

In [None]:
# Independent copy of the cleaned frame; `merged` itself is left untouched.
merged1=merged.copy()

In [None]:
# merged1=merged1[merged1['price']<80000]

In [None]:
# Price distribution per city.
figure, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=merged1, x='city', y='price', ax=ax)

In [None]:
# Price distribution per seller type.
figure, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(data=merged1, x='seller_type', y='price', ax=ax)

In [None]:
# Price distribution per furnishing type.
figure, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(data=merged1, x='furnish_type', y='price', ax=ax)

In [None]:
# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty figure behind.
# The size is set through height/aspect instead (10 in tall, 2x as wide = 20x10).
sns.catplot(x="bedroom", y="price", data=merged1, height=10, aspect=2)

In [None]:
# Area against price, coloured by city, on a 20x10 inch canvas.
fig, ax = plt.subplots(figsize=(20, 10))
sns.scatterplot(data=merged1, x="area", y="price", hue="city", marker="P", ax=ax)
plt.show()

In [None]:
# Independent copy of merged1; changes made to `rent` do not affect it.
rent=merged1.copy()

In [None]:
# Correlation is only defined for numeric data. The frame still holds text
# columns here, and recent pandas versions raise on them instead of silently
# skipping them, so select the numeric (and boolean) columns explicitly.
numeric_rent = rent.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_rent.corr(), annot=True, cmap="RdBu")
plt.show()

In [None]:
# Remove the 'locality' column from the frame.
rent = rent.drop(columns='locality')

In [None]:
# Preview the first rows of the frame.
rent.head()

In [None]:
# One-hot encode the categorical columns; drop_first removes the first level
# of each so the remaining dummies are measured against that baseline.
categorical_cols = ['seller_type', 'layout_type', 'property_type',
                    'furnish_type', 'city']
rent = pd.get_dummies(rent, columns=categorical_cols, drop_first=True)
rent.head()

In [None]:
from sklearn.model_selection import train_test_split

# Target is the price column; every other column is a feature.
y = rent['price']
X = rent.drop(columns='price')

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Report the dimensions of each split.
for label, part in (("x train: ", X_train),
                    ("x test: ", X_test),
                    ("y train: ", y_train),
                    ("y test: ", y_test)):
    print(label, part.shape)

In [None]:
from statsmodels.api import OLS, add_constant

# statsmodels' OLS does not add an intercept on its own, and the dummy columns
# were encoded with drop_first=True, so the baseline level needs a constant
# term. The cast to float keeps boolean dummy columns from turning the design
# matrix into an object array.
X_train_ols = add_constant(X_train.astype(float), has_constant='add')

model = OLS(y_train, X_train_ols).fit()
print(model.summary())

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# Per-model score histories (cross-validation mean, train R2, test R2);
# three separate lists, each starting empty.
CV, R2_train, R2_test = [], [], []

def rent_pred_model(model,model_name):
    """Fit ``model`` on the training split, then print and plot its scores.

    Relies on the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. Each call appends one rounded entry to the notebook-level
    lists ``R2_train``, ``R2_test`` and ``CV``.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit`` and ``predict`` that ``cross_val_score``
        can evaluate.
    model_name : str
        Label supplied by callers; it is not used inside the function.

    Returns
    -------
    None
        Results are printed, plotted and appended to the score lists.
    """
    # Training model
    model.fit(X_train, y_train)

    # R2 score of train set
    y_pred_train = model.predict(X_train)
    R2_train_model = r2_score(y_train, y_pred_train)
    R2_train.append(round(R2_train_model, 2))

    # R2 score of test set
    y_pred_test = model.predict(X_test)
    R2_test_model = r2_score(y_test, y_pred_test)
    R2_test.append(round(R2_test_model, 2))

    # Mean score of a 3-fold cross-validation on the train set
    cross_val = cross_val_score(model, X_train, y_train, cv=3)
    cv_mean = cross_val.mean()
    CV.append(round(cv_mean, 2))

    # Mean absolute error on the test set
    mae = metrics.mean_absolute_error(y_test, y_pred_test)

    # Root mean squared error on the test set (square root of the MSE)
    rmse = math.sqrt(metrics.mean_squared_error(y_test, y_pred_test))

    # Printing results
    print("Train R2-score :",round(R2_train_model,2))
    print("Test R2-score :",round(R2_test_model,2))
    print("Train CV scores :",cross_val)
    print("Train CV mean :",round(cv_mean,2))
    print("MAE :", round(mae,5))
    print("RMSE :", round(rmse,5))

    # Plotting graphs
    fig, ax = plt.subplots(1, 2, figsize=(10, 4))

    # Left panel: density of the training residuals. sns.distplot is
    # deprecated; kdeplot draws the KDE-only curve that distplot(hist=False)
    # used to produce. The residuals are passed as a plain array so the plot
    # does not depend on the Series index.
    train_residuals = np.asarray(y_train - y_pred_train)
    ax[0].set_title('Residual Plot of Train samples')
    sns.kdeplot(train_residuals, ax=ax[0])
    ax[0].set_xlabel('y_train - y_pred_train')

    # Right panel: actual vs predicted values on the test set
    ax[1].set_title('y_test vs y_pred_test')
    ax[1].scatter(x=y_test, y=y_pred_test)
    ax[1].set_xlabel('y_test')
    ax[1].set_ylabel('y_pred_test')

    plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Plain least-squares baseline, evaluated with the shared reporting helper.
lr = LinearRegression()
rent_pred_model(model=lr, model_name="Linear_regressor.pkl")

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

# Creating Ridge model object
rg = Ridge()
# Candidate regularisation strengths: 14 log-spaced values from 1e-3 to 1e3
alpha = np.logspace(-3, 3, num=14)

# RandomizedSearchCV draws a random subset of the alpha candidates (10 by
# default), so the seed is fixed to make the chosen estimator and the
# reported scores reproducible across runs.
rg_rs = RandomizedSearchCV(
    estimator=rg,
    param_distributions=dict(alpha=alpha),
    random_state=101,
)

rent_pred_model(rg_rs,"ridge.pkl")

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV

# Creating Lasso model object
ls = Lasso()
# Candidate regularisation strengths: 14 log-spaced values from 1e-3 to 1e3
alpha = np.logspace(-3, 3, num=14)

# RandomizedSearchCV draws a random subset of the alpha candidates (10 by
# default), so the seed is fixed to make the chosen estimator and the
# reported scores reproducible across runs.
ls_rs = RandomizedSearchCV(
    estimator=ls,
    param_distributions=dict(alpha=alpha),
    random_state=101,
)

rent_pred_model(ls_rs,"lasso.pkl")