In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the housing listings CSV into a DataFrame and preview the first rows.
df = pd.read_csv("/kaggle/input/usa-housing-listings/housing.csv")
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summary statistics of the numeric columns before any cleaning.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# Before imputing the null values, let's remove the unwanted features, as these features do not play any role in the estimation of house rent

In [None]:
# Remove the identifier, URL and free-text columns from the frame.
df = df.drop(columns=["id", "url", "region_url", "image_url", "description"])

The region name already tells us which state the house is located in, so the state column is redundant.
We can therefore remove the state column as well.

In [None]:
# Remove the state column.
df = df.drop(columns="state")

The most important features are sqfeet, region, type, and the number of bedrooms and bathrooms.
The other features also affect the rental price, but they are less important.
We still keep the other features for estimation, since in combination they contribute to the estimate.

### Now let's impute the null values with appropriate measures of central tendency

In [None]:
# Categorical columns: fill missing entries with the most frequent value (mode).
for col in ("laundry_options", "parking_options"):
    df[col] = df[col].fillna(df[col].mode()[0])

# Coordinate columns: fill missing entries with the column mean.
for col in ("lat", "long"):
    df[col] = df[col].fillna(df[col].mean())

In [None]:
# Frequency of each distinct laundry option.
df["laundry_options"].value_counts()

In [None]:
# Frequency of each distinct parking option.
df["parking_options"].value_counts()

In [None]:
# Frequency of each distinct region.
df["region"].value_counts()

In [None]:
# Frequency of each distinct housing type.
df["type"].value_counts()

In [None]:
# Frequency of each distinct bathroom count.
df["baths"].value_counts()

In [None]:
# Convert the float bath counts to integers.
# NOTE(review): astype("int") truncates toward zero, so any fractional bath
# counts (e.g. 1.5, if present in the data) are rounded down, and the cast
# raises if the column still contains NaN — confirm both are intended.
df["baths"]=df["baths"].astype("int")

In [None]:
# Frequency of each distinct bedroom count.
df["beds"].value_counts()

### A typical house has at most 4 bedrooms; anything above that is outside the normal range,
### so we can remove the entries with more than 4 bedrooms.
### If there are at most 4 bedrooms, there will be at most 4 bathrooms,
### so we can also remove the entries with more than 4 bathrooms.

In [None]:
# Flag listings with more than 4 bedrooms or more than 4 bathrooms.
outlier1 = (df["beds"] > 4) | (df["baths"] > 4)

# Count the flagged rows from the mask itself. Series.count() on a single
# column skips rows where that column is NaN, so it could under-report the
# number of rows that the mask actually removes.
print("There is {} outlier".format(int(outlier1.sum())))

In [None]:
# Keep only the rows that were not flagged by the bed/bath outlier mask.
df = df.loc[~outlier1]

In [None]:
# Draw one histogram per numeric column on a wide 30x10 inch figure.
df.hist(figsize=(30,10))

### We can see there are "0" values present in the price and sqfeet columns.
### According to zoning regulations, the minimum square footage for a house is 120 square feet (a single-room house),
### so we can remove the entries which are below 120.
### Even a big house is around 5,000 sqfeet at the maximum,
### so we can remove the values above 5,000.
### For price, we keep values of at least 100 and at most 10,000 dollars, alongside the square-footage bounds of 120 and 5,000.
### At the maximum a person would pay 10,000 dollars as rent; above that, they might rather buy their own house and pay the EMI than pay rent monthly.

In [None]:
# Flag listings whose area is outside [120, 5000] sqfeet or whose price is
# outside [100, 10000].
outlier2 = (
    (df["sqfeet"] < 120)
    | (df["sqfeet"] > 5000)
    | (df["price"] < 100)
    | (df["price"] > 10000)
)

# Count the flagged rows from the mask itself rather than via .count() on an
# unrelated column, which would skip rows where that column is NaN.
print("There is {} outlier".format(int(outlier2.sum())))

In [None]:
# Keep only the rows that were not flagged by the sqfeet/price outlier mask.
df = df.loc[~outlier2]

In [None]:
# Summary statistics of the numeric columns after the outlier rows were removed.
df.describe()

### We can see there are two columns named dogs_allowed and cats_allowed.
### We can combine them into a single column.

In [None]:
# Combine the two pet flags into one: a listing allows pets if it allows dogs
# OR cats. (Previously cats_allowed was discarded and dogs_allowed merely
# renamed, so cat-only listings were recorded as "no pets".)
# assumes both columns are 0/1 indicators — TODO confirm against the data
pets_allowed = ((df["dogs_allowed"] == 1) | (df["cats_allowed"] == 1)).astype(int)

# Build a new frame instead of renaming in place; assigning to the renamed
# column keeps it in the original dogs_allowed position.
df = (
    df.drop(columns=["cats_allowed"])
      .rename(columns={"dogs_allowed": "pets_allowed"})
      .assign(pets_allowed=pets_allowed)
)

# Data Visualization

In [None]:
# Number of listings per housing type.
# Pass the column with the `x=` keyword: recent seaborn releases treat the
# first positional argument of countplot as `data`, not as the variable to count.
sns.countplot(x=df["type"])
fig = plt.gcf()
fig.set_size_inches(15,10)
plt.title('Which type of house is more')

In [None]:
# Bar chart of price by housing type.
ax = sns.barplot(x=df["type"], y=df["price"])
fig = ax.get_figure()
fig.set_size_inches(15, 10)
ax.set_title('Which type of house has more price')

In [None]:
# Bar chart of square footage by housing type.
ax = sns.barplot(x=df["type"], y=df["sqfeet"])
fig = ax.get_figure()
fig.set_size_inches(15, 10)
ax.set_title('Which type of house has more sqfeet')

In [None]:
# Bar chart of price by parking option.
ax = sns.barplot(x=df["parking_options"], y=df["price"])
fig = ax.get_figure()
fig.set_size_inches(15, 10)
ax.set_title('Which type of parking option has more price')

## Label Encoding the categorical string values

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()
# Encode into an independent copy. A plain `db = df` only creates a second
# name for the same object, so writing encoded columns to `db` would also
# overwrite the original string categories in `df`.
db = df.copy()

In [None]:
# Replace each categorical column with integer codes; the shared encoder is
# refitted on every column in turn.
for col in ("region", "type", "laundry_options", "parking_options"):
    db[col] = le.fit_transform(df[col])

In [None]:
# Preview the frame after label encoding.
db.head()

In [None]:
# Target is the rental price; every remaining column is a feature.
y = db["price"]
x = db.drop(columns="price")

## Scatter plot to understand the relation

In [None]:
plt.figure(figsize=(20, 30), facecolor='white')

# One scatter plot of price against each feature on a 4x4 grid; any feature
# beyond the 16th is skipped because the grid has no slot for it.
for plotnumber, column in enumerate(x, start=1):
    if plotnumber > 16:
        continue
    ax = plt.subplot(4, 4, plotnumber)
    plt.scatter(x[column], y)
    plt.xlabel(column, fontsize=20)
    plt.ylabel('Price', fontsize=20)
plt.tight_layout()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column: fit the scaler on x, then transform x.
scalar = StandardScaler().fit(x)
x_scaled = scalar.transform(x)

In [None]:
# Display the standardised feature array.
x_scaled

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of each scaled feature, listed next to its name.
n_features = x_scaled.shape[1]
vif = pd.DataFrame({
    "VIF": [variance_inflation_factor(x_scaled, idx) for idx in range(n_features)],
    "Features": x.columns,
})
vif

## Also checking multicollinearity with heatmap

In [None]:
# Pairwise correlation matrix of the encoded frame, drawn as an annotated heatmap.
corrl = db.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(
    corrl,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 12},
    cmap='twilight_shifted_r',
    cbar=True,
    square=True,
)

### From the above heatmap and the VIF values we can say that there is no significant multicollinearity in this dataset

# Splitting the dataset for train and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import StandardScaler

# Split the unscaled features first, then fit the scaler on the training rows
# only. Fitting the scaler on the full dataset before splitting lets the test
# rows' mean and variance leak into preprocessing.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=470
)
split_scaler = StandardScaler().fit(x_train_raw)
x_train = split_scaler.transform(x_train_raw)
x_test = split_scaler.transform(x_test_raw)

# Importing the models for training the dataset

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# Seed the randomised estimators so the comparison table and the reported
# metrics are reproducible across "Restart & Run All"; without a seed the tree
# and forest scores change on every run.
dtr = DecisionTreeRegressor(random_state=470)
ran = RandomForestRegressor(n_estimators=90, random_state=470)
lin = LinearRegression()

In [None]:
# Display name -> estimator, plus an empty dict to collect each model's score.
scores = dict()
models = dict([
    ("Decision tree", dtr),
    ("Random forest", ran),
    ("Linear Regression", lin),
])

In [None]:
# Fit every registered model on the training split and record its test-set score.
for name, model in models.items():
    scores[name] = model.fit(x_train, y_train).score(x_test, y_test)

In [None]:
# The regressors' .score() returns the coefficient of determination (R^2), not
# a classification accuracy, so label the column accordingly.
scores_frame = (
    pd.DataFrame(scores, index=["R^2 Score"]).T
    .sort_values(by="R^2 Score", ascending=False)
)
scores_frame

## We can see that the Random Forest Regressor gives better results than the other models

# Model Evaluation

In [None]:
# Predict prices for the test split with the fitted random forest.
y_pred=ran.predict(x_test)

In [None]:
from sklearn import metrics

# Compute each base metric once and reuse it below.
r2 = metrics.r2_score(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

# Adjusted R^2 must use the sample size and feature count of the data that R^2
# was computed on. R^2 here is a test-set score, so use the test-set shape
# rather than the training-set shape.
n_obs, n_features = x_test.shape
adjusted_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1)

print('R^2:', r2)
print('Adjusted R^2:', adjusted_r2)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

# Conclusion

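### By comparing the above results we can choose the Random Forest Regressor for the estimation of house rent. With the Random Forest Regressor the R² on the test set is about 0.86, i.e. the model explains roughly 86% of the variance in house rent (R² is a goodness-of-fit measure for regression, not a classification accuracy).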