In [None]:
import pandas as pd
import numpy as np

# Location of the raw rental listings, relative to the notebook.
DATA_PATH = "../input/brasilian-houses-to-rent/houses_to_rent_v2.csv"

# Read the raw listings and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Summarise the raw frame: column names, dtypes and non-null counts.
data.info()

# EDA :
Before building the linear regression model, I do some exploratory analysis of the data.

Let's start with data cleaning and preparation, step by step.

In [None]:
# 'floor' is parsed to a numeric column; entries that cannot be parsed
# become NaN because of errors='coerce'.
floor_numeric = pd.to_numeric(data['floor'], errors='coerce')
data['floor'] = floor_numeric

# Re-check dtypes and non-null counts after the conversion.
data.info()

In [None]:
# Number of missing values in each column.
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Non-numeric 'floor' entries were coerced to NaN by pd.to_numeric above;
# treat them as the ground floor (0).
# The fill is restricted to 'floor' so that a missing value in any other
# column is not silently replaced by 0.
data = data.fillna({'floor': 0})
print(len(data))
print(data.describe())

In [None]:
# check the outliers in rooms, bathroom,floor
import seaborn as sns
import matplotlib.pyplot as plot

# One row of three panels, one box plot per candidate column.
f, axes = plot.subplots(nrows=1, ncols=3)
floor_ax, rooms_ax, bathroom_ax = axes

sns.boxplot(data=data, y='floor', ax=floor_ax)
sns.boxplot(data=data, y='rooms', ax=rooms_ax)
sns.boxplot(data=data, y='bathroom', ax=bathroom_ax)


In [None]:
# outlier treatment:

def drop_iqr_outliers(frame, column, whisker=1.5):
    """Return the rows of ``frame`` whose ``column`` lies inside the IQR fences.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input rows; not modified.
    column : str
        Name of the numeric column to filter on.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added beyond Q1/Q3 to form the
        lower and upper fences.

    Returns
    -------
    pandas.DataFrame
        Rows with ``Q1 - whisker*IQR <= value <= Q3 + whisker*IQR`` (both ends
        inclusive). Rows where the value is NaN are dropped.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    within_fences = frame[column].between(q1 - whisker * iqr, q3 + whisker * iqr)
    return frame[within_fences]


# The filters are applied one after another, so the quartiles of each column
# are computed on the rows that survived the previous filter.
data_1 = data
for column in ['floor', 'rooms', 'bathroom']:
    data_1 = drop_iqr_outliers(data_1, column)

print(len(data_1))

# Re-draw the box plots on the filtered data to confirm the effect.
f, axes = plot.subplots(1, 3)

sns.boxplot(y='floor', data=data_1, ax=axes[0])
sns.boxplot(y='rooms', data=data_1, ax=axes[1])
sns.boxplot(y='bathroom', data=data_1, ax=axes[2])

In [None]:
# Distribution of the target variable (rent amount).
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement for a density histogram with a KDE overlay.
fig, ax = plot.subplots()
sns.histplot(data_1['rent amount (R$)'], bins=30, kde=True, stat='density', ax=ax)
# tight_layout only has an effect once the axes have been drawn, so call it last.
fig.tight_layout()

In [None]:
# The frame contains categorical columns; one-hot encode them so that every
# remaining modelling column is numeric.
categorical_columns = ['furniture', 'animal']
data_1 = pd.get_dummies(data_1, columns=categorical_columns)

print(data_1.head())
print(len(data_1))

In [None]:
# Number of listings per city.
city_counts = data_1.groupby('city').size()
city_counts

In [None]:

# Remove the 'city' column; it is not used as a model feature.
data_1 = data_1.drop(columns=['city'])

# Need to scale the dataset.

def normalize(x):
    """Mean-centre ``x`` and scale it by its range: ``(x - mean) / (max - min)``.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        One-dimensional numeric (or boolean) data.

    Returns
    -------
    Same container type as ``x``, with float values. If every value is
    identical (range of zero) the centred values, i.e. all zeros, are
    returned instead of dividing by zero.
    """
    # Cast first so boolean dummy columns support subtraction of their extrema.
    values = x.astype(float)
    # Vectorised reductions avoid a Python-level loop; on a Series they skip
    # NaN, unlike the builtin max()/min() whose result depends on where a NaN
    # happens to sit.
    spread = values.max() - values.min()
    centred = values - values.mean()
    if spread == 0:
        return centred
    return centred / spread

# Scale every column independently (apply works column-wise with axis=0).
# NOTE(review): the scaling statistics are computed on the full dataset, before
# the train/test split performed later in the notebook — confirm this is intended.
data_1 = data_1.apply(normalize, axis=0)
data_1.head(5)


In [None]:
import seaborn as sns
# Pairwise correlations between all columns, rounded to two decimals so the
# cell annotations stay readable.
cal_corr = data_1.corr().round(decimals=2)

fig, ax = plot.subplots(figsize=(10, 10))
sns.heatmap(data=cal_corr, ax=ax, annot=True, linewidths=1)

1. The correlation matrix above shows the relationship between each pair of variables; values range from -1 to 1. A value close to 1 (or -1) means there is a strong positive (or negative) linear relationship between the two variables. Here we are predicting the rent amount of a house, so for the linear regression model we should keep only those variables that have a strong linear relationship with the rent amount.
2. "area", "rooms", "bathroom", "parking spaces" and "fire insurance (R$)" have a strong positive correlation with the rent amount.
3. "bathroom" is highly correlated with "rooms", and "parking spaces" is also correlated with other variables. This is called multicollinearity. We can check it further, and take action, using the VIF (variance inflation factor).


# Linear Regression :

In [None]:
# Predictors for the rent model.
# - 'total (R$)' is deliberately excluded: it is the overall monthly cost and
#   already contains the rent amount, so using it as a predictor leaks the target.
# - Only one of the two furniture dummies is kept: after mean-centring the two
#   columns are exact negatives of each other, which makes the design matrix singular.
feature_columns = [
    'area',
    'rooms',
    'bathroom',
    'parking spaces',
    'fire insurance (R$)',
    'furniture_furnished',
    'hoa (R$)',
]
target_column = 'rent amount (R$)'

# Selecting with a list of labels already returns a DataFrame, so no re-wrapping
# in pd.DataFrame is needed; .copy() decouples the result from data_1.
xData = data_1[feature_columns].copy()
print(xData.head())
yData = data_1[[target_column]].copy()
print(yData.head())

The data is now prepared for the regression model. Let's split it into train and test sets:
70% of the rows are used for training and 30% for testing.

In [None]:
# import the library for spliting the data.
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    xData,
    yData,
    train_size=0.7,
    test_size=0.3,
    random_state=5,
)

# Report the shape of each partition, in the order train-X, test-X, train-y, test-y.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

In [None]:
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept on its own, so add a constant
# column to the training design matrix.
x_train = sm.add_constant(x_train)

# First model: all selected predictors.
lm_model1 = sm.OLS(endog=y_train, exog=x_train).fit()
print(lm_model1.summary())

Our first model is ready and has an R-squared of 98%, but the p-value of the `rooms` variable is greater than 0.05, which means it is not statistically significant. So we
need to remove this variable.

In [None]:
# Second model: refit without 'rooms', whose p-value exceeded 0.05 in lm_model1.
# The `columns=` keyword is used because passing the axis positionally to
# DataFrame.drop was deprecated in pandas 1.3 and is rejected by pandas 2.0+.
x_train = x_train.drop(columns=['rooms'])
lm_model2 = sm.OLS(y_train, x_train).fit()
print(lm_model2.summary())

In [None]:
# Third model: additionally refit without 'fire insurance (R$)'.
# NOTE(review): presumably dropped based on the lm_model2 summary — confirm.
# The `columns=` keyword is used because passing the axis positionally to
# DataFrame.drop was deprecated in pandas 1.3 and is rejected by pandas 2.0+.
x_train = x_train.drop(columns=['fire insurance (R$)'])
lm_model3 = sm.OLS(y_train, x_train).fit()
print(lm_model3.summary())

Now I am using the last model, `lm_model3`, for the predictions.

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
def _report_fit(label, y_true, y_pred):
    """Print and return the RMSE and R^2 of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    label : str
        Name of the data partition, inserted into the printed messages.
    y_true, y_pred : array-like
        Observed and predicted target values.

    Returns
    -------
    tuple of float
        ``(rmse, r2)``, each rounded to three decimals.
    """
    # The builtin round() is used instead of the .round() method because the
    # metric functions may return a plain Python float, which has no .round().
    rmse = round(float(np.sqrt(mean_squared_error(y_true, y_pred))), 3)
    r2 = round(float(r2_score(y_true, y_pred)), 3)
    print('RMSE for {} data is : {}'.format(label, rmse))
    print('r2 for {} data is : {}'.format(label, r2))
    return rmse, r2


# In-sample fit of the final model.
ytrain_predic = lm_model3.predict(x_train)
rmse, r2 = _report_fit('training', y_train, ytrain_predic)

# The test design matrix must have exactly the columns (and column order) the
# final model was fitted on: add the constant, then select the training columns.
x_test_model3 = sm.add_constant(x_test)[x_train.columns]
ytest_predic = lm_model3.predict(x_test_model3)
rmse, r2 = _report_fit('test', y_test, ytest_predic)