**Predicting rent of apartments based on various parameters.**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px
# Load the apartment rental offers CSV from the notebook's input directory.
DATA_PATH = '../input/apartment-rental-offers-in-germany/immo_data.csv'
data = pd.read_csv(DATA_PATH)

# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)

**DATA PREPROCESSING**


Exploring the Data Set

In [None]:

data.head(10)

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
#deleting outliers
# Keep only listings whose base rent, room count and living space all fall inside
# the accepted ranges. Series.between includes both endpoints by default, so no
# `inclusive` argument is passed (the boolean form is rejected by recent pandas).
# Rows with a missing value in any of the three columns are dropped as well,
# because between() evaluates to False for NaN.
within_limits = (
    data.baseRent.between(50, 10000)
    & data.noRooms.between(0, 15)
    & data.livingSpace.between(50, 30000)
)
# .copy() makes the filtered frame independent, so later column assignments
# do not raise SettingWithCopyWarning.
data = data[within_limits].copy()
data.shape

In [None]:
# Bar chart of baseRent (the target variable) for each regio1 category.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=data, x='regio1', y='baseRent', ax=ax)
plt.show()

In [None]:
# One baseRent histogram per heatingType, four panels per row; rents outside
# the 100-4000 range are left out of the bins.
g = (
    sns.FacetGrid(data, col='heatingType', col_wrap=4)
    .map(plt.hist, 'baseRent', bins=20, range=(100, 4000))
)

In [None]:
# Scatter of base rent against construction year.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter('yearConstructed', 'baseRent', data=data)
ax.set_title('Price by Year of Construction')
ax.set_xlabel('Year of Construction')
ax.set_ylabel('Price')
plt.show()

In [None]:
sns.histplot(data = data, x="baseRent",bins=30)

In [None]:
sns.set(rc={'figure.figsize':(15,10)})
# Histogram of log(baseRent). No kde_kws are passed: displot draws a plain
# histogram here (kde is off by default), so KDE options would be ignored.
# The x axis is labelled explicitly because the plotted values are logarithms.
sns.displot(np.log(data["baseRent"]), bins=20).set_axis_labels("log(baseRent)", "Count")

In [None]:
sns.set(rc={'figure.figsize':(15,10)})
# Histogram of log(livingSpace). No kde_kws are passed: displot draws a plain
# histogram here (kde is off by default), so KDE options would be ignored.
# The x axis is labelled explicitly because the plotted values are logarithms.
sns.displot(np.log(data["livingSpace"]), bins=20).set_axis_labels("log(livingSpace)", "Count")

In [None]:
# Bar chart of baseRent for each condition category.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=data, x='condition', y='baseRent', ax=ax)
plt.show()

In [None]:
# Bar chart of baseRent for each interiorQual category.
fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=data, x='interiorQual', y='baseRent', ax=ax)
plt.show()

In [None]:
# Bar chart of baseRent for each typeOfFlat category.
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(data=data, x='typeOfFlat', y='baseRent', ax=ax)
plt.show()

In [None]:
# Base rent against number of rooms, with a fitted regression line.
sns.set(rc={'figure.figsize': (15, 10)})
sns.regplot(data=data, x='noRooms', y='baseRent')


In [None]:
data.info()

In [None]:
#checking missing values count
# Percentage of missing values per column, relative to the current number of
# rows (the mean of a boolean mask is the fraction of True values).
data.isnull().mean() * 100

In [None]:
#dropping columns containing more than 60 percent missing values
# The share of missing values is computed against the frame's actual row count,
# and the columns to remove are collected up front rather than dropped one by
# one while the frame is being iterated.
missing_pct = data.isnull().mean() * 100
sparse_columns = missing_pct[missing_pct > 60].index
data = data.drop(columns=sparse_columns)

In [None]:
data.info()

In [None]:
# Remove range-bucket, address and fine-grained region columns that are not used as features.
unwanted_columns = [
    'baseRentRange', 'yearConstructedRange', 'houseNumber', 'noRoomsRange',
    'livingSpaceRange', 'street', 'streetPlain', 'firingTypes',
    'thermalChar', 'geo_krs', 'geo_bln', 'regio2', 'regio3',
]
data = data.drop(columns=unwanted_columns)

In [None]:
# Remove further columns that are not used as features.
data = data.drop(columns=['picturecount', 'scoutId', 'pricetrend'])

In [None]:
# Remove cost, free-text, date and other columns that are not used as features.
data = data.drop(columns=[
    'serviceCharge', 'telekomTvOffer', 'description', 'facilities', 'date',
    'totalRent', 'petsAllowed', 'numberOfFloors', 'telekomUploadSpeed',
])

In [None]:
data.info()

In [None]:
#creating a new feature: zipcode
# First version of the feature: the 3002 least frequent postal codes are
# collapsed to 0 and every other postal code is kept unchanged.
# NOTE: data['zipcode'] is assigned again in a later cell (binary high-rent
# flag), which replaces the values produced here.
others = list(data['geo_plz'].value_counts().tail(3002).index)
# Set copy of `others` so each membership test is O(1) instead of a list scan.
rare_zip_codes = set(others)

def edit_zip(x):
    """Return 0 when postal code `x` is one of the rare codes, otherwise `x` itself."""
    return 0 if x in rare_zip_codes else x

data['zipcode'] = data['geo_plz'].apply(edit_zip)

In [None]:
# Mean base rent for every postal code.
zipcode = data.groupby('geo_plz')['baseRent'].mean()

# The 2200 postal codes with the highest mean base rent.
mean_rent_descending = zipcode.sort_values(ascending=False)
cities = mean_rent_descending.head(2200).index.tolist()


In [None]:
# Set copy of `cities` so each membership test is O(1); scanning the list once
# per row makes this step needlessly slow.
high_rent_zip_codes = set(cities)

def city(x):
    """Return 1 when postal code `x` is in the high-rent postal codes, else 0."""
    return 1 if x in high_rent_zip_codes else 0

# Binary feature: 1 for postal codes in `cities`, 0 for all others.
data['zipcode'] = data['geo_plz'].apply(city)

In [None]:
data['zipcode'].value_counts()

In [None]:
#missing value count
data.isnull().sum()

In [None]:
#filling the missing value for 'heatingType'
data['heatingType'].value_counts()

In [None]:
# Fill missing heating types with "central_heating" (presumably the most
# frequent value - confirm against the value_counts output above).
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selected column is chained assignment, which warns on recent pandas and
# leaves the frame unchanged under Copy-on-Write.
data['heatingType'] = data['heatingType'].fillna("central_heating")

In [None]:
#filling the missing value for 'condition'
data['condition'].value_counts()

In [None]:
# Mark missing conditions with the placeholder string "null" so they show up as
# their own group below. Assigned back to the column, because an in-place
# fillna on the selected column is chained assignment and does not reliably
# update the frame.
data['condition'] = data['condition'].fillna("null")
data.groupby('condition')['baseRent'].mean()

In [None]:
# Replace the "null" placeholder in 'condition' using rule-based heuristics.
# Vectorised form of an if/elif chain: np.select takes the first matching rule
# for each row, and rows matching no rule fall back to "well_kept".
# NOTE(review): the rules read baseRent, which is also the prediction target
# later in the notebook - confirm this is intended.
missing_condition = data['condition'].eq("null")

condition_rules = [
    (data['newlyConst'].eq(True), "first_time_use"),
    (data['newlyConst'].eq(False) & data['baseRent'].ge(800), "fully_renovated"),
    (data['zipcode'].eq(1) & data['baseRent'].lt(500), "need_of_renovation"),
]
imputed_condition = np.select(
    [mask.to_numpy() for mask, _ in condition_rules],
    [label for _, label in condition_rules],
    default="well_kept",
)

# Only rows that carried the placeholder are overwritten.
data.loc[missing_condition, 'condition'] = imputed_condition[missing_condition.to_numpy()]

In [None]:
data['condition'].value_counts()

In [None]:
#filling the missing value for 'floor'
# Missing floors are replaced by the column mean. The result is assigned back
# to the column, because an in-place fillna on the selected column is chained
# assignment and does not reliably update the frame.
floor_mean = data['floor'].mean()
data['floor'] = data['floor'].fillna(floor_mean)

In [None]:
#filling the missing value for 'interiorQual'
data['interiorQual'].value_counts()

In [None]:
data.groupby('interiorQual')['baseRent'].mean()

In [None]:
# Mark missing interior quality with the placeholder string "null" so it can be
# counted and imputed later. Assigned back to the column, because an in-place
# fillna on the selected column is chained assignment and does not reliably
# update the frame.
data['interiorQual'] = data['interiorQual'].fillna("null")
data['interiorQual'].value_counts()

In [None]:
data[(data['interiorQual']=="luxury") & (data['zipcode']==0)]['baseRent'].mean()

In [None]:
data[(data['interiorQual']=="sophisticated") & (data['zipcode']==0)]['baseRent'].mean()

In [None]:
data[(data['interiorQual']=="normal") & (data['zipcode']==0)]['baseRent'].mean()

In [None]:
data[(data['interiorQual']=="simple") & (data['zipcode']==0)]['baseRent'].mean()


In [None]:
data[(data['interiorQual']=="luxury") & (data['zipcode']==1)]['baseRent'].mean()

In [None]:
data[(data['interiorQual']=="sophisticated") & (data['zipcode']==1)]['baseRent'].mean()

In [None]:
data[(data['interiorQual']=="normal") & (data['zipcode']==1)]['baseRent'].mean()

In [None]:
data[(data['interiorQual']=="simple") & (data['zipcode']==1)]['baseRent'].mean()


In [None]:
# Replace the "null" placeholder in 'interiorQual' using rent thresholds that
# depend on the binary zipcode flag (1 = high-rent postal code, 0 = other).
# Vectorised form of an if/elif chain: np.select takes the first matching rule
# for each row, and rows matching no rule fall back to "normal".
#
# Fix: the high-rent "sophisticated" band was written as 1700 < rent <= 1000,
# which can never be true, so those rows all fell through to "normal". The band
# is now 1000 < rent <= 1700, mirroring the 500 < rent <= 800 band used for the
# other postal codes (rents at the upper bound are claimed by the preceding
# "luxury" rule).
# NOTE(review): the rules read baseRent, which is also the prediction target
# later in the notebook - confirm this is intended.
missing_quality = data['interiorQual'].eq("null")
rent = data['baseRent']
high_rent_zip = data['zipcode'].eq(1)
other_zip = data['zipcode'].eq(0)

quality_rules = [
    (high_rent_zip & rent.ge(1700), "luxury"),
    (high_rent_zip & rent.gt(1000) & rent.le(1700), "sophisticated"),
    (other_zip & rent.ge(800), "luxury"),
    (other_zip & rent.gt(500) & rent.le(800), "sophisticated"),
]
imputed_quality = np.select(
    [mask.to_numpy() for mask, _ in quality_rules],
    [label for _, label in quality_rules],
    default="normal",
)

# Only rows that carried the placeholder are overwritten.
data.loc[missing_quality, 'interiorQual'] = imputed_quality[missing_quality.to_numpy()]

In [None]:
data['interiorQual'].value_counts()

In [None]:
#filling the missing value for 'typeOfFlat'
data['typeOfFlat'].value_counts()

In [None]:
data.groupby('typeOfFlat')['baseRent'].mean()

In [None]:
# Fill missing flat types with "apartment" (presumably the most frequent
# value - confirm against the value_counts output above). Assigned back to the
# column, because an in-place fillna on the selected column is chained
# assignment and does not reliably update the frame.
data['typeOfFlat'] = data['typeOfFlat'].fillna("apartment")


In [None]:
# The raw postal code has been replaced by the binary 'zipcode' feature;
# the construction year is not used as a feature either.
data = data.drop(columns=['yearConstructed', 'geo_plz'])


In [None]:
data.head()

In [None]:
#converting categorical values
# Names of all object-dtype columns; these are the ones that get one-hot encoded.
columns = [name for name in data.columns if data[name].dtype == 'object']

columns

In [None]:
# One-hot encode the categorical columns listed in `columns`: each category
# becomes its own indicator column.
dummies_feature = pd.get_dummies(data[columns])
dummies_feature.head()

In [None]:
# Append the indicator columns side by side (axis=1). The original categorical
# columns are still present after this step. Re-running this cell would append
# the indicator columns a second time.
data = pd.concat([data, dummies_feature], axis=1)
data.head()

In [None]:
# Remove the original categorical columns now that their indicator columns
# exist. The list computed earlier (`columns`) is reused instead of a
# hand-written copy, so the columns dropped here are exactly the ones that were
# encoded.
data = data.drop(columns=columns)


In [None]:
# Target: natural log of the base rent. Features: every remaining column except the target.
y = np.log(data['baseRent'])
x = data.drop(columns=['baseRent'])


#In order to build the model and to find appropriate hyperparameters for GBM, we conducted Random Search on an IDE. 

Training three models: **Linear Regression, RandomForestRegressor, GradientBoostingRegressor** 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=1
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import math
import pickle

prediction = []
def linearregression(xtrain, ytrain, xtest, ytest, model_path="LR_Model.pkl"):
    """Fit a linear regression, pickle it, and report hold-out metrics.

    Parameters
    ----------
    xtrain, ytrain : training features and target.
    xtest, ytest : hold-out features and target used for evaluation.
    model_path : file the fitted model is pickled to (default "LR_Model.pkl").

    Prints MAE, MSE and R2 on the hold-out set and shows a scatter plot of
    predicted against actual values. The metrics are on the scale of `ytest`;
    in this notebook the target is log(baseRent), so MAE/MSE are in log units.
    Returns None.
    """
    linreg = LinearRegression()
    linreg.fit(xtrain, ytrain)
    y_pred = linreg.predict(xtest)

    # Saving the model
    with open(model_path, 'wb') as file:
        pickle.dump(linreg, file)

    print('MAE:', metrics.mean_absolute_error(ytest, y_pred))
    print('MSE:', metrics.mean_squared_error(ytest, y_pred))
    print('R2_score:', metrics.r2_score(ytest, y_pred))

    # Create the figure before drawing and show it afterwards; opening a new
    # figure after plotting would only add an empty extra figure to the output.
    fig, ax = plt.subplots()
    ax.scatter(y_pred, ytest)
    ax.set_xlabel('predicted value of y')
    ax.set_ylabel('y')
    plt.show()
linearregression(X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

def RandomForest(xtrain, ytrain, xtest, ytest, random_state=1, model_path="RF_Model.pkl"):
    """Fit a random forest regressor, pickle it, and report hold-out metrics.

    Parameters
    ----------
    xtrain, ytrain : training features and target.
    xtest, ytest : hold-out features and target used for evaluation.
    random_state : seed passed to the forest so repeated runs give the same
        model and metrics (default 1).
    model_path : file the fitted model is pickled to (default "RF_Model.pkl").

    Prints MAE, MSE and R2 on the hold-out set and shows a scatter plot of
    predicted against actual values. The metrics are on the scale of `ytest`;
    in this notebook the target is log(baseRent), so MAE/MSE are in log units.
    Returns None.
    """
    randomForest = RandomForestRegressor(random_state=random_state)
    randomForest.fit(xtrain, ytrain)
    y_pred = randomForest.predict(xtest)

    # Saving the model
    with open(model_path, 'wb') as file:
        pickle.dump(randomForest, file)

    print('MAE:', metrics.mean_absolute_error(ytest, y_pred))
    print('MSE:', metrics.mean_squared_error(ytest, y_pred))
    print('R2_score:', metrics.r2_score(ytest, y_pred))

    # Create the figure before drawing and show it afterwards; opening a new
    # figure after plotting would only add an empty extra figure to the output.
    fig, ax = plt.subplots()
    ax.scatter(y_pred, ytest)
    ax.set_xlabel('predicted value of y')
    ax.set_ylabel('y')
    plt.show()

RandomForest(X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Best hyperparameters from Random Search:
# max_depth: 16, min_samples_leaf: 117, n_estimators: 73, max_features: 10, learning_rate: 0.07
def gradientboostingmachine(md, msl, n, mf, lr, xtrain, ytrain, xtest, ytest,
                            model_path="GB_Model.pkl"):
    """Fit a gradient boosting regressor, pickle it, and report hold-out metrics.

    Parameters
    ----------
    md : max_depth of each tree.
    msl : min_samples_leaf.
    n : n_estimators (number of boosting stages).
    mf : max_features considered per split.
    lr : learning_rate.
    xtrain, ytrain : training features and target.
    xtest, ytest : hold-out features and target used for evaluation.
    model_path : file the fitted model is pickled to (default "GB_Model.pkl").

    Prints MAE, MSE and R2 on the hold-out set and shows a scatter plot of
    predicted against actual values. The metrics are on the scale of `ytest`;
    in this notebook the target is log(baseRent), so MAE/MSE are in log units.
    Returns None.
    """
    # random_state is fixed so repeated runs give the same model.
    gbm_best = GradientBoostingRegressor(
        n_estimators=n,
        random_state=1111,
        max_depth=md,
        max_features=mf,
        min_samples_leaf=msl,
        learning_rate=lr,
    )
    gbm_best.fit(xtrain, ytrain)
    y_pred_gbm = gbm_best.predict(xtest)

    # Saving the model
    with open(model_path, 'wb') as file:
        pickle.dump(gbm_best, file)

    print('MAE:', metrics.mean_absolute_error(ytest, y_pred_gbm))
    print('MSE:', metrics.mean_squared_error(ytest, y_pred_gbm))
    print('R2_score:', metrics.r2_score(ytest, y_pred_gbm))

    # Create the figure before drawing and show it afterwards; opening a new
    # figure after plotting would only add an empty extra figure to the output.
    fig, ax = plt.subplots()
    ax.scatter(y_pred_gbm, ytest)
    ax.set_xlabel('predicted value of y')
    ax.set_ylabel('y')
    plt.show()
gradientboostingmachine(16, 117, 73, 10, 0.07, X_train, y_train, X_test, y_test) 

Among the three models, **R2_score is the highest for GradientBoosting.**