In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns
from warnings import filterwarnings as filt
from scipy.stats import skew, norm 

filt('ignore')
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12,6)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
cols = {
    '0': 'rent', 
    '1': 'title',
    '2': 'bedrooms',
    '3': 'bathrooms',
    '4': 'sqft',
    '5': 'location',
    '6': 'details',
    '7': 'amenity'
}

In [None]:
df = pd.read_csv('/kaggle/input/scraped-property-details-99co/Property details.csv')
df = df.drop(['Unnamed: 0'], axis = 1)
df = df.rename(columns = cols)
df.head()

In [None]:
df.isnull().sum()

### data cleaning 

In [None]:
import re

def get_num(x):
    """Return the integer formed by concatenating every decimal digit in ``x``.

    Parameters
    ----------
    x : str or int
        Scraped text such as ``'$3,500/mo'`` or ``'2 Beds'``. Values that are
        already integers (Python ``int`` or NumPy integer) are returned as a
        plain ``int`` unchanged, so applying this function to a column that
        was converted earlier is harmless.

    Returns
    -------
    int
        All digits of ``x`` joined in order. Separators, signs and decimal
        points are ignored for strings, so ``'1,200'`` -> ``1200`` and
        ``'-1 Beds'`` -> ``1``.

    Raises
    ------
    ValueError
        If ``x`` is a string that contains no digit at all.
    """
    # Already numeric: nothing to parse (re.findall would raise TypeError).
    if isinstance(x, (int, np.integer)):
        return int(x)
    digits = ''.join(re.findall(r'[0-9]', x))
    if not digits:
        # int('') raises the same exception type, but with an unhelpful message.
        raise ValueError(f"no digits found in {x!r}")
    return int(digits)

df.rent.apply(get_num).head()

In [None]:
num_cols = ['rent','bedrooms', 'bathrooms', 'sqft']

#### rent

In [None]:
# df[df.rent.str.contains()]
df.rent.apply(lambda x : x.split('/')[-1]).unique()

All rents are quoted per month.

#### bedrooms

In [None]:
df.bedrooms.unique()

A studio apartment is a self-contained unit that houses everything in a single room, except the bathroom.

In [None]:
df[df.bedrooms.str.contains('-')]

In the title, every listing with `-1 Beds` is described as a 1-bedroom unit, and every listing with `-` is described as a studio.

In [None]:
# Normalise the odd bedroom labels: studios and '-' count as zero bedrooms,
# and the malformed '-1 Beds' is really a single bedroom.
bedroom_fixes = {
    '-': '0 Beds',
    'Studio': '0 Beds',
    '-1 Beds': '1 Bed',
}
df['bedrooms'] = df.bedrooms.replace(bedroom_fixes)

In [None]:
df.bedrooms.unique()

#### bathrooms

In [None]:
df.bathrooms.unique()

In [None]:
df[df.bathrooms.str.contains('Baths')].head()

In [None]:
df[df.bathrooms.str.contains('sqft')].head()

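* Some rows hold a sqft value in the bathrooms column. For those rows, let's estimate the number of bathrooms as sqft / 1000 (rounded down, with a minimum of 1).
* In other words, we assume one bathroom for every 1000 sqft.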

In [None]:
# df[(df.bathrooms.str.contains('sqft')) & (df.sqft.str.contains('land'))
# df[df.bathrooms > df.bedrooms]
# .apply(lambda x : x[0])
# df.loc[idx] = df.loc

def _baths_from_sqft(text):
    """Turn a sqft string into a bathroom label, one bathroom per 1000 sqft (minimum 1)."""
    n_baths = int(get_num(text) / 1000)
    if n_baths == 0:
        return '1 Bath'
    return f"{n_baths} Baths"

# Rows whose bathrooms cell actually contains a sqft figure.
idx = df.index[df.bathrooms.str.contains('sqft')]
df.loc[idx, 'bathrooms'] = df.loc[idx, 'bathrooms'].apply(_baths_from_sqft)

In [None]:
df.bathrooms.unique()

#### sqft

In [None]:
sqftc = df.sqft.str.split(expand = True)
sqftc.head()

In [None]:
sqftc[1].unique()

In [None]:
sqftc[2].unique()

#### rest

In [None]:
df[num_cols].head()

In [None]:
# Convert each column listed in num_cols from its scraped text form to an
# integer by applying get_num to every value. The column is overwritten in
# place, so df no longer holds the raw text afterwards. The column name is
# printed before each conversion so a failure can be traced to its column.
for col in num_cols:
    print(col)
    df[col] = df[col].apply(get_num)
    
df.head()

#### location

In [None]:
df.location.unique()

In [None]:
df[df.location == '[]'].head()

In [None]:
df[df.location == '[]'].shape[0] / df.shape[0] 

In [None]:
# Remove the listings whose location is the empty-list placeholder '[]'.
idx = df.index[df.location == '[]']
df = df.drop(index = idx)
# df.head()

In [None]:
# district must be derived first: it reads the last whitespace-separated token
# of the raw location, which the second statement trims away.
df['district'] = [get_num(raw_loc.split()[-1]) for raw_loc in df.location]
# Keep only the part of the location that precedes ' ('.
df['location'] = [raw_loc.split(' (')[0] for raw_loc in df.location]

#### amenity

In [None]:
from collections import Counter

def split(x, return_len = False):
    """Parse a stringified amenity list into its individual amenity names.

    Parameters
    ----------
    x : str
        Text such as ``"['Gym', 'Pool']"``; brackets and single quotes are
        stripped and the remainder is split on ``', '``.
    return_len : bool, default False
        When True, return the number of amenities instead of the list.

    Returns
    -------
    list of str or int
        The amenity names, or their count when ``return_len`` is True. An
        empty list (``'[]'``) yields ``[]`` / ``0``.
    """
    cleaned = re.sub(r"\[|\]|'", '', x)
    # ''.split(', ') returns [''], which would count an empty list as one
    # amenity and put '' into the frequency table; drop blank entries.
    ams = [am for am in cleaned.split(', ') if am.strip()]
    return len(ams) if return_len else ams

all_amenity = []
# df.amenity[0], re.sub(r'\[|\]','',df.amenity[0])

In [None]:
# Build the flat amenity list from scratch in a single pass. Rebinding the
# name (instead of appending to the list created in the previous cell) keeps
# this cell safe to re-run, and avoids copying the whole list for every row as
# repeated `list + list` did.
all_amenity = [amenity for listing in df.amenity for amenity in split(listing)]

# Frequency of each amenity across all listings.
count = Counter(all_amenity)

In [None]:
count.most_common(15)

Let's use the most common amenities as features.

In [None]:
# df.amenity.apply(lambda x : split(x, return_len= True))
df['total_amenity'] = df.amenity.apply(lambda x : split(x, return_len= True))

In [None]:
# df[df.amenity.str.contains('gym', case = False)]

In [None]:
new_feats = ['gym', 'security', 'parking', 'swimming pool', 'aircon']
# One 0/1 indicator column per amenity keyword: 1 when the keyword occurs
# (case-insensitively) anywhere in the listing's amenity text.
for col in new_feats:
    df[col] = df.amenity.str.contains(col, case = False).astype('int64')

In [None]:
for col in new_feats:
    print(col.center(60, '='))
    print(df[col].value_counts())
    print()

In [None]:
df.head()

In [None]:
# Set the free-text columns aside, then remove them from the modelling frame.
useless_feats = df[['title', 'details', 'amenity']]
df = df.drop(columns = useless_feats.columns)

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['location'] = encoder.fit_transform(df.location)
df.head()

In [None]:
sns.distplot(df.rent, fit = norm)

In [None]:
sns.distplot(np.log1p(df.rent), fit = norm)

In [None]:
original_rent = df.rent
df['rent'] = np.log1p(df.rent)

In [None]:
sns.heatmap(df.corr(), fmt = '.1f', annot = True)

Bedrooms and bathrooms are highly correlated, since most houses have the same number of bathrooms as bedrooms.

In [None]:
df.corrwith(df.rent).sort_values(ascending = False)

In [None]:
import eli5 
from eli5.sklearn import PermutationImportance
from sklearn.feature_selection import mutual_info_regression
from pdpbox import pdp
from sklearn.ensemble import RandomForestRegressor

In [None]:
def permImp(x, y):
    """Fit a default random forest on (x, y) and display its permutation importances.

    The importances are computed on the same data the forest was fitted on.
    Returns the eli5 weights display, labelled with the column names of x.
    """
    forest = RandomForestRegressor()
    forest.fit(x, y)
    importance = PermutationImportance(forest)
    importance.fit(x, y)
    feature_names = x.columns.tolist()
    return eli5.show_weights(importance, feature_names = feature_names)

def isolate(x, y, col):
    """Fit a default random forest on (x, y) and draw the partial dependence plot for ``col``.

    Returns whatever ``pdp.pdp_plot`` returns for the isolated feature.
    """
    forest = RandomForestRegressor()
    forest.fit(x, y)
    isolated = pdp.pdp_isolate(
        forest,
        dataset = x,
        model_features = x.columns,
        feature = col,
    )
    return pdp.pdp_plot(isolated, feature_name = col)

def plot_mi(score):
    """Draw a horizontal bar chart of the ``mi_score`` column, sorted ascending.

    ``score`` is a DataFrame indexed by feature name. Nothing is returned.
    """
    ordered = score.sort_values('mi_score', ascending = True)
    plt.barh(ordered.index, ordered['mi_score'])
    plt.title('mutual info regressor for x feats')

def mi_score(x, y):
    """Compute, plot and return the mutual information between each column of x and y.

    Every feature is passed to ``mutual_info_regression`` as continuous
    (``discrete_features = False``). The result is a one-column DataFrame
    (``mi_score``) indexed by feature name and sorted in descending order.
    """
    mi_values = mutual_info_regression(x, y, discrete_features = False)
    score = pd.DataFrame(mi_values, index = x.columns, columns = ['mi_score'])
    score = score.sort_values('mi_score', ascending = False)
    plot_mi(score)
    return score

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are every column except the (log-transformed) rent target.
x = df.drop(['rent'], axis = 1)
y = df.rent
# Fix the seed so the split, and every score computed from it below, is the
# same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

#### permutation importance 

In [None]:
permImp(x_train, y_train)

#### mutual info regressor 

In [None]:
mscore = mi_score(x_train, y_train)

According to the mutual information scores, all of these features are important.

In [None]:
isolate(x_train, y_train, 'sqft')

According to the partial dependence plot, sqft and rent are positively correlated, which is expected.

In [None]:
sns.scatterplot(data = df, x = 'sqft', y = 'rent', hue = 'bedrooms')

### model building 

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline


In [None]:
def best_model(x, y):
    """Cross-validate every (model, scaler) combination and tabulate the RMSE.

    Parameters
    ----------
    x : DataFrame
        Feature matrix.
    y : Series
        Target values.

    Returns
    -------
    DataFrame
        One row per model and one column per scaler ('None', 'std', 'robust',
        'min_max'); each cell is the square root of the mean 5-fold
        cross-validated MSE.
    """
    models = [LinearRegression(), Lasso(), Ridge(), SVR(), RandomForestRegressor(), XGBRegressor(), LGBMRegressor()]
    names = ['linear regg', 'lasso', 'ridge', 'svmr', 'random forest', 'xgboost regg', 'lightgbm regg']
    scalers = [None, StandardScaler(), RobustScaler(), MinMaxScaler()]
    snames = ['None', 'std', 'robust', 'min_max']
    scores = [[] for _ in range(len(scalers))]
    for model in models:
        for ind, scaler in enumerate(scalers):
            # Wrap the *bare* model for each scaler. Rebinding `model` here
            # would nest the previous pipeline inside the next one, so the
            # later scaler columns would measure stacked scalers.
            if scaler is not None:
                estimator = Pipeline(steps = [('scaler', scaler), ('model', model)], verbose = 1)
            else:
                estimator = model
            neg_mse = cross_val_score(estimator, x, y, cv = 5, scoring = 'neg_mean_squared_error').mean()
            # The scorer returns negated MSE; flip the sign before the root.
            rmse = np.sqrt(-1 * neg_mse)
            scores[ind].append(rmse)

    return pd.DataFrame(scores, index = snames, columns = names).T

def get_score(xt, yt, xtest, ytest, model, scaler = None):
    """Fit ``model`` on the training data and print train/test scores plus test MSE and MAE.

    When ``scaler`` is given, the model is first wrapped in a scaler -> model
    pipeline. The scores come from the estimator's own ``score`` method.
    Nothing is returned.
    """
    estimator = Pipeline(steps = [('scaler', scaler), ('model', model)], verbose = 1) if scaler else model
    estimator.fit(xt, yt)
    predictions = estimator.predict(xtest)

    print(' Results '.center(70,'='))
    print()
    train_score = estimator.score(xt, yt)
    print(f"Training score :===> {train_score}")
    test_score = estimator.score(xtest, ytest)
    print(f"Testing score  :===> {test_score}")
    mse = mean_squared_error(ytest, predictions)
    print(f"MSE            :===> {mse}")
    mae = mean_absolute_error(ytest, predictions)
    print(f"MAE            :===> {mae}")
    
def gridcv(x, y, model, params, scaler = None):
    """Run a 5-fold grid search and summarise train/test RMSE for every candidate.

    When ``scaler`` is given, the model is wrapped in a scaler -> model
    pipeline before searching.

    Returns a tuple ``(best_estimator, best_params, summary)`` where
    ``summary`` has the columns mean_train_score, mean_test_score (both
    converted from negated MSE to RMSE) and params, sorted by ascending
    mean_test_score.
    """
    estimator = Pipeline(steps = [('scaler', scaler), ('model', model)], verbose = 1) if scaler else model
    search = GridSearchCV(
        estimator,
        param_grid = params,
        cv = 5,
        return_train_score = True,
        scoring = 'neg_mean_squared_error',
        verbose = 1,
    )
    search.fit(x, y)

    cv_table = pd.DataFrame(search.cv_results_)
    score_cols = ['mean_train_score', 'mean_test_score']
    # Scores are negated MSE; flip the sign and take the root to get RMSE.
    cv_table[score_cols] = np.sqrt(-1 * cv_table[score_cols])
    summary = cv_table[score_cols + ['params']].sort_values('mean_test_score', ascending = True)
    return search.best_estimator_, search.best_params_, summary

In [None]:
get_score(x_train, y_train, x_test, y_test, LinearRegression())

In [None]:
best_model(x_train, y_train)

In [None]:
get_score(x_train, y_train, x_test, y_test, RandomForestRegressor())

In [None]:
# Search grid for the random forest. `bootstrap` must be real booleans: the
# strings 'True' and 'False' are both truthy (so the two settings would be
# identical) and recent scikit-learn versions reject them outright.
params = {
    'n_estimators' : [100,200,300],
    'max_depth' : [None,8, 15, 20],
    'bootstrap' : [True, False],
}
clf, best_params, results = gridcv(x_train, y_train, RandomForestRegressor(), params)

In [None]:
# Plot train and test RMSE for each grid-search candidate. `results` is
# returned by gridcv sorted by ascending mean_test_score, so the x axis
# (0..n-1 from the reset index) is the candidate's rank, best first.
sns.lineplot(data = results, x = results.reset_index().index, y = 'mean_train_score')
sns.lineplot(data = results, x = results.reset_index().index, y = 'mean_test_score')
plt.title('RMSE for train and test')
# Legend labels follow the order in which the two lines were drawn above.
plt.legend(['train error', 'test error'])

In [None]:
results.head()

In [None]:
best_params

In [None]:
get_score(x_train, y_train, x_test, y_test, clf)

Let's try dropping the highly correlated features.

In [None]:
sns.heatmap(df.corr(), fmt = '.1f', annot = True)

In [None]:
# Remove the same columns from both splits so they stay aligned.
drop = ['bedrooms', 'security']
x_train, x_test = (part.drop(columns = drop) for part in (x_train, x_test))

In [None]:
get_score(x_train, y_train, x_test, y_test, clf)

The model performed worse after dropping these features.