# Sberbank Data exploration and Modelling

In [None]:
import pandas as pd 
import numpy as np
import math as m
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats
import statsmodels.api as sm


from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.model_selection import GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.metrics import mean_absolute_error, r2_score , mean_squared_error

#to randomly split data into train and test
from sklearn.model_selection import train_test_split
%matplotlib inline
seed=45

## Importing the train and test datasets 

In [None]:
# Read the competition train/test CSVs from the directory prefix held in `path`.
# NOTE(review): `path` is not assigned anywhere in the visible notebook; it has
# to exist (including its trailing separator) before this cell runs — confirm.
traindf = pd.read_csv(path + "train.csv")
testdf = pd.read_csv(path + "test.csv")

# (rows, columns) of each frame, train first.
print(traindf.shape, testdf.shape, sep="\n")

The test dataset will not be used in the course of this exercise; we will only use it later to submit our scores to the Kaggle challenge. It is imported now so that scores can be checked later.

### We will be trying to predict the price of houses using all the other variables in the dataset. 

# Data Exploration and Feature Creation

The dataset also has a timestamp component to it.Let's visualize the price variable.

In [None]:
# Distribution of the target, drawn on the implicit current figure.
sns.distplot(traindf["price_doc"], bins=100)

# Summary statistics printed without decimals, followed by the sample skewness.
print(traindf["price_doc"].describe().map(lambda value: format(value, '10.0f')))
print("\n\n\nPrice range skewness", stats.skew(traindf["price_doc"]))

# Prices sorted ascending and plotted against their rank to expose the tail.
sorted_prices = np.sort(traindf["price_doc"].values)
plt.figure(figsize=(8, 6))
plt.scatter(range(len(sorted_prices)), sorted_prices)
plt.xlabel('index', fontsize=12)
plt.ylabel('price', fontsize=12)
plt.show()

In [None]:
# Every second percentile (0th, 2nd, ..., 98th) of the sale price.
even_percentiles = np.arange(0, 100, 2)
list(np.percentile(traindf["price_doc"], even_percentiles))

We see that the price distribution is positively skewed with a long right tail: 98% of the prices are below 20 million. We remove these outliers later on.

## The timestamp variable might be important in viewing the whole problem. Let's look at it

In [None]:
print("NAs in Price Doc column", traindf["price_doc"].isnull().sum(), "\n")

# Same three diagnostics for each frame; the test header carries the blank
# separator line so the printed layout matches a cell written out by hand.
for header, frame in (("Train Data", traindf), ("\nTest Data", testdf)):
    print(header)
    print("min date:", frame["timestamp"].min())
    print("max date:", frame["timestamp"].max())
    print("number of nulls", frame["timestamp"].isna().value_counts())

print("\nTime variable type before conversion: ", traindf['timestamp'].dtype)

The test and the train datasets are split by time series. We have to learn from the past and predict the future

In [None]:
# Parse the timestamp column and derive calendar features on both frames.
# Columns are added to the existing frames, in the order timestamp -> year -> month.
for frame in (traindf, testdf):
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])
    frame["year"] = frame["timestamp"].dt.year
    frame["month"] = frame["timestamp"].dt.month

print("Time variable type : ", traindf['timestamp'].dtype)

Visualizing the timestamp variable across all dimensions like month, date and year to understand its distribution

In [None]:
# Mean sale price per year, rounded to whole units.
# The rounding is done numerically; formatting each value to a padded string
# and parsing it back to int gives the same number at the cost of a detour
# through text.
mean = (
    traindf.groupby("year")["price_doc"]
    .mean()
    .round()
    .astype(int)
    .reset_index()  # columns: year, price_doc
)
sns.barplot(x="year", y="price_doc", data=mean, color="grey")

In [None]:
# Mean sale price per calendar month (all years pooled), rounded to whole units.
# Rounding is numeric rather than a format-to-string / parse-to-int round trip.
mean = (
    traindf.groupby("month")["price_doc"]
    .mean()
    .round()
    .astype(int)
    .reset_index()  # columns: month, price_doc
)
plt.figure(figsize=(30, 10))
sns.barplot(x="month", y="price_doc", data=mean, color="grey")
plt.show()

In [None]:
# Mean sale price for every (year, month) pair, rounded to whole units.
mean = (
    traindf.groupby(["year", "month"])["price_doc"]
    .mean()
    .round()
    .astype(int)
    .reset_index()  # columns: year, month, price_doc
)

# One sortable "YYYY-MM" label per bar. Assigning the MultiIndex to a column
# called "year" would store (year, month) tuples there and use those tuples as
# the bar categories.
mean["year_month"] = (
    mean["year"].astype(str) + "-" + mean["month"].astype(str).str.zfill(2)
)

plt.figure(figsize=(40, 20))
sns.barplot(x="year_month", y="price_doc", data=mean, color="grey")
plt.xticks(rotation=90)  # keep the per-month labels readable
plt.xlabel('year-month', fontsize=50)
plt.ylabel('price', fontsize=50)
plt.show()

It seems like the average price is steadily increasing every year from the first chart. Specific months don't make a difference I guess, but there is a
steady growth in price after the first 18 months..So a combination of year and month might be useful inputs

In [None]:
# Current dimensions of both frames (after the year/month columns were added).
print(f"Train Shape {traindf.shape}\nTest Shape {testdf.shape}")

Okay, the dataset has 292 variables. We might not be able to visualize all variables, but we need ways to eliminate variables that do not significantly influence price. We could start by looking at 

1) Variables with high level of null values

2) Correlation and multicollinearity, to remove redundant variables that duplicate information, using VIF

3) Significance of variables in predicting the outcome ; regression table output - but this might not be useful when we implement decision tree models

4) I read about Box Cox transformation to convert non normal data in normal form at https://www.statisticshowto.datasciencecentral.com/box-cox-transformation/ . Since regression is something we will be using a lot here, let us see if it is needed.

Let's see how many of the above we can implement

## 1) Eliminate variables with high levels of null values

In [None]:
# Share of missing values per column of the training frame.
# Note: the column is called "missing_count" but holds a fraction in [0, 1].
nulltable = (
    traindf.isnull().sum()
    .div(traindf.shape[0])
    .rename_axis("column_name")
    .reset_index(name="missing_count")
)

# Columns that are fully populated.
non_nullcolumns = nulltable.loc[nulltable["missing_count"].eq(0)]

# The 20 sparsest columns.
nulltable.sort_values(by="missing_count", ascending=False).head(20)

Highest degree of missing values is the hospital_beds_raion column with 47% . Let me remove variables with more than 10% missing values and impute -1 into the others.

In [None]:
# Columns whose missing share in the training data exceeds 10%.
null_columns_remove = nulltable.loc[
    nulltable["missing_count"] > .10, "column_name"
].tolist()

# Shapes before and after dropping those columns from both frames.
print(traindf.shape, testdf.shape, sep="\n")
traindf1 = traindf.drop(columns=null_columns_remove)
testdf1 = testdf.drop(columns=null_columns_remove)
print(traindf1.shape, testdf1.shape, sep="\n")

There are now 259 variables in the data after removing variables with >10% null values

In [None]:
# One row per remaining column with its dtype, then a count of columns per dtype.
datatype = (
    traindf1.dtypes
    .rename_axis("column_name")
    .reset_index(name="datatype")
)
datatype["datatype"].value_counts()

In [None]:
# Join the missing-share and dtype tables; the inner join keeps only the
# columns that survived the sparse-column drop.
table = nulltable.merge(datatype, on="column_name", how="inner")
table["Null"] = table["missing_count"].eq(0).map({True: "No", False: "Yes"})
table["type"] = np.where(table["datatype"] == "object", "Category", "Number")

n_complete = non_nullcolumns.shape[0]
print("There are ", n_complete, "columns with no null values and ",
      traindf1.shape[1] - n_complete, "columns with null values that have been filled with -1 \n")

# Category/Number split against has-nulls/no-nulls.
print(pd.crosstab(table["type"], table["Null"]))

### Imputing null values with -1

In [None]:
# Replace every remaining missing value with the sentinel -1, in every column
# (no column subset is given, so non-numeric columns are filled too).
traindf1 = traindf1.fillna(-1)
testdf1 = testdf1.fillna(-1)

## Encoding Categorical Variables

Below I will be looking at all the categorical variables to see how big the dataset will become once I encode it.

In [None]:
# Level frequencies of each object-typed column, to judge how wide the frame
# gets once these columns are one-hot encoded.
categorical_columns = table.loc[table["datatype"] == "object", "column_name"]
for column in categorical_columns:
    print("\n", column)
    print(traindf1[column].value_counts())

We will be dropping sub_area for now. While it might be useful, it has too many categories and might require a lot of time to weed through once encoded. Other columns have relatively lower categories and will be easy to look at.

In [None]:
# sub_area is dropped from both frames before encoding.
traindf2 = traindf1.drop(columns="sub_area")
testdf2 = testdf1.drop(columns="sub_area")
print("After removing sub_area, shape reduced from ", traindf1.shape, "to", traindf2.shape)

In [None]:
# One-hot encode the remaining categorical columns of both frames.
traindf2 = pd.get_dummies(traindf2)
testdf2 = pd.get_dummies(testdf2)

# The two frames are encoded independently, so a level present in only one of
# them yields a dummy column the other lacks. Force the test frame onto the
# training feature columns (target excluded): dummies missing from test become
# all-zero columns, and test-only dummies are dropped.
train_feature_columns = [col for col in traindf2.columns if col != "price_doc"]
testdf2 = testdf2.reindex(columns=train_feature_columns, fill_value=0)

print(traindf2.dtypes.value_counts())

So all categorical columns have now been converted into numerical columns and the data is more or less fit for any algorithm to parse

## 2) Checking multicollinearity with VIF to remove redundant variables and reduce variance in prediction

How does multicollinearity affect the coeffcients in a regression problem and the final predictions?Why should you solve for multicollinearity?

So in a multiple linear regression problem, we model y = a·x1 + b·x2 + c.

y- dependant

x1 and x2 being the independant variables that are correlated

c being the intercept or constant.

When you say y = a·x1 + b·x2 + c, you mean that for a unit increase in x1, y increases by a, all else being constant. This should be a true relationship between y and x1 irrespective of which other variables we use. But if x2 is correlated with x1, the coefficient a ends up depending on which other predictors are included in the model. Slight multicollinearity is tolerable because the coefficients are not strongly influenced, but high multicollinearity makes the coefficients erratic and unreliable: the estimates (and their interpretation) can swing widely when predictors are added or removed, which shouldn't be the case.

Hence, removing colinear variable is a need for a stable regression model.
 VIF seems to be the goto way of detecting multicollinearity. 
 VIF is calculated as 1/(1-R^2), where R^2 comes from regressing each variable on all the others. So R^2 tells you how much of the variation in a variable is explained by the other variables; the higher the R^2, the stronger the relationship between the independent variables, and therefore the higher the value of 1/(1-R^2). A typical cutoff for removing variables
 on account of multicollinearity is 5: 1/(1-0.8) gives 5, which means that if 80% of the variation in a variable can be explained by the other variables, it should be removed and not fed into the model.

In [None]:
# Candidate predictors: everything except the identifier, the raw timestamp and
# the target, plus the constant column returned by add_constant.
X = traindf2.drop(['id', 'timestamp', 'price_doc'], axis=1)
X = add_constant(X)

# Build the numeric design matrix once. Asking for X.values inside the loop
# rebuilds the array on every iteration, and a frame that mixes bool dummies
# with numeric columns can come back as object dtype; the float cast guards
# against that.
design_matrix = X.values.astype(float)

# One VIF per column: each column is regressed on all the others.
viftab = pd.Series(
    [variance_inflation_factor(design_matrix, col_idx)
     for col_idx in range(design_matrix.shape[1])],
    index=X.columns,
)

In [None]:
# Tidy two-column table of the VIF scores, kept in the original column order.
vifdf = pd.DataFrame(viftab).reset_index()
vifdf.columns = ["Name", "VIF"]

# Show the most collinear variables. A bare sort_values(...) in the middle of
# a cell is neither displayed nor stored, so the sorted view is printed.
print(vifdf.sort_values("VIF", ascending=False).head(20))

# Variables below the VIF cutoff of 10 (inf and NaN scores fail the comparison
# and are excluded). The constant is filtered by name: list.remove('const')
# raises ValueError whenever 'const' itself does not pass the cutoff.
# The hard-coded assignment at the end of this cell replaces this computed list.
noncollinearvar = [
    name for name in vifdf.loc[vifdf["VIF"] < 10, "Name"] if name != 'const'
]
print(noncollinearvar)
vifdf.dtypes

noncollinearvar = ['full_sq', 'floor', 'green_zone_part', 'indust_part', 'school_education_centers_top_20_raion', 'healthcare_centers_raion', 'university_top_20_raion', 'ID_metro', 'green_zone_km', 'industrial_km', 'cemetery_km', 'ID_railroad_station_walk', 'ID_railroad_station_avto', 'water_km', 'big_road1_km', 'ID_big_road1', 'ID_big_road2', 'ID_bus_terminal', 'church_synagogue_km', 'catering_km', 'green_part_500', 'prom_part_500', 'office_sqm_500', 'trc_count_500', 'trc_sqm_500', 'mosque_count_500', 'leisure_count_500', 'sport_count_500', 'market_count_500', 'trc_sqm_1000', 'mosque_count_1000', 'sport_count_1000', 'market_count_1000', 'trc_sqm_1500', 'mosque_count_1500', 'market_count_1500', 'trc_sqm_2000', 'mosque_count_2000', 'market_count_2000', 'mosque_count_3000', 'mosque_count_5000', 'year', 'month']

In [None]:
# Hard-coded feature list used by every later cell (43 column names).
# NOTE(review): presumably a pasted copy of an earlier VIF < 10 result, so the
# expensive VIF cell does not have to be re-run — confirm it is still current.
noncollinearvar = ['full_sq', 'floor', 'green_zone_part', 'indust_part', 'school_education_centers_top_20_raion', 'healthcare_centers_raion', 'university_top_20_raion', 'ID_metro', 'green_zone_km', 'industrial_km', 'cemetery_km', 'ID_railroad_station_walk', 'ID_railroad_station_avto', 'water_km', 'big_road1_km', 'ID_big_road1', 'ID_big_road2', 'ID_bus_terminal', 'church_synagogue_km', 'catering_km', 'green_part_500', 'prom_part_500', 'office_sqm_500', 'trc_count_500', 'trc_sqm_500', 'mosque_count_500', 'leisure_count_500', 'sport_count_500', 'market_count_500', 'trc_sqm_1000', 'mosque_count_1000', 'sport_count_1000', 'market_count_1000', 'trc_sqm_1500', 'mosque_count_1500', 'market_count_1500', 'trc_sqm_2000', 'mosque_count_2000', 'market_count_2000', 'mosque_count_3000', 'mosque_count_5000', 'year', 'month']

Note: There are some infinity values in the dataset.Need to be careful as sometimes the column is a string and other times it is float

In the above code, I am essentially trying to remove variables with VIF over 10.

The following are the variables with a VIF value of less than 10.
['full_sq', 'floor', 'green_zone_part', 'indust_part', 'school_education_centers_top_20_raion', 'healthcare_centers_raion', 'university_top_20_raion', 'ID_metro', 'green_zone_km', 'industrial_km', 'cemetery_km', 'ID_railroad_station_walk', 'ID_railroad_station_avto', 'water_km', 'big_road1_km', 'ID_big_road1', 'ID_big_road2', 'ID_bus_terminal', 'church_synagogue_km', 'catering_km', 'green_part_500', 'prom_part_500', 'office_sqm_500', 'trc_count_500', 'trc_sqm_500', 'mosque_count_500', 'leisure_count_500', 'sport_count_500', 'market_count_500', 'trc_sqm_1000', 'mosque_count_1000', 'sport_count_1000', 'market_count_1000', 'trc_sqm_1500', 'mosque_count_1500', 'market_count_1500', 'trc_sqm_2000', 'mosque_count_2000', 'market_count_2000', 'mosque_count_3000', 'mosque_count_5000', 'year', 'month']

Is correlation/mulitcollinearity only important for regression problems?

https://datascience.stackexchange.com/questions/31402/multicollinearity-in-decision-tree

Found answers in the above link. Since decision trees anyway split based on one variable at a time and then looks at importance of the next based on gini or entropy, it is said that the correlated variables will anyway be ignored for non correlated ones, because if a correlated variable contains similar information, a better split is not achieved and there will be less info gain from that variable.

# Feature importance check

In the below code, I am checking feature importance just to understand how a decision tree views these variables 

In [None]:
# Fit a gradient-boosted model on the low-VIF features purely to inspect
# feature importances (this uses the full frame, before outlier removal).
X = traindf2[noncollinearvar]
Y = traindf2['price_doc']

# max_features=None means "use every feature", which is what the legacy
# "auto" option meant for this regressor; recent scikit-learn releases reject
# "auto", while None is accepted by old and new versions alike.
param_grid = [{"max_depth": [5, 8, 10, 12, 15], "max_features": ["sqrt", "log2", None]}]

# random_state makes the search and the chosen model reproducible.
grid = GridSearchCV(GradientBoostingRegressor(random_state=seed), param_grid, cv=3, n_jobs=-1)
grid.fit(X, Y)

# GridSearchCV (refit=True by default) has already refit the best
# configuration on all of X, Y, so fitting best_estimator_ again is redundant.
gb = grid.best_estimator_
gb

In [None]:
# Horizontal bar chart of the 30 largest importances from the fitted model.
plt.figure(figsize=(10, 8))
initial_importances = pd.Series(gb.feature_importances_, index=X.columns)
initial_importances.nlargest(30).plot.barh()

#### The above table gives the order of important features based on the Decision Tree. It seems sensible. For instance, any person who starts looking for a house would first ask how big it is for the price. Followed by locality and amenities which is explained by variables like metro or not, school facility, water availability,religious building availability etc...

In [None]:
# Top-20 features by importance as a two-column table, plus their names as a list.
importantfeatures = (
    pd.Series(gb.feature_importances_, index=X.columns)
    .nlargest(20)
    .rename_axis("columnname")
    .reset_index(name="importance")
)
columnlist = importantfeatures["columnname"].tolist()
columnlist

Note to self: There was a strong overlap between the variables remaining after checking for multicollinearity and the variables ranked as most important by the gradient boosted model on the original dataset. This probably means that the decision trees behind gradient boosting prioritize variables that provide new information for a better split over variables that carry redundant information (and hence are collinear).

# Checking Correlation

In [None]:
# Pearson correlation (with its p-value) between each retained feature and the
# target; results are printed in the order of noncollinearvar.
target_prices = traindf2["price_doc"]
for feature in noncollinearvar:
    print(stats.pearsonr(traindf2[feature], target_prices))

While the correlation of most columns is weak, they all seem significant: the printed p-values are all very small (on the order of e-06 or below).

# Let's visualize the relationship between all these variables and price

In [None]:
# Plot on the trimmed data (full_sq < 175 and price < 20 million) so a few
# extreme points do not squash the axes.
plotdf = traindf2[(traindf2["full_sq"] < 175) & (traindf2["price_doc"] < 20000000)]

# One scatter plot (no regression line) per retained feature against price.
# x and y are passed by keyword: current seaborn releases reject them as
# positional arguments, and the keyword form also works on older releases.
for feature in noncollinearvar:
    sns.lmplot(x=feature, y="price_doc", data=plotdf, fit_reg=False)

# Outlier removal on specific variables based on the above charts and prior analysis 

### 1) Removing outliers in the price_doc and full_sq variables. About 2% of the data has a price above 20 million, which brings a large variance into the data. We do not want this 2% to influence the model scores and therefore the final results. Hence, this data is removed.

### 2)We are also removing full_sq >175 as this seems like the most important variable and outliers in this variable again might affect final predictions.

In [None]:
# Keep rows with full_sq below 175 and price below 20 million.
traindf3 = traindf2.loc[
    traindf2["full_sq"].lt(175) & traindf2["price_doc"].lt(20000000)
]

Extracting only the variables with low VIF scores as features

In [None]:
# Restrict both frames to the retained feature list; the target comes from the
# outlier-trimmed training frame.
featuredata = traindf3.loc[:, noncollinearvar]
testdf3 = testdf2.loc[:, noncollinearvar]
Y = traindf3["price_doc"]
featuredata.shape

# Data Sampling. Test Train split creation from the train dataset given

In [None]:
# Random 70/30 split of the training data, seeded for reproducibility.
xtrain, xtest, ytrain, ytest = train_test_split(
    featuredata, Y, test_size=0.3, random_state=seed
)

# Feature and target shapes for the train part, then the hold-out part.
for features, target in ((xtrain, ytrain), (xtest, ytest)):
    print(features.shape, target.shape)

# Model Evaluation Metrics:
Defining metrics that the model will be evaluated on:

We are evaluating the model on three metrics:

Root mean Square Log error:

For price predictions, RMSLE is used when you do not want large absolute differences on expensive properties to dominate the score. When both the actual and the predicted values are very large, a relatively small (percentage) difference still produces a huge squared error. Taking the log of both values first makes the metric measure relative rather than absolute error, so large prices are not penalized disproportionately.



Root mean Square error:

We are also looking at the residual errors to compare models.



R2 score:

R2 can be calculated in two ways. SSR/SST or 1 - SSE/SST.Both 
these options should yield that same value, provided the regression line 
predicts better than mean. But in this case, the prediction is so bad that 
SSE is higher than SST, meaning the regression line residuals are greater than
difference between actual points and their mean.

I am assuming the r2score function uses 1- SSE/SST, and therefore the R2 is -ve.

In [None]:
def RMSLE(y, y0):
    """Root mean squared logarithmic error between two sets of values.

    Computes sqrt(mean((log1p(y) - log1p(y0)) ** 2)). The expression is
    symmetric in its arguments, so the order of actuals and predictions
    does not matter.

    Values below 0 are clipped to 0 before the logarithm is taken: log1p is
    undefined below -1, so a single negative prediction would otherwise
    turn the whole score into NaN. Non-negative inputs are unaffected.

    Parameters
    ----------
    y, y0 : array-like of numbers (e.g. numpy array or pandas Series)
        The two sets of values to compare, of equal length.

    Returns
    -------
    float
        The RMSLE of the clipped values.
    """
    y = np.maximum(y, 0)
    y0 = np.maximum(y0, 0)
    return np.sqrt(np.mean(np.square(np.log1p(y) - np.log1p(y0))))
    
def RMSE(y, y0):
    """Root mean squared error: sqrt(mean((y - y0) ** 2)).

    `y` and `y0` must support element-wise subtraction (numpy arrays or
    pandas Series of equal length).
    """
    residuals = y - y0
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)
    
def r2(y,y0):
    """Coefficient of determination via sklearn's r2_score.

    `y` is passed as the true values and `y0` as the predictions.
    """
    return r2_score(y_true=y, y_pred=y0)
    

## A baseline OLS model

In [None]:
# Ordinary least squares of price on the retained features.
# NOTE(review): xtrain is passed as-is (no add_constant call), so presumably
# this model is fit without an intercept — confirm against the summary table.
model = sm.OLS(endog=ytrain, exog=xtrain)
results = model.fit()
print(results.summary())

# Inference:
    
    Since the variables are on different scales, the coefficients are not directly comparable to each other. But , looking at the most important variables in the model from the feature importance set, we infer  that :
    
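    For a 1 unit increase in full sq, the price increases by about 99,200 units (9.92e+04, i.e. 9.92 × 10^4), all else being constant
    For a 1 unit increase in floor, the price increases by about 45,000 units (4.50e+04, i.e. 4.50 × 10^4), all else being constant
    For a 1 unit increase in full sq, the price increases by about 117,600 units (1.176e+05, i.e. 1.176 × 10^5), all else being constant (note: this variable name repeats the first line; check the summary table for the intended variable)
    For a 1 unit increase in big road, the price decreases by about 158,400 units (-1.584e+05, i.e. -1.584 × 10^5), all else being constant.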
   
It is interesting to note that the larger the big-road variable, the lower the price of the house, which would not be your initial assumption.

Now the reason for removing highly correlated variables and checking for multicollinearity is that, now these coefficients are reliable and will not vary much based on other variables being added, as there is no overlap of information in those variables. This helps with a clean inference

In [None]:
# Correlation (and p-value) between big_road1_km and price on the trimmed data.
road_price_corr = stats.pearsonr(traindf3["big_road1_km"], traindf3["price_doc"])
print(road_price_corr)

Whilst not strong, there is indeed a negative correlation between the two variables

In [None]:
# Hold-out predictions from the fitted OLS model; preview the first five.
predictions = results.predict(exog=xtest)
predictions.head(5)

In [None]:
# Score the OLS predictions with the three notebook metrics, in the order
# RMSLE, RMSE, R2.
LR_RMSLE, LR_RMSE, LR_R2 = (
    metric(ytest, predictions) for metric in (RMSLE, RMSE, r2)
)

print("RMSLE :", LR_RMSLE, "\nRMSE:", LR_RMSE, "\nR2:", LR_R2)

Trying out ridge regression and Lasso regression here to reduce effect of coefficients of any one predictor to better generalize the model and not have it leaning on a single predictor

# Ridge Regression

In [None]:
# Ridge regression (alpha=0.1), fit on the training split and used to predict
# the hold-out split. fit() returns the estimator itself, so it is chained.
# NOTE(review): the `normalize` argument was removed in newer scikit-learn
# releases — confirm the installed version still accepts it.
ridgereg = Ridge(alpha=.1, normalize=True, max_iter=200).fit(xtrain, ytrain)
ridgereg_pred = ridgereg.predict(xtest)

In [None]:
# Score the ridge predictions with the three notebook metrics, in the order
# RMSLE, RMSE, R2.
RR_RMSLE, RR_RMSE, RR_R2 = (
    metric(ytest, ridgereg_pred) for metric in (RMSLE, RMSE, r2)
)

print("RMSLE :", RR_RMSLE, "\nRMSE:", RR_RMSE, "\nR2:", RR_R2)

# Lasso Regression

In [None]:
# Lasso regression (alpha=0.1), fit on the training split and used to predict
# the hold-out split. fit() returns the estimator itself, so it is chained.
# NOTE(review): the `normalize` argument was removed in newer scikit-learn
# releases — confirm the installed version still accepts it.
lassoreg = Lasso(alpha=.1, normalize=True, max_iter=200).fit(xtrain, ytrain)
lassoreg_pred = lassoreg.predict(xtest)

In [None]:
# Score the lasso predictions with the three notebook metrics, in the order
# RMSLE, RMSE, R2.
LAR_RMSLE, LAR_RMSE, LAR_R2 = (
    metric(ytest, lassoreg_pred) for metric in (RMSLE, RMSE, r2)
)

print("RMSLE :", LAR_RMSLE, "\nRMSE:", LAR_RMSE, "\nR2:", LAR_R2)

Most variables don't seem to have a linear relationship with price_doc. We
either have to transform them to check for linearity or use a non-parametric method. Let's try decision trees, as they do not make assumptions about the distribution of the data.

# Random Forest

In [None]:
# Grid search over tree depth and the number of features tried per split.
# max_features=None means "use every feature", which is what the legacy
# "auto" option meant for this regressor; recent scikit-learn releases reject
# "auto", while None is accepted by old and new versions alike.
param_grid = [{"max_depth": [5, 8, 10], "max_features": ["sqrt", "log2", None]}]

# random_state makes the search and the chosen forest reproducible.
grid = GridSearchCV(RandomForestRegressor(random_state=seed), param_grid, cv=3, n_jobs=-1)
grid.fit(xtrain, ytrain)

# GridSearchCV (refit=True by default) has already refit the best
# configuration on the whole training split, so a second fit is redundant.
rf = grid.best_estimator_
rf

In [None]:
# Winning hyperparameter combination from the random-forest grid search.
best_rf_params = grid.best_params_
print("The Best hyperparamters of the model are", best_rf_params)

In [None]:
# Hold-out predictions from the tuned forest, scored with the three notebook
# metrics in the order RMSLE, RMSE, R2.
rf_pred = rf.predict(xtest)
rf_RMSLE, rf_RMSE, rf_R2 = (
    metric(ytest, rf_pred) for metric in (RMSLE, RMSE, r2)
)

print("RMSLE :", rf_RMSLE, "\nRMSE:", rf_RMSE, "\nR2:", rf_R2)

### Feature Importance chart for Random Forest

In [None]:
# Horizontal bar chart of the 30 largest random-forest feature importances.
plt.figure(figsize=(10, 8))
rf_importances = pd.Series(rf.feature_importances_, index=xtrain.columns)
rf_importances.nlargest(30).plot.barh()

# Gradient Boosting

Using Grid Search to tune hyperparameters of the model

In [None]:
# Grid search over tree depth, features tried per split and number of stages.
# max_features=None means "use every feature", which is what the legacy
# "auto" option meant for this regressor; recent scikit-learn releases reject
# "auto", while None is accepted by old and new versions alike.
param_grid = [{"max_depth": [5, 8, 10],
               "max_features": ["sqrt", "log2", None],
               "n_estimators": [100, 200, 300]}]

# random_state makes the search and the chosen model reproducible.
grid = GridSearchCV(GradientBoostingRegressor(random_state=seed), param_grid, cv=3, n_jobs=-1)
grid.fit(xtrain, ytrain)

# GridSearchCV (refit=True by default) has already refit the best
# configuration on the whole training split, so a second fit is redundant.
gb = grid.best_estimator_
gb

In [None]:
# Winning hyperparameter combination from the gradient-boosting grid search.
best_gb_params = grid.best_params_
print("The Best hyperparamters of the model are", best_gb_params)

In [None]:
# Hold-out predictions from the tuned gradient-boosting model, scored with the
# three notebook metrics in the order RMSLE, RMSE, R2.
gb_pred = gb.predict(xtest)
gb_RMSLE, gb_RMSE, gb_R2 = (
    metric(ytest, gb_pred) for metric in (RMSLE, RMSE, r2)
)

print("RMSLE :", gb_RMSLE, "\nRMSE:", gb_RMSE, "\nR2:", gb_R2)
# Predicting on the Kaggle test features (testdf3) is intentionally not done here.

### Feature importance chart for GB

In [None]:
# Horizontal bar chart of the 30 largest gradient-boosting feature importances.
plt.figure(figsize=(10, 8))
gb_importances = pd.Series(gb.feature_importances_, index=xtrain.columns)
gb_importances.nlargest(30).plot.barh()

# Model Comparison based on the evaluation metrics

In [None]:
# One record per model with its three hold-out scores, in the order the
# models were fitted.
model_compare = [
    {"Name": model_name, "RMSLE": rmsle_score, "RMSE": rmse_score, "R2": r2_value}
    for model_name, rmsle_score, rmse_score, r2_value in (
        ('Linear Reg', LR_RMSLE, LR_RMSE, LR_R2),
        ('Ridge Reg', RR_RMSLE, RR_RMSE, RR_R2),
        ('Lasso Reg', LAR_RMSLE, LAR_RMSE, LAR_R2),
        ('Random Forest', rf_RMSLE, rf_RMSE, rf_R2),
        ('Gradient Boosting', gb_RMSLE, gb_RMSE, gb_R2),
    )
]

# Table indexed by model name with columns RMSLE, RMSE, R2.
model_comparedf = pd.DataFrame(model_compare).set_index('Name')
print(model_comparedf)

## R2 Score Comparison

In [None]:
# R2 per model as a line over the five model names; legend anchored at the
# upper-right corner of the axes.
r2_axes = model_comparedf[["R2"]].plot(figsize=(7, 3), xticks=range(0, 5))
r2_axes.legend(title='Name', bbox_to_anchor=(1, 1))

## RMSLE Comparison

In [None]:
# RMSLE per model as a line over the five model names; legend anchored at the
# upper-right corner of the axes.
rmsle_axes = model_comparedf[["RMSLE"]].plot(figsize=(7, 3), xticks=range(0, 5))
rmsle_axes.legend(title='Name', bbox_to_anchor=(1, 1))

## RMSE Comparison

In [None]:
# RMSE per model as a line over the five model names; legend anchored at the
# upper-right corner of the axes.
rmse_axes = model_comparedf[["RMSE"]].plot(figsize=(7, 3), xticks=range(0, 5))
rmse_axes.legend(title='Name', bbox_to_anchor=(1, 1))

### Clearly, the Gradient Boosting Model has the highest R2 score and the lowest RMSE and RMSLE scores of all the models and outperforms all other models. Hence, the final prediction would be the ones coming out of the Gradient Boosted Model.