#### This kernel uses the dataset from the Zillow Prize: Zillow’s Home Value Prediction competition and is copied from the kernel 'Zillow EDA On Missing Values & Multicollinearity' written by Vivek Srinivasan.

#### Introduction to 'Zillow EDA On Missing Values & Multicollinearity' : [URL](https://www.kaggle.com/viveksrinivasan/zillow-eda-on-missing-values-multicollinearity)

#### Thanks for sharing the kernel, Vivek Srinivasan.

### The notebook covers the following topics

- Missing Value Analysis
- Correlation Analysis
- Top Contributing Features (Through XGBoost)
- Correlation Analysis 
- Multicollinearity Analysis (reference info = [URL](https://ko.wikipedia.org/wiki/%EB%8B%A4%EC%A4%91%EA%B3%B5%EC%84%A0%EC%84%B1))
- Univariate Analysis (reference info = [URL](https://mansoostat.tistory.com/23))
- Bivariate Analysis

### Global Imports

In [None]:
import pylab
import calendar
import numpy as np
import pandas as pd
import seaborn as sn
from scipy import stats
import missingno as msno
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import kendalltau
import warnings
# Use matplotlib's "ggplot" style sheet for the figures drawn below.
matplotlib.style.use('ggplot')
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

### Reading in Dataset

In [None]:
# Transaction records; `transactiondate` is parsed into datetimes on load.
train = pd.read_csv(
    '../input/zillow-prize-1/train_2016_v2.csv',
    parse_dates=['transactiondate'],
)
# Property records (joined to the transactions on `parcelid` further down).
properties = pd.read_csv('../input/zillow-prize-1/properties_2016.csv')

### Shape of the dataset

In [None]:
# Report (rows, columns) for both raw tables.
for caption, frame in (('shape of train :', train),
                       ('shape of properties :', properties)):
    print(caption, frame.shape)

### Let's Merge train and properties to facilitate EDA

In [None]:
# Left join: keep every transaction row and attach the matching property row.
merged = train.merge(properties, how="left", on="parcelid")

### First Few rows of data

In [None]:
# Preview: first 3 records laid out as columns, limited to the first 10 fields.
merged.head(3).T.head(10)

### Missing Value Analysis

In [None]:
# Names of the columns that contain at least one null value.
hasNulls = merged.isnull().any()
missingValueColumns = merged.columns[hasNulls].tolist()

# Bar chart of how complete each of those columns is.
msno.bar(
    merged[missingValueColumns],
    figsize=(20, 8),
    color="#34495e",
    fontsize=12,
    labels=True,
)

In [None]:
# Nullity matrix of the incomplete columns, with the per-row sparkline on the right.
msno.matrix(
    merged[missingValueColumns],
    width_ratios=(10, 1),
    figsize=(20, 8),
    fontsize=12,
    sparkline=True,
    labels=True,
)

In [None]:
 # Nullity correlation heatmap across the columns that have missing values.
 msno.heatmap(
     merged[missingValueColumns],
     figsize=(20, 20),
 )

### Top Features Selection

In [None]:
from sklearn import model_selection, preprocessing
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Replace every missing value with a sentinel so all columns can be fed to XGBoost.
mergedFilterd = merged.fillna(-999)

# Integer-encode the object-dtype columns.
for f in mergedFilterd.columns:
    if mergedFilterd[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        # The values are handed over as a plain list (as in the original code);
        # presumably this lets NumPy coerce the mix of strings and the -999
        # sentinel to one dtype before encoding - TODO confirm.
        mergedFilterd[f] = lbl.fit_transform(list(mergedFilterd[f].values))

# Target and feature matrix; the id, the date and the target itself are not features.
train_y = mergedFilterd.logerror.values
train_x = mergedFilterd.drop(["parcelid", "transactiondate", "logerror"], axis=1)

xgb_params = {
    'eta': 0.05,
    'max_depth': 8,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    # 'reg:linear' is the deprecated alias of the squared-error objective.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    # `verbosity` supersedes the old `silent` flag; 0 means no training messages.
    'verbosity': 0,
}

# Feature names are passed as a plain list so importances can be reported by column name.
dtrain = xgb.DMatrix(train_x, train_y, feature_names=train_x.columns.tolist())
# A short run (10 rounds) is enough here: the model is only used to rank features.
model = xgb.train(xgb_params, dtrain, num_boost_round=10)

In [None]:
# Importance scores reported by the trained booster, keyed by feature name.
featureImportance = model.get_fscore()

# Two-column table (feature, score), ordered from most to least important.
features = pd.DataFrame(
    list(featureImportance.items()),
    columns=['features', 'importance'],
)
features = features.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the 15 highest-scoring features.
fig, ax = plt.subplots(figsize=(20, 10))
plt.xticks(rotation=90)
sn.barplot(
    x="importance",
    y="features",
    data=features.head(15),
    orient="h",
    color="#34495e",
    ax=ax,
)

### Correlation Analysis

In [None]:
# The 20 most important features according to the XGBoost ranking.
topfeatures = features['features'].tolist()[:20]

# Correlate only numeric/boolean columns: some top features are object-dtype
# in `merged` (they were label-encoded only in the model's copy), and recent
# pandas raises on non-numeric columns instead of silently skipping them.
corrmatt = merged[topfeatures].select_dtypes(include=['number', 'bool']).corr()

# Explicit boolean mask: True hides a cell. Hide the strict upper triangle so
# the lower triangle and the diagonal stay visible. (Building the mask from the
# correlation values themselves lets zero-valued cells leak through.)
mask = np.triu(np.ones(corrmatt.shape, dtype=bool), k=1)

fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sn.heatmap(corrmatt, mask=mask, vmax=0.8, square=True, ax=ax)

### Multicollinearity Analysis

In [None]:
# url : https://ysyblog.tistory.com/122
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')

def calculate_vif_(x):
    """Compute the variance inflation factor (VIF) of every column of ``x``.

    Args:
        x: DataFrame whose columns are the explanatory variables. Its
            ``.values`` array is passed to ``variance_inflation_factor`` as
            ``exog``, so it must be fully numeric with no missing values.

    Returns:
        dict mapping each column name to the VIF computed for that column's
        position. Duplicate column names collapse to a single key.
    """
    # NOTE(review): the matrix is passed as-is, without an added constant
    # column - confirm that this is the intended VIF definition.
    design = x.values  # materialise once instead of once per column
    return {
        column: variance_inflation_factor(exog=design, exog_idx=position)
        for position, column in enumerate(x.columns)
    }

# Candidate variables: every non-object column except the id, the date and the target.
skipped = ("parcelid", "transactiondate", "logerror")
numericalcol = [
    name for name in merged.columns
    if merged[name].dtype != 'object' and name not in skipped
]

# VIF needs a complete matrix, so missing values are replaced by the -999 sentinel.
mergedfiltered = merged[numericalcol].fillna(-999)
vifdict = calculate_vif_(mergedfiltered)

# Table of (variable, score), highest score first.
vifdf = pd.DataFrame(list(vifdict.items()), columns=['variables', 'vifscore'])
vifdf = vifdf.sort_values(by='vifscore', ascending=False)

# Split at the VIF threshold of 5.
scores = vifdf['vifscore']
validvariables = vifdf[scores <= 5]
variableswithmc = vifdf[scores > 5]

# Left: variables at or below the threshold. Right: the five highest scores above it.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 8))
sn.barplot(x='vifscore', y='variables', data=validvariables, orient='h', ax=ax1)
sn.barplot(
    x="vifscore",
    y="variables",
    data=variableswithmc.head(5),
    orient="h",
    color="#34495e",
    ax=ax2,
)
ax1.set(xlabel='VIF Scores', ylabel='Features',title="Valid Variables Without Multicollinearity")
ax2.set(xlabel='VIF Scores', ylabel='Features',title="Variables Which Exhibit Multicollinearity")

### Univariate Analysis
The dependent variable `logerror` follows a nice normal distribution.

In [None]:
# Winsorize the target at its 1st and 99th percentiles so a few extreme
# values do not stretch the histogram.
llimit, ulimit = np.percentile(merged['logerror'].values, [1, 99])
# Assign the clipped column back in one step. The previous chained
# `merged['logerror'].iloc[mask] = ...` writes to an intermediate object and
# is not guaranteed to update `merged`.
merged['logerror'] = merged['logerror'].clip(lower=llimit, upper=ulimit)

fig, ax = plt.subplots()
fig.set_size_inches(20, 5)
sn.distplot(merged.logerror.values, bins=50, kde=False, ax=ax)
# The y axis of this histogram is a count of rows, not a VIF score.
ax.set(xlabel='logerror', ylabel='count', title='distribution of dependent variable')

### Bivariate Analysis

In [None]:
# Split the (already parsed) transaction date into zero-padded string parts,
# e.g. '2016', '01', '31'.
train['year'] = train['transactiondate'].dt.strftime('%Y')
train['month'] = train['transactiondate'].dt.strftime('%m')
train['day'] = train['transactiondate'].dt.strftime('%d')

# Mean log error per month and per day of the month.
traingroupedmonth = train.groupby(['month'])['logerror'].mean().to_frame().reset_index()
traingroupedday = train.groupby(['day'])['logerror'].mean().to_frame().reset_index()
fig, (ax1, ax2) = plt.subplots(nrows = 2)
fig.set_size_inches(20, 15)

# Both axes must come from the monthly table; the y values were previously
# taken from the per-day table, which has a different length.
sn.pointplot(x='month', y='logerror', data=traingroupedmonth, join=True, ax=ax1)
ax1.set(xlabel = 'month of the year', ylabel = 'log error', title ='average log error across month of 2016', label = 'big')

# Number of transactions recorded in each month.
sn.countplot(x='month', data=train, ax=ax2)
ax2.set(xlabel='Month Of The Year', ylabel='No Of Occurences',title="No Of Occurunces Across Month In 2016",label='big')

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(20, 15))

# Top panel: mean log error for each day of the month.
sn.pointplot(
    data=traingroupedday,
    x="day",
    y="logerror",
    join=True,
    color="#34495e",
    ax=ax1,
)
ax1.set(xlabel='Day Of The Month', ylabel='Log Error',title="Average Log Error Across Days Of The Month In 2016",label='big')

# Bottom panel: number of transactions on each day of the month.
sn.countplot(data=train, x="day", color="#34495e", ax=ax2)
ax2.set(xlabel='Day Of The Month', ylabel='No Of Occurences',title="No Of Occurences Across Days Of The Month In 2016",label='big')

### No Of Storey Over The Years
It is quite interesting to notice that people started building more 2- or 3-storey buildings after 1950.

In [None]:
fig, ax1 = plt.subplots(figsize=(20, 10))

# Turn each build year into a plain label by dropping everything after the
# first '.' of its string form. NOTE: this overwrites the column in `merged`.
merged['yearbuilt'] = [str(year).split('.')[0] for year in merged['yearbuilt']]

# Count parcels per (build year, storey count), one column per storey count.
parcelCounts = merged.groupby(['yearbuilt', 'numberofstories'])["parcelid"].count()
yearmerged = parcelCounts.unstack('numberofstories').fillna(0)

# One stacked bar per build year.
yearmerged.plot(kind='bar', stacked=True, ax=ax1)

In [None]:
# Columns used by the plots below; rows missing any of them are dropped.
cols = ["bathroomcnt","bedroomcnt","roomcnt","numberofstories","logerror","calculatedfinishedsquarefeet"]
mergedFiltered = merged[cols].dropna()

# Winsorize every column at its 0.5th and 99.5th percentiles.
for col in cols:
    llimit, ulimit = np.percentile(mergedFiltered[col].values, [0.5, 99.5])
    # Assign the clipped column back explicitly. The previous chained
    # `mergedFiltered[col].iloc[mask] = ...` is not guaranteed to write
    # through to the frame.
    mergedFiltered[col] = mergedFiltered[col].clip(lower=llimit, upper=ulimit)

### Calculated Finished Square Feet Vs Log Error

In [None]:
# jointplot builds its own figure, so no separate plt.figure() call is made
# (it would only leave an empty extra figure behind).
# `height` is the current name of the figure-size argument (formerly `size`).
grid = sn.jointplot(x=mergedFiltered.calculatedfinishedsquarefeet.values,
                    y=mergedFiltered.logerror.values, height=10, kind='hex')
# Label the joint axes through the returned grid; plt.xlabel/plt.ylabel act on
# whichever axes happens to be current, which need not be the joint axes.
grid.set_axis_labels('calculated finished square feet', 'log error', fontsize=12)
plt.show()

### Bedroom count vs log error

In [None]:
# Distribution of log error for each bedroom count.
fig, ax = plt.subplots(figsize=(20, 5))
sn.boxplot(
    data=mergedFiltered,
    x='bedroomcnt',
    y='logerror',
    ax=ax,
)
ax.set(
    ylabel='log error',
    xlabel='bedroom count',
    title='bedroom count vs log error',
)

### Room Count Vs Log Error

In [None]:
# Distribution of log error for each room count.
fig, ax = plt.subplots(figsize=(20, 5))
sn.boxplot(
    data=mergedFiltered,
    x="roomcnt",
    y="logerror",
    color="#34495e",
    ax=ax,
)
ax.set(
    ylabel='Log Error',
    xlabel="Room Count",
    title="Room Count Vs Log Error",
)

### No Of Storeys Vs Log Error

In [None]:
# Distribution of log error for each number of storeys.
fig, ax = plt.subplots(figsize=(20, 5))
sn.boxplot(
    data=mergedFiltered,
    x="numberofstories",
    y="logerror",
    color="#34495e",
    ax=ax,
)
ax.set(
    ylabel='Log Error',
    xlabel="No Of Storeys",
    title="No Of Storeys Vs Log Error",
)

### Bedroom Vs Bathroom Vs Log Error

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot
fig = pylab.figure()
fig.set_size_inches(20,10)
# Create the 3D axes through the figure so they are actually attached to it;
# a bare Axes3D(fig) is no longer added to the figure automatically on recent
# matplotlib releases, which leaves the canvas empty.
ax = fig.add_subplot(111, projection='3d')

# One point per row: bathrooms (x), bedrooms (y), log error (z).
ax.scatter(mergedFiltered.bathroomcnt, mergedFiltered.bedroomcnt, mergedFiltered.logerror)
ax.set_xlabel('Bathroom Count')
ax.set_ylabel('Bedroom Count')
ax.set_zlabel('Log Error')
pyplot.show()