In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line in walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## Global Imports

import pylab
import calendar
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import missingno as msno
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import kendalltau
import warnings
matplotlib.style.use('ggplot')
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
## Reading in Dataset

# parse_dates converts the listed column to datetime while the file is read.
train = pd.read_csv(
    '../input/zillow-prize-1/train_2016_v2.csv',
    parse_dates=["transactiondate"],
)
properties = pd.read_csv('../input/zillow-prize-1/properties_2016.csv')

In [None]:
## Shape Of The Dataset

# Report (rows, columns) for both raw frames.
for label, frame in (("Shape Of train: ", train), ("Shape Of Properties: ", properties)):
    print(label, frame.shape)

In [None]:
## Let's Merge Train And Properties To Facilitate EDA

# Left join: every row of `train` is kept and gets the property columns that
# share its parcelid.
merged = train.merge(properties, how='left', on='parcelid')

In [None]:
## First Few Rows Of Data
# Transposed so that each column of the first three rows is displayed as a row.
merged.head(3).T

In [None]:
## Visualizing Datatypes

# dataTypeDf = pd.DataFrame(merged.dtypes.value_counts()).reset_index().rename(columns={"index":"variableType",0:"count"})
# fig,ax = plt.subplots()
# fig.set_size_inches(20,5)
# sns.barplot(data=dataTypeDf,x="variableType",y="count",ax=ax,color="#34495e")
# ax.set(xlabel='Variable Type', ylabel='Count',title="Variables Count Across Datatype")

In [None]:
## Missing Value Analysis

# Keep only the columns that contain at least one null and hand them to msno.bar.
has_nulls = merged.isnull().any()
missingValueColumns = list(merged.columns[has_nulls])
msno.bar(
    merged[missingValueColumns],
    figsize=(20,8),
    color="#34495e",
    fontsize=12,
    labels=True,
)

In [None]:

# Nullity matrix restricted to the columns that have missing values.
matrix_options = dict(
    width_ratios=(10,1),
    figsize=(20,8),
    color=(0,0, 0),
    fontsize=12,
    sparkline=True,
    labels=True,
)
msno.matrix(merged[missingValueColumns], **matrix_options)

In [None]:
# Nullity heatmap over the columns that have missing values.
columns_with_nulls = merged[missingValueColumns]
msno.heatmap(columns_with_nulls, figsize=(20,20))

In [None]:
## Top Features Selection

from sklearn import model_selection, preprocessing
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# Replace every missing value with the -999 sentinel before encoding.
mergedFilterd = merged.fillna(-999)

# Integer-encode each object-typed column with its own LabelEncoder.
object_columns = [
    name for name in mergedFilterd.columns
    if mergedFilterd[name].dtype == 'object'
]
for f in object_columns:
    lbl = preprocessing.LabelEncoder()
    mergedFilterd[f] = lbl.fit_transform(list(mergedFilterd[f].values))

# Target is logerror; the identifier, the date and the target itself are
# removed from the feature matrix.
non_feature_columns = ["parcelid", "transactiondate", "logerror"]
train_y = mergedFilterd["logerror"].values
train_X = mergedFilterd.drop(non_feature_columns, axis=1)

xgb_params = {
    'eta': 0.05,
    'max_depth': 8,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
}

# Train on a copy of the parameter dict so xgb_params itself is left untouched.
dtrain = xgb.DMatrix(train_X, label=train_y, feature_names=train_X.columns.values)
model = xgb.train(dict(xgb_params), dtrain, num_boost_round=100)

In [None]:
# Per-feature importance scores as returned by the booster's get_fscore().
featureImportance = model.get_fscore()

# Build the frame from materialised (name, score) pairs. Assigning dict views
# directly into an empty DataFrame depends on version-specific pandas coercion
# (dict_keys is an abc.Set). sort_values returns a new frame, so no in-place
# mutation is needed.
features = pd.DataFrame(
    list(featureImportance.items()),
    columns=['features', 'importance'],
).sort_values(by='importance', ascending=False)

fig, ax = plt.subplots()
fig.set_size_inches(20,10)
plt.xticks(rotation=90)
sns.barplot(data=features.head(15), x="importance", y="features", ax=ax, orient="h", color="#34495e")
ax.set(xlabel='Importance', ylabel='Features', title="Top 15 Features By Importance")

In [None]:
## Correlation Analysis

# The 20 highest-ranked feature names from the importance table.
topFeatures = features["features"].tolist()[:20]

# Correlation is only defined for numeric columns; selecting them explicitly
# keeps .corr() from failing on (or silently depending on how pandas treats)
# object columns that were label-encoded only for the model.
corrMatt = merged[topFeatures].select_dtypes(include=[np.number]).corr()

# Boolean mask that hides the strictly-upper triangle. Deriving the mask from
# the correlation values themselves would leave a cell visible whenever the
# correlation happens to be exactly zero.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)

fig, ax = plt.subplots()
fig.set_size_inches(20,10)
# Draw on the axes created above rather than on pyplot's implicit current axes.
sns.heatmap(corrMatt, mask=mask, vmax=.8, square=True, ax=ax)

In [None]:
## Multicollinearity Analysis


from statsmodels.stats.outliers_influence import variance_inflation_factor  
import warnings
warnings.filterwarnings("ignore")

def calculate_vif_(X):
    """Compute a variance inflation factor for every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are the variables to score; its ``.values``
        matrix is passed to ``variance_inflation_factor`` as ``exog``.

    Returns
    -------
    dict
        Maps each column name to the value ``variance_inflation_factor``
        returns for that column's positional index. Duplicate column names
        collapse to the last occurrence, as with any dict.
    """
    # X.values is loop-invariant, so materialise the matrix once instead of
    # rebuilding it for every column.
    exog = X.values
    return {
        variable: variance_inflation_factor(exog=exog, exog_idx=ix)
        for ix, variable in enumerate(X.columns)
    }


# Scores above this threshold are charted as exhibiting multicollinearity.
VIF_THRESHOLD = 5

# Candidate variables: every non-object column except the identifier, the
# transaction date and the target.
excludedColumns = ["parcelid", "transactiondate", "logerror"]
numericalCol = [
    f for f in merged.columns
    if merged[f].dtype != 'object' and f not in excludedColumns
]
# Missing values are replaced with the -999 sentinel before scoring.
mergedFilterd = merged[numericalCol].fillna(-999)
vifDict = calculate_vif_(mergedFilterd)

# Materialise the dict views into lists so frame construction does not depend
# on how a given pandas version coerces dict_keys / dict_values, and sort
# without in-place mutation.
vifDf = pd.DataFrame({
    'variables': list(vifDict.keys()),
    'vifScore': list(vifDict.values()),
}).sort_values(by='vifScore', ascending=False)
validVariables = vifDf[vifDf["vifScore"] <= VIF_THRESHOLD]
variablesWithMC = vifDf[vifDf["vifScore"] > VIF_THRESHOLD]

fig,(ax1,ax2) = plt.subplots(ncols=2)
fig.set_size_inches(20,8)
sns.barplot(data=validVariables,x="vifScore",y="variables",ax=ax1,orient="h",color="#34495e")
# Only the five highest-scoring variables are shown in the right panel.
sns.barplot(data=variablesWithMC.head(5),x="vifScore",y="variables",ax=ax2,orient="h",color="#34495e")
ax1.set(xlabel='VIF Scores', ylabel='Features',title="Valid Variables Without Multicollinearity")
ax2.set(xlabel='VIF Scores', ylabel='Features',title="Variables Which Exhibit Multicollinearity")

In [None]:
## Univariate Analysis
## The dependent variable, logerror, follows a nice normal distribution

# Winsorise logerror to its 1st-99th percentile range.
# Series.clip replaces the chained `merged['logerror'].iloc[mask] = ...`
# assignments, which write through an intermediate Series and are not
# guaranteed to reach `merged` (the notebook silences the related warning).
llimit, ulimit = np.percentile(merged.logerror.values, [1, 99])
merged['logerror'] = merged['logerror'].clip(lower=llimit, upper=ulimit)

fig,ax = plt.subplots()
fig.set_size_inches(20,5)
sns.distplot(merged.logerror.values, bins=50,kde=False,color="#34495e",ax=ax)
# With kde=False the y axis carries bin counts; the previous 'VIF Score' label
# was a copy-paste leftover from the multicollinearity cell.
ax.set(xlabel='logerror', ylabel='Count',title="Distribution Of Dependent Variable")

In [None]:
## Bivariate Analysis

# transactiondate is parsed as datetime when the CSV is read, so the date
# parts come from the vectorised .dt accessor instead of per-row string
# splitting. strftime keeps the zero-padded string form ("01", "02", ...)
# that the groupby keys and tick labels used before.
train["year"] = train["transactiondate"].dt.strftime("%Y")
train["month"] = train["transactiondate"].dt.strftime("%m")
train["day"] = train["transactiondate"].dt.strftime("%d")

# Mean logerror per month / per day of month, as two-column frames.
traingroupedMonth = train.groupby("month")["logerror"].mean().reset_index()
traingroupedDay = train.groupby("day")["logerror"].mean().reset_index()

fig,(ax1,ax2)= plt.subplots(nrows=2)
fig.set_size_inches(20,15)

sns.pointplot(x="month", y="logerror", data=traingroupedMonth, ax=ax1, color="#34495e")
ax1.set(xlabel='Month Of The Year', ylabel='Log Error',title="Average Log Error Across Month Of 2016")

# Explicit order so the bars are chronological regardless of the row order
# of `train`.
monthOrder = sorted(train["month"].dropna().unique())
sns.countplot(x="month", data=train, order=monthOrder, ax=ax2, color="#34495e")
ax2.set(xlabel='Month Of The Year', ylabel='No Of Occurrences',title="No Of Occurrences Across Month In 2016")

In [None]:
# Two stacked panels: mean logerror per day of month (top) and the number of
# transactions per day of month (bottom).
fig, (ax1, ax2) = plt.subplots(nrows=2)
fig.set_size_inches(20, 15)

day_mean_labels = dict(
    xlabel='Day Of The Month',
    ylabel='Log Error',
    title="Average Log Error Across Days Of The Month In 2016",
    label='big',
)
sns.pointplot(data=traingroupedDay, x="day", y="logerror", join=True, ax=ax1, color="#34495e")
ax1.set(**day_mean_labels)

day_count_labels = dict(
    xlabel='Day Of The Month',
    ylabel='No Of Occurences',
    title="No Of Occurences Across Days Of The Month In 2016",
    label='big',
)
sns.countplot(data=train, x="day", ax=ax2, color="#34495e")
ax2.set(**day_count_labels)

In [None]:
## Number Of Storeys Over The Years
## Interestingly, people started building more 2- and 3-storey buildings after 1950

# Keep only the text before the first "." of each stringified yearbuilt value
# (e.g. "1959.0" -> "1959"); this overwrites the column in `merged`.
merged["yearbuilt"] = merged["yearbuilt"].map(lambda year: str(year).split(".")[0])

# Count parcels per (yearbuilt, numberofstories) pair, then pivot the storey
# counts into columns; combinations that never occur become 0.
storey_counts = merged.groupby(['yearbuilt', 'numberofstories'])["parcelid"].count()
yearMerged = storey_counts.unstack('numberofstories').fillna(0)

fig, ax1 = plt.subplots()
fig.set_size_inches(20, 10)
yearMerged.plot(kind='bar', stacked=True, ax=ax1)

In [None]:
cols = ['bathroomcnt', 'bedroomcnt', 'roomcnt', 'numberofstories', 'logerror', 'calculatedfinishedsquarefeet']

# Rows missing any of the selected columns are dropped; dropna returns a new
# frame, so `merged` is not modified by the clipping below.
mergedFiltered = merged[cols].dropna()
for col in cols:
    # Winsorise each column to its 0.5th-99.5th percentile range.
    # Series.clip replaces the chained `mergedFiltered[col].iloc[mask] = ...`
    # assignments, which write through an intermediate Series and are not
    # guaranteed to reach the frame.
    llimit, ulimit = np.percentile(mergedFiltered[col].values, [0.5, 99.5])
    mergedFiltered[col] = mergedFiltered[col].clip(lower=llimit, upper=ulimit)

In [None]:
## Calculated Finished Square Feet vs Log error

# jointplot builds its own figure, so no separate plt.figure() call is made
# (it would only leave an extra, empty figure behind). The figure size is
# controlled through `height`, the current name of the former `size` argument.
jointGrid = sns.jointplot(
    x=mergedFiltered.calculatedfinishedsquarefeet.values,
    y=mergedFiltered.logerror.values,
    height=10,
    kind='hex',
    color="#34495e",
)
# Label the joint axes explicitly instead of relying on pyplot's implicit
# "current axes", which depends on seaborn internals.
jointGrid.set_axis_labels('Calculated Finished Square Feet', 'Log Error', fontsize=12)
plt.show()

In [None]:
## Bedroom Count vs Log Error

# Distribution of logerror for each bedroom count in the clipped frame.
fig, ax = plt.subplots()
fig.set_size_inches(20, 5)
sns.boxplot(data=mergedFiltered, x="bedroomcnt", y="logerror", color="#34495e", ax=ax)
bedroom_labels = dict(ylabel='Log Error', xlabel="Bedroom Count", title="Bedroom Count Vs Log Error")
ax.set(**bedroom_labels)

In [None]:
## Bathroom Count vs Log Error
# Distribution of logerror for each bathroom count in the clipped frame.
fig, ax = plt.subplots()
fig.set_size_inches(20,5)
sns.boxplot(x="bathroomcnt", y='logerror', data=mergedFiltered, ax=ax, color="#34495e")
# Title fixed: the original ended with a stray "|" character.
ax.set(ylabel='Log Error', xlabel="Bathroom Count", title="Bathroom Count vs Log Error")

In [None]:
## Room Count vs Log Error
# Distribution of logerror for each room count in the clipped frame.
fig, ax = plt.subplots()
fig.set_size_inches(20, 5)
sns.boxplot(
    data=mergedFiltered,
    x="roomcnt",
    y="logerror",
    color='#34495e',
    ax=ax,
)
ax.set(
    title="Room Count vs Log Error",
    xlabel="Room Count",
    ylabel='Log Error',
)

In [None]:
## No Of Storeys vs Log Error
# Distribution of logerror for each number-of-stories value in the clipped frame.
fig, ax = plt.subplots()
fig.set_size_inches(20, 5)
storey_plot_args = dict(x="numberofstories", y="logerror", data=mergedFiltered)
sns.boxplot(ax=ax, color="#34495e", **storey_plot_args)
ax.set(xlabel="No Of Storeys", ylabel='Log Error', title="No Of Storeys Vs Log Error")

In [None]:
## Bedroom VS Bathroom vs Log Error

from mpl_toolkits.mplot3d import Axes3D
#from matplotlib import pyplot
import matplotlib.pyplot as plt
from matplotlib import cm

# 3D plot of bathroom count (x), bedroom count (y) and logerror (z), drawn as
# unconnected hollow red circles.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection='3d')

marker_style = dict(linestyle="none", marker="o", mfc="none", markeredgecolor="red")
ax.plot(
    mergedFiltered["bathroomcnt"],
    mergedFiltered["bedroomcnt"],
    mergedFiltered["logerror"],
    **marker_style,
)

plt.show()

