# Exploratory Data Analysis

## Python Packages Used 

In [1]:
# Libraries for importing the dataset and statistical analysis
import pandas as pd
import numpy as np
import pandas.io.sql as pd_sql
import sqlite3 as sql
import pandas as pd
import csv 

# Libraries for visualization and feature selection 
%matplotlib inline
import seaborn as sns 
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt


# Libraries for label encoding 

from sklearn.preprocessing import LabelEncoder

# Libraries for modelling 

from sklearn import cross_validation as cv
from sklearn.cross_validation import train_test_split as tts

from sklearn.linear_model import Ridge
from sklearn.linear_model import RandomizedLasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse



## Load and verify Data 

Load the compiled raw data, which includes all features assumed to impact days on market as well as the features created to enable analysis and modeling.

In [None]:
# Read the compiled raw data from CSV, then wrap it in a second DataFrame
# object; the wrangling cells below operate on `HousingData_DF`.
HousingData = pd.read_csv("Raw_Data_W.O_school_rank.csv")
HousingData_DF = pd.DataFrame(data=HousingData)

# Peek at the rows at positions 1 and 2.
HousingData_DF.iloc[1:3]

In [None]:
# Report the (rows, columns) shape and the column labels of the raw data,
# one item per line.
for report_item in (
    "Total Instance and features compiled on raw data",
    HousingData_DF.shape,
    HousingData_DF.columns,
):
    print(report_item)



# Further data wrangling for analysis

## Label Encoding

In [None]:
# Label-encode the ML_ID column.

# Fit the encoder on the ML_ID values (`fit` returns the encoder itself).
MLID = LabelEncoder().fit(HousingData_DF['ML_ID'])

# Store the encoded labels in a new column next to the original one.
HousingData_DF['Encoded_MLID'] = MLID.transform(HousingData_DF['ML_ID'])

# Show the first rows, which now include 'Encoded_MLID'.
HousingData_DF.head()

In [None]:
# Drop columns that are not expected to be used in the analysis.
# The axis is passed by keyword: the positional form `drop(labels, 1)` was
# deprecated in pandas 1.x and is rejected by pandas 2.x, whereas `axis=1`
# works on every pandas version.
HousingData_Analysis = HousingData_DF.drop(
    ['TotalTaxes2', 'Stat', 'Address', 'ML_ID'], axis=1
)

# Peek at the rows at positions 1 and 2 of the reduced frame.
HousingData_Analysis.iloc[1:3]



In [None]:
# Convert the list/close date columns and their index columns to pandas
# datetime values.
from datetime import datetime  # `datetime` itself is not used in this cell

# ListDate is the only column parsed with an explicit format (YYYYMMDD).
HousingData_Analysis['ListDate'] = pd.to_datetime(
    HousingData_Analysis['ListDate'], format='%Y%m%d'
)

# The remaining columns rely on pandas' default parsing.
# NOTE(review): if any of these columns hold bare integers (e.g. a year or a
# month number), default parsing interprets them as nanoseconds since the
# epoch -- confirm the raw dtypes.
inferred_date_columns = [
    'IndexYear_ListDate',
    'IndexMonth',
    'IndexMonth_CloseDate',
    'IndexYear_CloseDate',
    'CloseDate',
    'IndexYear_No_ListDate',
    'IndexMonth_No_CloseDate',
]
for date_column in inferred_date_columns:
    HousingData_Analysis[date_column] = pd.to_datetime(
        HousingData_Analysis[date_column]
    )

# Print dtypes and non-null counts to verify the conversion.
HousingData_Analysis.info()

## Initial Data Analysis 

In [None]:
# Scatter plot of the DOMP column against ListDate.
# Uses the explicit figure/axes interface and labels the axes so the figure
# can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(HousingData_Analysis['ListDate'], HousingData_Analysis['DOMP'])
ax.set_xlabel('ListDate')
ax.set_ylabel('DOMP')
ax.set_title('DOMP by ListDate')
fig.autofmt_xdate()  # slant the date tick labels so they do not overlap
plt.show()

In [None]:
# Plot DOMP over list date with Plotly.
# NOTE(review): `plotly.plotly` is the online plotting interface; plotly >= 4
# ships it separately as `chart_studio.plotly` -- confirm the installed version.
import plotly.plotly as py
import plotly.graph_objs as go

# ListDate was already parsed to datetimes in the conversion cell above, so it
# is used directly as the x axis. `datetime(year=..., month=..., day=...)`
# accepts only scalar integers and raises TypeError when handed whole Series.
x = HousingData_Analysis['ListDate']
y = HousingData_Analysis['DOMP']

# Draw markers only: the rows are not sorted by date, so connecting lines
# would zig-zag across the plot.
data = [go.Scatter(x=x, y=y, mode='markers')]
py.iplot(data)
# TODO: plot close price over list dates



In [None]:
# Summary statistics of the analysis frame, displayed as the cell output.
summary_statistics = HousingData_Analysis.describe()
summary_statistics

In [None]:
# Histograms of the columns of the raw data frame, drawn on one 15x20 figure.
hist_figsize = (15, 20)
HousingData.hist(figsize=hist_figsize)

In [None]:
# Scatter-plot matrix for a hand-picked subset of the raw columns, with kernel
# density estimates on the diagonal.
scatter_columns = [
    'ListPrice',
    'ClosePrice',
    'YearBuilt',
    'DOMP',
    'Zip',
    'MedianValuePerSqft',
    'Turnover',
    'IndexMonth',
]
Data = HousingData[scatter_columns]
scatter_matrix(Data, alpha=0.2, figsize=(18, 18), diagonal='kde')

In [None]:
# Seaborn pair plot for a subset of the raw columns; rows with missing values
# are dropped before plotting (dropna=True).
pairplot_columns = [
    'ListPrice',
    'ClosePrice',
    'DOMP',
    'Zip',
    'MedianValuePerSqft',
    'Turnover',
    'IndexMonth',
]
sns.pairplot(data=HousingData[pairplot_columns], dropna=True)


In [None]:
# Correlation matrix of the raw data's numeric columns.
# The numeric/bool columns are selected explicitly: older pandas silently
# ignored non-numeric columns in `corr()`, but pandas >= 2.0 raises on them.
# `corr()` already returns a DataFrame, so no extra wrapping is needed.
HousingData_CorrelationMatrix = (
    HousingData.select_dtypes(include=['number', 'bool']).corr()
)

# Display as the cell output (rich table) instead of a plain-text print.
HousingData_CorrelationMatrix

From the correlation matrix above we can see that list price has a high positive correlation with close price. DOMP does not seem to be correlated with any of the other features.

In [None]:
# Covariance matrix of the raw data's numeric columns.
# The numeric/bool columns are selected explicitly: older pandas silently
# ignored non-numeric columns in `cov()`, but pandas >= 2.0 raises on them.
# `cov()` already returns a DataFrame, so no extra wrapping is needed.
HousingData_CovarianceMatrix = (
    HousingData.select_dtypes(include=['number', 'bool']).cov()
)

# Display as the cell output (rich table) instead of a plain-text print.
HousingData_CovarianceMatrix

In [None]:
# Heatmap of the correlation matrix (lower triangle only).

# Boolean mask that hides the upper triangle, diagonal included.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24.
mask = np.zeros_like(HousingData_CorrelationMatrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap of the *correlation* matrix. The mask is built from that
# matrix and the colour scale is capped at vmax=.3, which only makes sense for
# correlation coefficients; the covariance matrix was being plotted here
# instead. Every 5th tick label is shown on each axis.
sns.heatmap(HousingData_CorrelationMatrix, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=5, yticklabels=5,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Correlation matrix')
plt.show()

In [None]:
# Box plots drawn on a grid with one facet (column) per Zip value.

# `height` is the current name of the facet-size argument; seaborn < 0.9
# called it `size`, and newer releases no longer accept the old name.
g = sns.FacetGrid(HousingData, col='Zip', height=4, aspect=.8)

# NOTE(review): this maps DOMP to the x axis and ClosePrice to the y axis of
# each box plot; if the intent is the distribution of DOMP alone per zip code,
# only 'DOMP' should be mapped -- confirm.
g = g.map(sns.boxplot, 'DOMP', 'ClosePrice')

# The `sns.plt` alias no longer exists in seaborn; call pyplot directly via
# the `plt` module imported at the top of the notebook.
plt.show()