## Intro


I'm new to data science and this is my first project on Kaggle. I'm looking forward to all kinds of feedback, advice, and comments.

## Import and preparation of data

###  Importing Libraries

In [None]:
import pandas as pd # data processing
import matplotlib.pyplot as plt # basic plotting 
import numpy as np # linear algebra
import geopandas as gpd # geospatial data
from scipy import stats as st # probability distributions and statistical functions
import seaborn as sns #data visualization library based on matplotlib
import folium #interactive leaflet map

from folium.plugins import FloatImage

from matplotlib.colors import ListedColormap
from shapely.geometry import Point
from sklearn import preprocessing


# Render matplotlib figures inline in the notebook output
%matplotlib inline 
plt.style.use('ggplot') # use ggplot style
# NOTE(review): sns.set() applies seaborn's own theme after the ggplot style was
# selected, so the ggplot choice above is probably overridden - confirm which look is intended
sns.set(style='whitegrid', palette='pastel', color_codes=True) 
# Default size (width, height in inches) for figures that do not pass their own figsize
sns.mpl.rc('figure', figsize=(30,15))


## Reading & Understanding the Data 


In [None]:
# Load the house-sales CSV from the Kaggle input directory into a DataFrame
data = pd.read_csv("../input/housesalesprediction/kc_house_data.csv")
# Preview the first five rows (head() defaults to n=5)
data.head()

### Columns Descriptions
* **date**- Date of the home sale.
* **price** - House sale price.
* **bedrooms** - Number of bedrooms.
* **bathrooms** - Number of bathrooms. (.5 accounts for a room with a toilet but no shower).
* **sqft_living** - Square footage of the home's interior living space.
* **sqft_lot** - Square footage of the land space.
* **floors** - Number of floors.
* **waterfront** - Whether the property overlooks the waterfront or not.
* **view** -  0 to 4 index of how good the view of the property is.
* **condition** - 1 to 5 index of the condition of the apartment.
* **grade** - 1 to 13 index of the level of construction and design.
* **sqft_above** - The area in square feet of the interior space above ground level .
* **sqft_basement** - The area in square feet of the interior space below ground level.
* **yr_built** - The year of construction of the house.
* **yr_renovated** - The year of the house’s last renovation.
* **zipcode** - the zipcode area in which the house is located.
* **lat** - Latitude.
* **long** - Longitude.
* **sqft_living15** - The square footage of interior housing living space for the nearest 15 neighbors.
* **sqft_lot15** - The square footage of the land lots of the nearest 15 neighbors.
.

In [None]:
# `shape` is a DataFrame attribute (not a method call) holding the dimensions as
# (number of rows/samples/examples, number of columns/features/predictors)
data.shape

### Main statistics of each column

In [None]:
# include='all' asks describe() to summarise every column, not only the numeric ones;
# statistics that do not apply to a column's dtype are reported as NaN
data.describe(include='all')

## Cleaning the data 

### Checking for missing values

In [None]:
# Number of missing (null) entries in each column
data.isna().sum()

Apparently we have no missing values. Yay, me.

### Removing outliers 

When I applied the first method (removing IQR outliers across all columns), many fields lost almost all of their variation, e.g. yr_renovated and view, because their distributions are extremely skewed. Therefore, I targeted a single field (price) and excluded only its outliers, using the second method.

In [None]:
## First method (kept for reference only, not executed): drop every row that is an
## IQR outlier in ANY column.
## Before deleting outliers length =  21613
## After deleting outliers length =  14934
# print("Before deleting outliers length = " , len(data))
# Q1 = data.quantile(0.25)
# Q3 = data.quantile(0.75)
# IQR = Q3 - Q1
# data = data[~((data < (Q1 - 1.5 * IQR)) |(data > (Q3 + 1.5 * IQR))).any(axis=1)]
# print("After deleting outliers length = " ,  len(data))

## Second method (the one in use): keep only the rows whose price lies strictly
## within two standard deviations of the mean price.
## Before deleting outliers length =  21613
## After deleting outliers length =  20770
# NOTE: this cell rebinds `data` to the filtered frame, so running it a second time
# recomputes the mean/std on the already-filtered rows and filters again.

print("Before deleting outliers length = " , len(data))
target = data['price']
target_mean, target_sd = target.mean(), target.std()
lower_bound = target_mean - 2*target_sd
upper_bound = target_mean + 2*target_sd
above_lower = target > lower_bound
below_upper = target < upper_bound
data = data[above_lower & below_upper]
print("After deleting outliers length = " ,  len(data))



## Data visualization

In [None]:
# Split the columns of interest into continuous measurements and discrete/ordinal attributes.
# Every name must appear exactly once: a repeated name makes data[...] return that column
# twice, which duplicates its panel in every per-column plot built from these frames.
numerical_columns = [
    'price',
    'sqft_living',
    'sqft_lot',
    'sqft_above',
    'sqft_basement',
    'sqft_living15',
    'sqft_lot15',
]
categorical_columns = [
    'bedrooms',
    'bathrooms',
    'floors',
    'waterfront',
    'view',
    'condition',
    'yr_built',
    'yr_renovated',
    'zipcode',
]

# Column subsets used by the plotting cells that follow
numerical_features   = data[numerical_columns]
categorical_features = data[categorical_columns]

### Distribution of Data

In [None]:
# One histogram + KDE + rug panel per numerical column, laid out on a 2x4 grid.
# NOTE(review): distplot and the KDE `bw` keyword are deprecated in newer seaborn
# releases - consider histplot(..., kde=True) once the target seaborn version is confirmed.
fig = plt.figure(figsize=(25, 10))
for position, column_name in enumerate(numerical_features.columns, start=1):
    panel = fig.add_subplot(2, 4, position)
    # positional lookup, so a repeated column name cannot turn the selection into a frame
    values = numerical_features.iloc[:, position - 1].dropna()
    sns.distplot(values, rug=True, hist=True, label='UW', kde_kws={'bw': 1}, ax=panel)
    panel.set_xlabel(column_name)
plt.tight_layout()
plt.show()

###  Univariate Analysis

In [None]:
# One vertical box plot per numerical column, laid out on a 4x2 grid
fig = plt.figure(figsize=(25, 40))
for slot in range(numerical_features.shape[1]):
    panel = fig.add_subplot(4, 2, slot + 1)
    sns.boxplot(y=numerical_features.iloc[:, slot], ax=panel)

plt.tight_layout()
plt.show()


###  Bivariate Analysis

In [None]:

# Scatter plot with a fitted regression line (red) of every numerical feature against price.
n_features = numerical_features.shape[1]
n_grid_cols = 2
# ceiling division: just enough rows for all panels, so no part of the figure stays empty
n_grid_rows = (n_features + n_grid_cols - 1) // n_grid_cols

fig = plt.figure(figsize=(30, 40))
for i in range(n_features):
    panel = fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
    # x/y are passed by keyword: newer seaborn releases no longer accept them positionally
    sns.regplot(
        x=numerical_features.iloc[:, i],
        y=data['price'],
        line_kws={"color": "red"},
        ax=panel,
    )
plt.tight_layout()
plt.show()


In [None]:
# Histogram of every discrete/ordinal column.
# The ';' must sit at the end of the statement: on a line of its own it is a syntax error.
# Placed here it suppresses the array-of-Axes repr in the cell output.
data.hist(bins=10, figsize=(25, 15), column=categorical_columns);

## Descriptive statistics

## Associations and Correlations between Variables

### Correlation Matrix

In [None]:
# Correlation-matrix heatmap
ax = plt.axes()
# `df` is a working copy of the cleaned frame (same rows and columns as `data`);
# later cells reuse it. Copying avoids re-reading the CSV just to recover the column names.
df = data.copy()
# Correlate numeric columns only: on recent pandas, corr() raises when the frame still
# contains a text column (presumably `date` here - confirm dtype).
corrMatrix = df.select_dtypes(include='number').corr()
# vmin=0 means every negative correlation is drawn with the same colour as 0
sns.heatmap(
    corrMatrix,
    annot=True,
    fmt=".3f",
    vmin=0,
    cmap=sns.cm.rocket_r,
    annot_kws={"size": 18},
    linewidths=.1,
    ax=ax,
)
ax.set_title('Correlation matrix Heatmap', fontsize=25)
plt.show()



In [None]:
# Heatmap restricted to the features most correlated with price
ax = plt.axes()
# Correlate numeric columns only: on recent pandas, corr() raises when the frame still
# contains a text column (presumably `date` here - confirm dtype).
mostCorrMatrix = data.select_dtypes(include='number').corr()
# Features whose absolute correlation with price exceeds 0.5 (price itself included)
top_corr_features = mostCorrMatrix.index[mostCorrMatrix["price"].abs() > 0.5]
# Slice the matrix already computed instead of recomputing the correlations
sns.heatmap(
    mostCorrMatrix.loc[top_corr_features, top_corr_features],
    annot=True,
    fmt=".3f",
    vmin=0,
    cmap=sns.cm.rocket_r,
    annot_kws={"size": 25},
    linewidths=.1,
    ax=ax,
)
# NOTE(review): this runs after the heatmap is drawn, so it cannot affect this figure;
# it re-applies a seaborn theme for the cells that follow - confirm that is intended.
sns.set(font_scale = 1)
ax.set_title('Most correlated features Heatmap', fontsize=25)
plt.show()

**bathrooms**, **sqft_living**, **grade**, **sqft_above**, **sqft_living15**, have more than 0.5 correlation with **price**.


In [None]:
# Regress price on each of the most correlated features after min-max scaling them,
# so every panel shares the same 0-1 x-range.
min_max_scaler = preprocessing.MinMaxScaler()
column_sels = ['bathrooms', 'sqft_living', 'grade', 'sqft_above', 'sqft_living15']
y = data['price']
scaled_values = min_max_scaler.fit_transform(data.loc[:, column_sels])
# Reuse data's index: after the outlier filter it has gaps, and a fresh 0..n-1 index
# would leave x and y label-misaligned for any index-aware operation.
x = pd.DataFrame(data=scaled_values, columns=column_sels, index=data.index)

fig, axs = plt.subplots(ncols=len(column_sels), nrows=1, figsize=(25, 5))
axs = axs.flatten()
for i, k in enumerate(column_sels):
    sns.regplot(y=y, x=x[k], ax=axs[i], line_kws={"color": "red"})
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

### plotting the relation between price and space

In [None]:
# Data: living area (x) against sale price (y)
sqft_living = data['sqft_living']
price = data['price']

# Ordinary least-squares fit of price on living area
r = st.linregress(sqft_living, price)
# The colour is given once, by keyword; combining a 'r' format string with c=... defines
# it twice and only the keyword would win.
plt.plot(sqft_living, r.intercept + r.slope*sqft_living, color='#4a70e1', label='fitted line')

# Raw observations
plt.scatter(sqft_living, price, s=10, c='#d50a59', alpha=0.5)
# Descriptive title (the previous one was a leftover from a tutorial template)
plt.title('House price vs. living area')
plt.xlabel('sqft_living')
plt.ylabel('price')
# Show the 'fitted line' label, which is otherwise never displayed
plt.legend()
plt.show()

### Which areas have the most expensive houses, and how many sales does each have?

In [None]:
# Horizontal box plot of price for each zipcode (outlier markers hidden)
box_ax = df.boxplot(
    column='price',
    by='zipcode',
    vert=False,
    figsize=(25,70),
    grid=True,
    rot=90,
    showfliers=False,
    fontsize='15',
    patch_artist=True,
)
# Relabel each box as "<zipcode>  (<number of sales>)", counting rows via the `id` column
sales_per_zipcode = data.groupby('zipcode').count()['id']
tick_labels = ['%s  (%d)'%(zipcode, n_sales) for zipcode, n_sales in sales_per_zipcode.items()]
box_ax.set_yticklabels(tick_labels, rotation=0)

plt.show()




## Distribution Maps

In [None]:
# import king county street map
street_map = gpd.read_file(r'../input/kingcountyshapefileroadmap/tl_2017_53033_roads.shp')

# designate coordinate system
# The long/lat columns are geographic degrees (see the axis limits below), i.e. EPSG:4326.
# EPSG:3857 is a projected system, and the {'init': ...} dict form is deprecated.
crs = "EPSG:4326"

# zip x (longitude) and y (latitude) coordinates into single point features
geometry = [Point(xy) for xy in zip(df['long'], df['lat'])]

# create GeoPandas dataframe
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)

# create figure and axes, assign to subplot
fig, ax = plt.subplots()

# add .shp mapfile to axes
# NOTE(review): the street layer is drawn in its own CRS without reprojection;
# presumably it is also in degrees - confirm with street_map.crs
street_map.plot(ax=ax, alpha=0.2, color='#CCCCCC')

# colour every sale location by its price (light = cheap, dark red = expensive)
my_cmap = ListedColormap(sns.color_palette(['#FFF5F0' ,'#FCBBA1' ,'#FB6A4A' ,'#CB181D' ,'#67000D']).as_hex())
geo_df.plot(column='price', ax=ax, legend=True, cmap=my_cmap, edgecolor='#EAEAF2')

# add title to graph
ax.set_title('House Prices in  King County, USA', fontsize=15, fontweight='bold')

# set longitude (x) and latitude (y) boundaries for map display
ax.set_xlim(-122.6, -121.06)
ax.set_ylim(47.06, 47.8)

# show map
plt.show()

### house sales by zip code on the map

In [None]:
# Interactive map: one circle per zipcode, coloured by average sale price and sized by
# the number of houses sold there.

mymap = folium.Map(location=[47.56, -122.22], zoom_start=9)

# Per-zipcode summary. Only the needed columns are aggregated, because averaging the whole
# frame fails on recent pandas when a text column is present. Named aggregation also keeps
# the sales count as an integer under its own name.
zip_stats = data.groupby('zipcode').agg(
    lat=('lat', 'mean'),
    long=('long', 'mean'),
    price=('price', 'mean'),
    houses_sold=('id', 'count'),
)

# Get the highest average house price (per zipcode)
maxave = int(zip_stats['price'].max())
print("Highest City House Price is: ", maxave)

# Colour scale for the average price: dark blue (lowest) through green/yellow to dark red (highest)
colormap = ['#00009B' ,'#0099FF' ,'#63FF9B' ,'#FFFB00' ,'#FF2F00' , '#8B0000']
last_color = len(colormap) - 1

# Add marker info
for row in zip_stats.itertuples():
    # Set icon colour based on price; min() guards the index because maxave is truncated
    # to an int and the ratio can therefore slightly exceed 1
    color_index = min(int(last_color * float(row.price) / maxave), last_color)
    theCol = colormap[color_index]
    # The popup is rendered as HTML, so the line break has to be <br> rather than a newline
    markerText = (
        'Average price : ' + str(round(row.price, 2)) + ' $'
        + '<br>'
        + 'Houses sold : ' + str(row.houses_sold)
    )
    folium.CircleMarker(
        radius=row.houses_sold / 15,
        location=[row.lat, row.long],
        popup=folium.Popup(markerText, max_width=150, min_width=150),
        color=theCol,
        fill=True,
        fill_color=theCol,
    ).add_to(mymap)

# Legend image overlaid on the map.
# NOTE(review): this is a hot-linked proxy URL - verify it still resolves.
FloatImage('https://lh3.googleusercontent.com/proxy/SRXTqZngcyOscx1nR1iB9c4IobPtOn0cEROsZ_wK6CO3nfAjD4e4TDXPDjN3AU2ZLJxzJQaoLJnlqk9zZevN7S2wZZctQysIIKOvigpGatc', bottom=0, left=65).add_to(mymap)
mymap