# Contents

This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.

1. [EDA](#1)
2. [Linear Regression](#2)
3. [t-Test](#3)
4. [Random Forest](#4)

In [1]:
#import libraries

# linear algebra
import numpy as np 
# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd 
#ignore warning messages 
import warnings
warnings.filterwarnings('ignore') 
#vizualisation
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly.express as px
#maschine learning libraries
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  accuracy_score
from sklearn.ensemble import RandomForestRegressor
#t-test
from scipy.stats import ttest_ind # just the t-test from scipy.stats
from scipy.stats import probplot # for a qqplot
import pylab
from scipy.stats import t
import scipy.stats as stats
from statsmodels.stats import weightstats as statsmodelsweightstats

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



<a id = '1'></a>
# 1.EDA

In [1]:
# Load the King County house-sales CSV into the working frame.
DATA_PATH = '../input/housesalesprediction/kc_house_data.csv'
df = pd.read_csv(DATA_PATH)

In [1]:
# Preview the first rows of the loaded frame.
df.head()

The `date` column is not useful for my analysis as it is, but I want to keep the year in which each house was sold, so I create a new column, `sal_year`.

In [1]:
# The first four characters of the date string are kept as the sale year (still a string here).
df['sal_year'] = df['date'].str.slice(0, 4)

In [1]:
# The raw date column is no longer needed once the sale year has been extracted.
df = df.drop(columns='date')

In [1]:
# Number of sales per year
df['sal_year'].value_counts()

## Columns of Data
1. id = Identification number
1. price = House price in dollars
1. bedrooms = Number of bedrooms
1. bathrooms = Number of bathrooms
1. sqft_living = Living space
1. sqft_lot = Square footage of the lot (land)
1. floors = Number of floors
1. waterfront = House on the seaside or not (1/0)
1. view = View rating of the house (0 - 4)
1. condition = Condition rating of the house (0 - 5)
1. grade = Grade of the house (1 - 13)
1. sqft_above = Square footage above ground
1. sqft_basement = Square footage below ground
1. yr_built = The year the house was built
1. yr_renovated = The year the house was renovated
1. zipcode = Zipcode of the house
1. lat = Latitude
1. long = Longitude
1. sqft_living15 = Living space of the houses that were sold in 2015
1. sal_year = The year the house was sold
1. sqft_lot15 = Square footage of the lot for the houses that were sold in 2015

In [1]:
# Column dtypes and non-null counts of the working frame.
df.info()

In [1]:
# `astype` returns a new Series; assign it back so the conversion actually
# takes effect on the frame, then show the resulting dtype as confirmation.
df['price'] = df['price'].astype(float)
df['price'].dtype

In [1]:
# `astype` returns a new Series; assign it back so the conversion actually
# takes effect on the frame, then show the resulting dtype as confirmation.
df['sqft_living'] = df['sqft_living'].astype(float)
df['sqft_living'].dtype

In [1]:
# Summary statistics of the frame's columns.
df.describe()

## Correlation

In [1]:
# Correlate numeric columns only: `sal_year` holds strings, which
# `DataFrame.corr` rejects on pandas >= 2.0 (older versions silently dropped
# such columns). The matrix is computed once and reused for the heatmap.
corr_matrix = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(17, 14))
sns.heatmap(corr_matrix, annot=True, linewidths=0.5, linecolor="Black", fmt="1.1f", ax=ax)
ax.set_title("Data Correlation", fontsize=50)
plt.show()

## Visualizations

## Bathroom Visualization

In [1]:
# Scatter of sale price against bathroom count; the low alpha keeps dense areas readable.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df['bathrooms'], df['price'], color='skyblue', alpha=0.5)
ax.set_xlabel("Count of Bathrooms")
ax.set_ylabel("Price of House")
ax.set_title("  Price ratio according to the number of bathrooms")
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.show()

## Grade Visualization

In [1]:
# Row labels ordered from the highest grade to the lowest, then the frame in that order.
grade_desc = df['grade'].sort_values(ascending=False)
new_index = grade_desc.index.values
sorted_data = df.reindex(index=new_index)


In [1]:
# Bar chart of price per grade
plt.figure(figsize=(15, 10))
ax = sns.barplot(data=sorted_data, x='grade', y='price')
ax.set_ylabel('Price')
ax.set_xlabel('Grade')
ax.set_title('Price for each Grade')
plt.show()

## Condition Visualization

In [1]:
# Row labels ordered from the best condition to the worst, then the frame in that order.
condition_desc = df['condition'].sort_values(ascending=False)
neww_index = condition_desc.index.values
sortedw_data = df.reindex(index=neww_index)


In [1]:
# Bar chart of price per condition rating
plt.figure(figsize=(15, 10))
ax = sns.barplot(data=sortedw_data, x='condition', y='price')
ax.set_ylabel('Price')
ax.set_xlabel('Condition')
ax.set_title('Price for each Condition')
plt.show()

## View Visualization

In [1]:
# Order rows from the best view rating to the worst. The sort key is `view`
# (this section plots price per view); it previously sorted by `condition`,
# a leftover from the section above.
vieww_index = df['view'].sort_values(ascending=False).index.values
sortedview_data = df.reindex(vieww_index)

In [1]:
# Bar chart of price per view rating
plt.figure(figsize=(15, 10))
ax = sns.barplot(data=sortedview_data, x='view', y='price')
ax.set_ylabel('Price')
ax.set_xlabel('View')
ax.set_title('Price for each View')
plt.show()

## Visualization - Relationship between Living Space and Living Space without Garage or Basement

I want to see how the total living space and the living space without garage or basement relate to the price.

In [1]:
dff = df.iloc[:, :]


def _price_trace(column, rgba):
    """Build a marker trace of `column` (y) against price (x), labelled with the house id."""
    return go.Scatter(
        x=dff.price,
        y=dff[column],
        mode='markers',
        name=column,
        marker={'color': rgba},
        text=dff.id,
    )


trace1 = _price_trace('sqft_above', 'rgba(123,123,3,0.7)')
trace2 = _price_trace('sqft_living', 'rgba(10,133,1,0.7)')
data = [trace1, trace2]

layout = {
    'title': 'Relationship between Living Space and Living Space without Garage or Basement',
    'xaxis': {'title': 'Price', 'ticklen': 5, 'zeroline': False},
    'yaxis': {'ticklen': 5, 'zeroline': False},
}
fig = {'data': data, 'layout': layout}
py.iplot(fig)


## Map Visualization

In [1]:
# `data` is an alias of `df` (not a copy), so the new column is added to `df` too.
data = df

# Marker colour for each grade (1-13).
GRADE_COLORS = {
    1: "rgb(255,255,255)",
    2: "rgb(220,220,220)",
    3: "rgb(242, 177, 172)",
    4: "rgb(255,133,27)",
    5: "rgb(255,255,204)",
    6: "rgb(255,65,54)",
    7: "rgb(1,2,123)",
    8: "rgb(123,123,2)",
    9: "rgb(45,188,1)",
    10: "rgb(31,51,6)",
    11: "rgb(245,10,126)",
    12: "rgb(8,187,180)",
    13: "rgb(250,250,250)",
}

# One vectorised lookup replaces thirteen chained assignments of the form
# `data.color[mask] = ...`, which trigger SettingWithCopyWarning and do not
# modify the frame at all under pandas copy-on-write. Grades without an
# entry keep the empty-string default.
data["color"] = data["grade"].map(GRADE_COLORS).fillna("")

In [1]:
# Keep only houses of grade 7 or higher that were built in 2000 or later.
high_grade_recent = (data.grade >= 7) & (data.yr_built >= 2000)
dataplus = data[high_grade_recent]
# Coordinates of the selected houses as plain lists
lats = list(dataplus['lat'].values)
longs = list(dataplus['long'].values)

In [1]:
# Read the Mapbox token from the environment rather than hardcoding a
# credential in the notebook. Set MAPBOX_ACCESS_TOKEN to use Mapbox tiles.
mapbox_access_token = os.environ.get("MAPBOX_ACCESS_TOKEN")

# Hover label shown for each marker
hover_text = (
    "Grade:" + dataplus.grade.apply(str)
    + " Built Year:" + dataplus.yr_built.apply(str)
    + " Price:" + dataplus.price.apply(str)
)

mapp = [
    go.Scattermapbox(
        lat=lats,
        lon=longs,
        mode="markers",
        marker=dict(size=4.5, color=dataplus["color"]),
        hoverinfo="text",
        text=hover_text,
    )
]

mapbox_cfg = dict(bearing=0, pitch=0, zoom=9, center=dict(lat=47.5, lon=-122.161))
if mapbox_access_token:
    mapbox_cfg["accesstoken"] = mapbox_access_token
else:
    # Token-free tile style, so the map still renders when no token is configured.
    mapbox_cfg["style"] = "open-street-map"

layout5 = dict(
    title="Grade(+7) - Built Year(+2000) Map",
    width=800,
    height=750,
    hovermode="closest",
    mapbox=mapbox_cfg,
)

fig5 = go.Figure(data=mapp, layout=layout5)

py.iplot(fig5)

<a id = '2'></a>
# 2.Linear Regression


I want to predict the price from the living space (`sqft_living`).

In [1]:
# Fit price as a linear function of living space; both arrays are (n, 1) columns.
x = df[['sqft_living']].values
y = df[['price']].values
linear_reg = LinearRegression().fit(x, y)

# Predicted price for a house with 2000 square feet of living space
linear_reg.predict([[2000]])

In [1]:
# Hold-out score of a one-feature linear model.
# NOTE(review): the feature here is `sqft_living15`, while the regression fitted
# and plotted in this section uses `sqft_living` — confirm which one is intended.
from sklearn.model_selection import train_test_split

X = df[['sqft_living15']]
y = df['price'].values

# 70 % train / 30 % test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit on the training part, predict the held-out part
modelLR = LinearRegression().fit(X_train, y_train)
Y_pred = modelLR.predict(X_test)

# Score of the fitted model on the held-out part
modelLR.score(X_test, y_test)

In [1]:
# Data points with the fitted regression line on top
plt.figure(figsize=(10, 10))
plt.scatter(x, y, color='blue', label="Data", alpha=.1)
plt.plot(x, linear_reg.predict(x), color="yellow", label="Predicted Regression Line")
plt.xlabel("Living Space (sqft_living)", fontsize=15)
plt.ylabel("Price ($)", fontsize=15)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.legend()

# Hide the right and top borders of the axes
axes = plt.gca()
for side in ('right', 'top'):
    axes.spines[side].set_visible(False)

<a id = '3'></a>
# 3.t-Test

I think the average prices of the houses sold in 2014 and in 2015 are the same. Let's test it!

H0: average price of 2014 = average price of 2015 <br>
H1: average price of 2014 != average price of 2015

In [1]:
# Sale year as integers instead of strings
new_sal_year = df['sal_year'].astype(int)

In [1]:
# Remove the string-typed year column (done in place on the frame).
df.drop('sal_year', axis=1, inplace=True)

In [1]:
# Store the integer sale year under the new column name `sales_year`.
df['sales_year'] = new_sal_year

In [1]:
# Re-inspect the dtypes after the sale-year column was replaced.
df.info()

In [1]:
# Sale prices of the two groups compared by the t-test
first = df.loc[df["sales_year"] == 2014, "price"]
second = df.loc[df["sales_year"] == 2015, "price"]

In [1]:
# Price distribution of the houses sold in 2014
plt.figure(figsize=(10, 7))
ax = sns.distplot(first, color='crimson')
ax.set_title("Distribution of House Prices sold in 2014", y=1.015, fontsize=22)
ax.set_xlabel("price of house [$]", labelpad=14)
ax.set_ylabel("count of occurences", labelpad=14)
plt.show()

In [1]:
# Price distribution of the houses sold in 2015
plt.figure(figsize=(10, 7))
ax = sns.distplot(second, color='crimson')
ax.set_title("Distribution of House Prices in 2015", y=1.015, fontsize=22)
ax.set_xlabel("price of house [$]", labelpad=14)
ax.set_ylabel("count of occurences", labelpad=14)
plt.show()

In [1]:
# Box plot of the 2014 sale prices.
# The Series is passed as `x=` explicitly: newer seaborn releases treat the
# first positional argument as `data`, which would no longer guarantee that
# prices run along the x axis labelled below.
plt.figure(figsize=(9, 5))
sns.boxplot(x=first, color='crimson', saturation=0.9)
plt.title("Distribution of House Prices in 2014", y=1.015)
plt.xlabel("House Prices", labelpad=14)
plt.show()

In [1]:
# Box plot of the 2015 sale prices.
# The Series is passed as `x=` explicitly: newer seaborn releases treat the
# first positional argument as `data`, which would no longer guarantee that
# prices run along the x axis labelled below.
plt.figure(figsize=(9, 5))
sns.boxplot(x=second, color='crimson', saturation=0.9)
plt.title("Distribution of House Prices in 2015", y=1.015)
plt.xlabel("House Prices", labelpad=14);

In [1]:
# Probability plot of all sale prices against a normal distribution, drawn through pylab.
stats.probplot(df["price"], dist="norm", plot=pylab)

In [1]:
# Group sizes of the two sale years
df['sales_year'].value_counts()

In [1]:
# Sample sizes of the 2014 and 2015 groups
n_2014, n_2015 = len(first), len(second)
degrees_of_freedom = n_2014 + n_2015 - 2

# Two-tailed test at the 5 % level: each tail carries alpha / 2.
alpha = 0.05
two_tailed_test_prob_tail = alpha / 2

# Quantile of the t distribution at the tail probability, rounded to 3 decimals
t_critical = round(stats.t.ppf(two_tailed_test_prob_tail, degrees_of_freedom), 3)
print('point of t critical is: ', t_critical)

In [1]:

# Sale prices per year
first = df.loc[df["sales_year"] == 2014, "price"]
second = df.loc[df["sales_year"] == 2015, "price"]

# Two-sample t-test that does not assume equal variances (equal_var=False)
stats.ttest_ind(first, second, equal_var=False)

In [1]:
# Print the mean price of each group to see which one is larger.
for heading, prices in (("Mean price for 2014:", first), ("Mean price for 2015:", second)):
    print(heading)
    print(prices.mean())


In [1]:
# Overlaid price histograms of the two sale years
fig, ax = plt.subplots(figsize=(10, 10))
# 2014 prices, drawn half-transparent
ax.hist(first, alpha=0.5, label='2014')
# 2015 prices, drawn on top
ax.hist(second, label='2015')
ax.legend(loc='upper right')
ax.set_title("count of houses sold in 2014 and 2015")
plt.show()

The results show that we accept H0, which means the value lies within the confidence interval.


<a id ='4'></a>
# 4.Random Forest

In [1]:
# Feature: grade, target: price. Columns are selected by name, not by
# position, so the cell keeps working if columns are added or removed.
x = df[['grade']].values   # (n, 1) feature matrix
y = df['price'].values     # 1-D target, the shape scikit-learn expects

# Fit the forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x, y)

print('Price prediction at 7.8 point of Grade: ', rf.predict([[7.8]]))

# Evenly spaced grade values (step 0.1) over the observed range, used to draw
# the prediction curve. Array methods are used because the builtin min()/max()
# on a 2-D array return one-element arrays rather than scalars.
x_ = np.arange(x.min(), x.max(), 0.1).reshape(-1, 1)
y_head = rf.predict(x_)

# visualize
plt.figure(figsize=(10, 10))
plt.scatter(x, y, color="red", alpha=0.03)
plt.plot(x_, y_head, color="green")
plt.xlabel("grade")
plt.ylabel("price")
plt.show()

Let's find the accuracy score

In [1]:
# Remove the identifier and the plotting-colour column (in place).
df.drop(columns=['id', 'color'], inplace=True)

In [1]:
# 80-train , 20-test
# The target `grade` is excluded from the feature matrix. Using `df.values`
# as features would hand the classifier the answer (target leakage) and make
# the reported accuracy meaningless.
x = df.drop(columns=['grade']).values
y = df["grade"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2698)

In [1]:
# Shapes of the four parts produced by the split
for label, part in (
    ("x_train Shape : ", x_train),
    ("X_test Shape  : ", x_test),
    ("y_train Shape : ", y_train),
    ("y_test Shape  : ", y_test),
):
    print(label, part.shape)

In [1]:
# Train a random-forest classifier on the training part, then predict the test part.
rf = RandomForestClassifier(n_estimators=100, random_state=1).fit(x_train, y_train)
pred = rf.predict(x_test)

In [1]:
# Accuracy on the training part and on the held-out part
for label, features, target in (
    ("Accuracy of RandomForestClassifier is /Train set: ", x_train, y_train),
    ("Accuracy of RandomForestClassifier is /Test set : ", x_test, y_test),
):
    print(label, rf.score(features, target))

This kernel is my second step towards becoming a Data Scientist. I would like to hear your recommendations.

In [1]:
# Small forest whose individual trees are drawn further down.
# `random_state` is fixed so the same trees are built on every run;
# RandomForestRegressor is already imported in the notebook's import cell.
rf_model = RandomForestRegressor(n_estimators=10, random_state=42)

In [1]:
# Five-row sample for the small demonstration forest.
# `X_train` and `y_train` were produced by two different train/test splits
# (different features, test sizes and seeds), so taking `.head(5)` of each
# paired features and targets of unrelated houses. The feature rows are now
# looked up by the index labels of the chosen targets, so every row lines up.
top5yrf = y_train.head(5)
top5xrf = df.loc[top5yrf.index, ['sqft_living15']]

In [1]:
# Fit the small forest on the five-row sample.
rf_model.fit(top5xrf, top5yrf)

In [1]:
# Take the trees at positions 5 and 6 of the fitted forest.
estimator, estimator1 = rf_model.estimators_[5:7]

In [1]:
!pip install --upgrade scikit-learn==0.20.3

In [1]:
pip install pydotplus

In [1]:
# `sklearn.externals.six` has been removed from scikit-learn; the standard
# library's io.StringIO is a drop-in text buffer for export_graphviz.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Write the first selected tree as Graphviz DOT text into an in-memory buffer ...
dot_data1 = StringIO()
export_graphviz(estimator, out_file=dot_data1,
                filled=True, rounded=True,
                special_characters=True)
# ... then render the DOT text to a PNG and show it inline.
graph = pydotplus.graph_from_dot_data(dot_data1.getvalue())
Image(graph.create_png())

In [1]:
# `sklearn.externals.six` has been removed from scikit-learn; the standard
# library's io.StringIO is a drop-in text buffer for export_graphviz.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Write the second selected tree as Graphviz DOT text into an in-memory buffer ...
dot_data3 = StringIO()
export_graphviz(estimator1, out_file=dot_data3,
                filled=True, rounded=True,
                special_characters=True)
# ... then render the DOT text to a PNG and show it inline.
graph = pydotplus.graph_from_dot_data(dot_data3.getvalue())
Image(graph.create_png())