In [53]:
#import packages to be used for data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

In [54]:
# Load the King County house sales CSV into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='housesalesprediction/kc_house_data.csv')

In [55]:
# Report the dataset's dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
(n_rows, n_cols)

(21613, 21)

In [56]:
# Preview the first five rows to confirm that the column headers and row
# index were parsed as expected.
df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [57]:
# Inspect column dtypes / non-null counts, then the summary statistics.
# df.info() prints its report directly, so it can run anywhere in the cell.
# df.describe() only returns a DataFrame; a notebook displays just the last
# expression of a cell, so it has to come last or its table is silently dropped.
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
id               21613 non-null int64
date             21613 non-null object
price            21613 non-null float64
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB


In [58]:
# Keep a reference to the column labels and print them for a quick overview.
columns_list = df.columns
print(columns_list)

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')


In [59]:
# Check zip code coverage: collect the distinct zip codes in the dataset and
# count them, to judge whether the data is sufficient for King County predictions.
all_zip_codes = df['zipcode']
# np.sort returns a sorted copy instead of sorting in place, so the result must
# be assigned; otherwise zip_codes stays in first-seen order.
zip_codes = np.sort(all_zip_codes.unique())
len(zip_codes)

70

#### With 108 zip codes in the King County area, the 70 present in our dataset give a good representation of King County for house price prediction analysis.

In [60]:
# Pull out a handful of features and eyeball them for trends, to help decide
# which features to carry into the exploratory data analysis.
feature_columns = ['price', 'bedrooms', 'bathrooms', 'sqft_living']
subset = df[feature_columns]
subset.head(n=5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living
0,221900.0,3,1.0,1180
1,538000.0,3,2.25,2570
2,180000.0,2,1.0,770
3,604000.0,4,3.0,1960
4,510000.0,3,2.0,1680


In [61]:
# Select rows by integer position; on this dataset the row labels coincide
# with the positional index, so positions 0 and 56 are also labels 0 and 56.
row_positions = [0, 56]
df.iloc[row_positions]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
56,9478500640,20140819T000000,292500.0,4,2.5,2250,4495,2.0,0,0,...,7,2250,0,2008,0,98042,47.3663,-122.114,2250,4500


In [62]:
# Keep only the homes with the highest building grade (13, per the
# kingcounty.gov assessor's building grade scale) and compare against price.
is_top_grade = df['grade'] == 13
df.loc[is_top_grade, ['grade', 'price']]

Unnamed: 0,grade,price
4411,13,5570000.0
4811,13,2479000.0
5451,13,1780000.0
6041,13,2385000.0
7035,13,3800000.0
7252,13,7700000.0
7907,13,3200000.0
9254,13,6885000.0
10373,13,2983000.0
13411,13,2415000.0


In [63]:
# Narrow further to homes that have both the highest building grade (13) and
# a condition rating above 3, following the kingcounty.gov assessor ratings.
is_top_grade = df['grade'] == 13
is_high_condition = df['condition'] > 3
df.loc[is_top_grade & is_high_condition, ['grade', 'condition', 'price']]

Unnamed: 0,grade,condition,price
7035,13,4,3800000.0
7252,13,4,7700000.0


In [65]:
# Check for missing values: per-column counts first, then the grand total.
# Only a cell's last expression is displayed automatically, so the per-column
# Series is printed explicitly; computing it once also avoids scanning the
# frame twice.
missing_per_column = df.isnull().sum()
print(missing_per_column)
missing_per_column.sum()

0

### After data exploration, decide whether it is statistically important to add city names to the data table as one of its columns.