# IMPORTS

In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Functions and Methods

## Loading Data

In [28]:
# Load the raw dataset once; `date` is parsed to datetime at read time so
# later cells can work with it as a timestamp rather than a string.
df_raw = pd.read_csv(
    'kc_house_data.csv',
    parse_dates=['date'],
)

# DATA DESCRIPTION

## Data Preview

In [29]:
# Show the first three rows as a quick sanity check of the loaded columns.
df_raw.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062


## Data Format and Types

In [30]:
# Print each column's dtype and non-null count (also shows the frame's size).
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             21613 non-null  int64         
 1   date           21613 non-null  datetime64[ns]
 2   price          21613 non-null  float64       
 3   bedrooms       21613 non-null  int64         
 4   bathrooms      21613 non-null  float64       
 5   sqft_living    21613 non-null  int64         
 6   sqft_lot       21613 non-null  int64         
 7   floors         21613 non-null  float64       
 8   waterfront     21613 non-null  int64         
 9   view           21613 non-null  int64         
 10  condition      21613 non-null  int64         
 11  grade          21613 non-null  int64         
 12  sqft_above     21613 non-null  int64         
 13  sqft_basement  21613 non-null  int64         
 14  yr_built       21613 non-null  int64         
 15  yr_renovated   2161

## NA Check and Treatment

In [15]:
# Count missing values per column to see whether any NA treatment is needed.
df_raw.isna().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

## Descriptive Statistics

### Numerical Features

In [84]:
# Summary table for the numeric columns: the usual describe() statistics plus
# skewness and kurtosis, one row per statistic and one column per feature.

# numerical attributes
df_raw_num_att = df_raw.select_dtypes(include=['int64', 'float64'])

# skewness as a single-row frame labelled 'skew'
skew = df_raw_num_att.skew().round(2).rename('skew').to_frame().T

# kurtosis as a single-row frame labelled 'kurtosis'
kurtosis = df_raw_num_att.kurtosis().round(2).rename('kurtosis').to_frame().T

# Describe the numeric-only frame (not df_raw): newer pandas versions also
# summarise datetime columns in describe(), which would add a `date` column
# that has no skew/kurtosis and misalign the table.
describe = df_raw_num_att.describe().round(2)
m = pd.concat([describe, skew, kurtosis], axis=0)

# Statistics of the `id` column are not meaningful; drop it by name rather
# than by position so a change in column order cannot hide a real feature.
m.drop(columns='id')

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,540088.14,3.37,2.11,2079.9,15106.97,1.49,0.01,0.23,3.41,7.66,1788.39,291.51,1971.01,84.4,98077.94,47.56,-122.21,1986.55,12768.46
std,367127.2,0.93,0.77,918.44,41420.51,0.54,0.09,0.77,0.65,1.18,828.09,442.58,29.37,401.68,53.51,0.14,0.14,685.39,27304.18
min,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.16,-122.52,399.0,651.0
25%,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.47,-122.33,1490.0,5100.0
50%,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.57,-122.23,1840.0,7620.0
75%,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.68,-122.12,2360.0,10083.0
max,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.78,-121.32,6210.0,871200.0
skew,4.02,1.97,0.51,1.47,13.06,0.62,11.39,3.4,1.03,0.77,1.45,1.58,-0.47,4.55,0.41,-0.49,0.89,1.11,9.51
kurtosis,34.59,49.06,1.28,5.24,285.08,-0.48,127.63,10.89,0.53,1.19,3.4,2.72,-0.66,18.7,-0.85,-0.68,1.05,1.6,150.76


### Categorical Features

# QUESTIONS & HYPOTHESES

**1.** On average, does higher square footage of the land space lead to higher house prices?

**2.** On average, does higher square footage of the interior living space lead to higher house prices?

**3.** Does the number of bathrooms and bedrooms affect house prices?

**4.** On average, when is the best time (month or season) to sell and to buy a house?

**5.** Is sqft_above more important than sqft_basement in determining the house price?

**6.** On average, 5 condition rated houses are 40% more expensive.

**7.** Houses that are waterfront are 50% more expensive.

**8.** The Year over Year increase in price is 5%.

**9.** After renovation, house prices tend to increase by 10%.

**10.** For each grade level (construction and design) the house price increases 4%.

**11.** For each view level the house price increases 2%.

# DATAFRAME FILTERING

## Line Filtering

## Columns Selection

# FEATURE ENGINEERING

# HYPOTHESIS RESOLUTION

# EXPLORATORY DATA ANALYSIS

## Univariate Analysis

### Response Variable

### Numerical Variable

### Categorical Variable

## Bivariate Analysis

## Multivariate Analysis

### Numerical Features

### Categorical Features

# OVERVIEW AND NEXT STEPS