In [60]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import  train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


## 1. EDA

- Check Missing values
- Check Duplicates
- Check data type
- Check the number of unique values in each column
- Check statistics of the data set
- Check the correlation column then heatmap

In [61]:
house_df = pd.read_csv("/content/data.csv")
df = house_df.copy()

In [62]:
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [63]:
df.sample(2)

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
1627,2014-05-30 00:00:00,399000.0,2.0,1.0,940,4800,1.0,0,0,4,940,0,1911,1955,3209 63rd Ave SW,Seattle,WA 98116,USA
251,2014-05-07 00:00:00,314500.0,3.0,1.75,1870,12381,1.0,0,0,4,1870,0,1957,2001,29645 10th Pl S,Federal Way,WA 98003,USA


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [65]:
numeric_features = [feature for feature in house_df.columns if house_df[feature].dtype != 'O']
categorical_features = [feature for feature in house_df.columns if house_df[feature].dtype == 'O']

print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 13 numerical features : ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']

We have 5 categorical features : ['date', 'street', 'city', 'statezip', 'country']


## I) **Check for null items**

In [66]:
house_df.isnull().sum()

Unnamed: 0,0
date,0
price,0
bedrooms,0
bathrooms,0
sqft_living,0
sqft_lot,0
floors,0
waterfront,0
view,0
condition,0


In [67]:
house_df.duplicated().sum() # in rows

0

In [68]:
df.nunique() # total unique values in each features

Unnamed: 0,0
date,70
price,1741
bedrooms,10
bathrooms,26
sqft_living,566
sqft_lot,3113
floors,6
waterfront,2
view,5
condition,5


In [73]:
df[["bedrooms","bathrooms","floors"]]=df[["bedrooms","bathrooms","floors"]].astype(int)
df["date"]=pd.to_datetime(df["date"]).dt.date
df[["city","statezip","country"]]=df[["city","statezip","country"]].astype("category") #streets has too many unique values cant convert to category

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   date           4600 non-null   object  
 1   price          4600 non-null   float64 
 2   bedrooms       4600 non-null   int64   
 3   bathrooms      4600 non-null   int64   
 4   sqft_living    4600 non-null   int64   
 5   sqft_lot       4600 non-null   int64   
 6   floors         4600 non-null   int64   
 7   waterfront     4600 non-null   int64   
 8   view           4600 non-null   int64   
 9   condition      4600 non-null   int64   
 10  sqft_above     4600 non-null   int64   
 11  sqft_basement  4600 non-null   int64   
 12  yr_built       4600 non-null   int64   
 13  yr_renovated   4600 non-null   int64   
 14  street         4600 non-null   object  
 15  city           4600 non-null   category
 16  statezip       4600 non-null   category
 17  country        4600 non-null   ca

In [75]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,4600.0,551962.988473,563834.702547,0.0,322875.0,460943.461539,654962.5,26590000.0
bedrooms,4600.0,3.40087,0.908848,0.0,3.0,3.0,4.0,9.0
bathrooms,4600.0,1.788913,0.752185,0.0,1.0,2.0,2.0,8.0
sqft_living,4600.0,2139.346957,963.206916,370.0,1460.0,1980.0,2620.0,13540.0
sqft_lot,4600.0,14852.516087,35884.436145,638.0,5000.75,7683.0,11001.25,1074218.0
floors,4600.0,1.45913,0.552194,1.0,1.0,1.0,2.0,3.0
waterfront,4600.0,0.007174,0.084404,0.0,0.0,0.0,0.0,1.0
view,4600.0,0.240652,0.778405,0.0,0.0,0.0,0.0,4.0
condition,4600.0,3.451739,0.67723,1.0,3.0,3.0,4.0,5.0
sqft_above,4600.0,1827.265435,862.168977,370.0,1190.0,1590.0,2300.0,9410.0


In [76]:
numeric_features = [feature for feature in house_df.columns if house_df[feature].dtype not in ['O', 'category']]
categorical_features = [feature for feature in house_df.columns if house_df[feature].dtype in ['O', 'category']]

print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 13 numerical features : ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']

We have 5 categorical features : ['date', 'street', 'city', 'statezip', 'country']
