In [1]:
%matplotlib inline
# all the basic libraries that we need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'darkgrid' theme to all subsequent plots.
sns.set_style('darkgrid')

# pre-processing method
from sklearn.model_selection import train_test_split

# the regression models 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# methods and classes for evaluation
from sklearn import metrics
from math import sqrt
from sklearn.model_selection import cross_validate
import time

In [2]:
# Load the dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
caps_df = pd.read_csv(filepath_or_buffer='innercity.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
caps_df.head(5)

Unnamed: 0,cid,dayhours,price,room_bed,room_bath,living_measure,lot_measure,ceil,coast,sight,...,basement,yr_built,yr_renovated,zipcode,lat,long,living_measure15,lot_measure15,furnished,total_area
0,3034200666,20141107T000000,808100,4,3.25,3020,13457,1.0,0,0,...,0,1956,0,98133,47.7174,-122.336,2120,7553,1,16477
1,8731981640,20141204T000000,277500,4,2.5,2550,7500,1.0,0,0,...,800,1976,0,98023,47.3165,-122.386,2260,8800,0,10050
2,5104530220,20150420T000000,404000,3,2.5,2370,4324,2.0,0,0,...,0,2006,0,98038,47.3515,-121.999,2370,4348,0,6694
3,6145600285,20140529T000000,300000,2,1.0,820,3844,1.0,0,0,...,0,1916,0,98133,47.7049,-122.349,1520,3844,0,4664
4,8924100111,20150424T000000,699000,2,1.5,1400,4050,1.0,0,0,...,0,1954,0,98115,47.6768,-122.269,1900,5940,0,5450


In [4]:
# Print a per-column summary (name, non-null count, dtype) of the loaded frame.
caps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cid               21613 non-null  int64  
 1   dayhours          21613 non-null  object 
 2   price             21613 non-null  int64  
 3   room_bed          21613 non-null  int64  
 4   room_bath         21613 non-null  float64
 5   living_measure    21613 non-null  int64  
 6   lot_measure       21613 non-null  int64  
 7   ceil              21613 non-null  float64
 8   coast             21613 non-null  int64  
 9   sight             21613 non-null  int64  
 10  condition         21613 non-null  int64  
 11  quality           21613 non-null  int64  
 12  ceil_measure      21613 non-null  int64  
 13  basement          21613 non-null  int64  
 14  yr_built          21613 non-null  int64  
 15  yr_renovated      21613 non-null  int64  
 16  zipcode           21613 non-null  int64 

In [5]:
# Count missing (NaN/None) entries in each column; axis=0 sums down the rows.
caps_df.isna().sum(axis=0)

cid                 0
dayhours            0
price               0
room_bed            0
room_bath           0
living_measure      0
lot_measure         0
ceil                0
coast               0
sight               0
condition           0
quality             0
ceil_measure        0
basement            0
yr_built            0
yr_renovated        0
zipcode             0
lat                 0
long                0
living_measure15    0
lot_measure15       0
furnished           0
total_area          0
dtype: int64

In [6]:
# Dimensions of the frame as a (rows, columns) tuple.
caps_df.shape

(21613, 23)

In [7]:
# Storage dtype of every column, to spot columns that were not parsed as numeric.
caps_df.dtypes

cid                   int64
dayhours             object
price                 int64
room_bed              int64
room_bath           float64
living_measure        int64
lot_measure           int64
ceil                float64
coast                 int64
sight                 int64
condition             int64
quality               int64
ceil_measure          int64
basement              int64
yr_built              int64
yr_renovated          int64
zipcode               int64
lat                 float64
long                float64
living_measure15      int64
lot_measure15         int64
furnished             int64
total_area            int64
dtype: object

In [8]:
# Scan every column for the literal '?' string, a placeholder sometimes used
# for missing data that isna() does not detect.
# Iterating over caps_df.columns keeps the check in sync with the frame
# instead of relying on a hand-maintained list of column names, and
# Series.sum() counts the matches without a Python-level loop over the rows.
for column in caps_df.columns:
    print(column, ":", (caps_df[column] == '?').sum())

cid : 0
dayhours : 0
price : 0
room_bed : 0
room_bath : 0
living_measure : 0
lot_measure : 0
ceil : 0
coast : 0
sight : 0
condition : 0
quality : 0
ceil_measure : 0
basement : 0
yr_built : 0
yr_renovated : 0
zipcode : 0
lat : 0
long : 0
living_measure15 : 0
lot_measure15 : 0
furnished : 0
total_area : 0


In [9]:
# Summary statistics for the numeric columns, transposed so each feature
# is a row and each statistic a column.
caps_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cid,21613.0,4580302000.0,2876566000.0,1000102.0,2123049000.0,3904930000.0,7308900000.0,9900000000.0
price,21613.0,540182.2,367362.2,75000.0,321950.0,450000.0,645000.0,7700000.0
room_bed,21613.0,3.370842,0.9300618,0.0,3.0,3.0,4.0,33.0
room_bath,21613.0,2.114757,0.7701632,0.0,1.75,2.25,2.5,8.0
living_measure,21613.0,2079.9,918.4409,290.0,1427.0,1910.0,2550.0,13540.0
lot_measure,21613.0,15106.97,41420.51,520.0,5040.0,7618.0,10688.0,1651359.0
ceil,21613.0,1.494309,0.5399889,1.0,1.0,1.5,2.0,3.5
coast,21613.0,0.007541757,0.0865172,0.0,0.0,0.0,0.0,1.0
sight,21613.0,0.2343034,0.7663176,0.0,0.0,0.0,0.0,4.0
condition,21613.0,3.40943,0.650743,1.0,3.0,3.0,4.0,5.0
