In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,5/2/2014 0:00,313000.0,3.0,1.5,1340.0,7912.0,1.5,0.0,0.0,3,1340.0,0.0,1955.0,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,5/2/2014 0:00,2384000.0,5.0,2.5,3650.0,9050.0,2.0,0.0,4.0,5,3370.0,280.0,1921.0,0,709 W Blaine St,Seattle,WA 98119,USA
2,5/2/2014 0:00,342000.0,3.0,2.0,1930.0,11947.0,1.0,0.0,0.0,4,1930.0,0.0,1966.0,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,5/2/2014 0:00,420000.0,3.0,2.25,2000.0,8030.0,1.0,0.0,0.0,4,1000.0,1000.0,1963.0,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,5/2/2014 0:00,550000.0,4.0,2.5,1940.0,10500.0,1.0,0.0,0.0,4,1140.0,800.0,1976.0,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [3]:
df.tail()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
4595,7/9/2014 0:00,308166.6667,3.0,1.75,1510.0,6360.0,1.0,0.0,0.0,4,1510.0,0.0,1954.0,1979,501 N 143rd St,Seattle,WA 98133,USA
4596,7/9/2014 0:00,534333.3333,3.0,,1460.0,7573.0,2.0,0.0,0.0,3,1460.0,0.0,1983.0,2009,14855 SE 10th Pl,Bellevue,WA 98007,USA
4597,7/9/2014 0:00,416904.1667,3.0,,3010.0,7014.0,2.0,0.0,0.0,3,3010.0,0.0,2009.0,0,759 Ilwaco Pl NE,Renton,WA 98059,USA
4598,7/10/2014 0:00,203400.0,4.0,2.0,2090.0,6630.0,1.0,0.0,0.0,3,1070.0,1020.0,1974.0,0,5148 S Creston St,Seattle,WA 98178,USA
4599,7/10/2014 0:00,220600.0,3.0,2.5,1490.0,8102.0,2.0,0.0,0.0,4,1490.0,0.0,1990.0,0,18717 SE 258th St,Covington,WA 98042,USA


In [4]:
print(f"Number of rows {df.shape[0]}, Number of columns {df.shape[1]}")
print(df.shape)

Number of rows 4600, Number of columns 18
(4600, 18)


In [5]:
print("-- Attributes in Data --")
for cols in df.columns:
    print(cols)

-- Attributes in Data --
date
price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
sqft_above
sqft_basement
yr_built
yr_renovated
street
city
statezip
country


In [6]:
df.count()

date             4600
price            4597
bedrooms         4595
bathrooms        4598
sqft_living      4598
sqft_lot         4598
floors           4598
waterfront       4595
view             4598
condition        4600
sqft_above       4597
sqft_basement    4597
yr_built         4594
yr_renovated     4600
street           4597
city             4596
statezip         4597
country          4594
dtype: int64

In [7]:
df['city'].unique()

array(['Shoreline', 'Seattle', 'Kent', 'Bellevue', 'Redmond',
       'Maple Valley', 'North Bend', 'Lake Forest Park', 'Sammamish',
       'Auburn', 'Des Moines', 'Bothell', 'Federal Way', 'Kirkland',
       'Issaquah', 'Woodinville', 'Normandy Park', 'Fall City', 'Renton',
       'Carnation', 'Snoqualmie', 'Duvall', 'Burien', 'Covington',
       'Inglewood-Finn Hill', 'Kenmore', 'Newcastle', 'Mercer Island',
       'Black Diamond', 'Ravensdale', 'Clyde Hill', 'Algona', 'Skykomish',
       'Tukwila', 'Vashon', 'Yarrow Point', 'SeaTac', 'Medina',
       'Enumclaw', 'Snoqualmie Pass', 'Pacific', 'Beaux Arts Village',
       'Preston', 'Milton', nan], dtype=object)

In [8]:
print("-- Number of Unique Values in Data --")
print(df.nunique())

-- Number of Unique Values in Data --
date               70
price            1738
bedrooms           10
bathrooms          26
sqft_living       566
sqft_lot         3112
floors              6
waterfront          2
view                5
condition           5
sqft_above        511
sqft_basement     207
yr_built          115
yr_renovated       60
street           4522
city               44
statezip           77
country             1
dtype: int64


In [9]:
print("-- Number of Null Values in Data --")
print(df.isnull().sum())

-- Number of Null Values in Data --
date             0
price            3
bedrooms         5
bathrooms        2
sqft_living      2
sqft_lot         2
floors           2
waterfront       5
view             2
condition        0
sqft_above       3
sqft_basement    3
yr_built         6
yr_renovated     0
street           3
city             4
statezip         3
country          6
dtype: int64


In [10]:

print("-- Details of Data --")
df.describe()

-- Details of Data --


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
count,4597.0,4595.0,4598.0,4598.0,4598.0,4598.0,4595.0,4598.0,4600.0,4597.0,4597.0,4594.0,4600.0
mean,552119.0,3.401088,2.160668,2139.498913,14856.07,1.511853,0.007182,0.240757,3.451739,1827.509463,312.122036,1970.799086,808.608261
std,563983.4,0.908983,0.78392,963.387024,35891.77,0.537905,0.084449,0.778558,0.67723,862.13854,464.228562,29.740666,979.414536
min,0.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,0.0,1900.0,0.0
25%,323833.3,3.0,1.75,1460.0,5001.0,1.0,0.0,0.0,3.0,1190.0,0.0,1951.0,0.0
50%,461000.0,3.0,2.25,1980.0,7683.0,1.5,0.0,0.0,3.0,1590.0,0.0,1976.0,0.0
75%,655000.0,4.0,2.5,2620.0,11003.75,2.0,0.0,0.0,4.0,2300.0,610.0,1997.0,1999.0
max,26590000.0,9.0,8.0,13540.0,1074218.0,3.5,1.0,4.0,5.0,9410.0,4820.0,2014.0,2014.0


In [11]:
print("-- Insights of Data --")
df.info()

-- Insights of Data --
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4597 non-null   float64
 2   bedrooms       4595 non-null   float64
 3   bathrooms      4598 non-null   float64
 4   sqft_living    4598 non-null   float64
 5   sqft_lot       4598 non-null   float64
 6   floors         4598 non-null   float64
 7   waterfront     4595 non-null   float64
 8   view           4598 non-null   float64
 9   condition      4600 non-null   int64  
 10  sqft_above     4597 non-null   float64
 11  sqft_basement  4597 non-null   float64
 12  yr_built       4594 non-null   float64
 13  yr_renovated   4600 non-null   int64  
 14  street         4597 non-null   object 
 15  city           4596 non-null   object 
 16  statezip       4597 non-null   object 
 17  country        4594 non-null 

In [12]:
print("-- Number of Null Values in Data --")
print(df.isnull().sum())

-- Number of Null Values in Data --
date             0
price            3
bedrooms         5
bathrooms        2
sqft_living      2
sqft_lot         2
floors           2
waterfront       5
view             2
condition        0
sqft_above       3
sqft_basement    3
yr_built         6
yr_renovated     0
street           3
city             4
statezip         3
country          6
dtype: int64


In [13]:
def fillNaObjMode(col):
    for i in col:
        df[i] = df[i].fillna(df[i].mode()[0])

columns = ['street', 'city', 'statezip', 'country']
fillNaObjMode(columns)

In [14]:
def fillNaMean(col):
    for i in col:
        df[i] = df[i].fillna(df[i].mean())

columns = ['price','sqft_living','sqft_lot','sqft_above','sqft_basement']
fillNaMean(columns)

In [15]:
def fillNaMode(col):
    for i in col:
        df[i] = df[i].fillna(df[i].mode()[0])

columns = ['bedrooms','bathrooms','floors','waterfront','view','yr_built']
fillNaMode(columns)

In [16]:
df.drop('date', axis=1, inplace=True)
df.drop('street', axis=1, inplace=True)

In [17]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,statezip,country
0,313000.0,3.0,1.5,1340.0,7912.0,1.5,0.0,0.0,3,1340.0,0.0,1955.0,2005,Shoreline,WA 98133,USA
1,2384000.0,5.0,2.5,3650.0,9050.0,2.0,0.0,4.0,5,3370.0,280.0,1921.0,0,Seattle,WA 98119,USA
2,342000.0,3.0,2.0,1930.0,11947.0,1.0,0.0,0.0,4,1930.0,0.0,1966.0,0,Kent,WA 98042,USA
3,420000.0,3.0,2.25,2000.0,8030.0,1.0,0.0,0.0,4,1000.0,1000.0,1963.0,0,Bellevue,WA 98008,USA
4,550000.0,4.0,2.5,1940.0,10500.0,1.0,0.0,0.0,4,1140.0,800.0,1976.0,1992,Redmond,WA 98052,USA


In [18]:
print(df.isnull().sum())

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
city             0
statezip         0
country          0
dtype: int64


In [19]:
def changetoint64(col):
    for i in col:
        df[i] = df[i].astype('int64')
        
columns =[
    'price','bedrooms','bathrooms','sqft_living','sqft_lot',
    'floors','waterfront','view','sqft_above','sqft_basement',
    'yr_built']
changetoint64(columns)

In [20]:
print("-- Insights of Data --")
df.info()

-- Insights of Data --
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   price          4600 non-null   int64 
 1   bedrooms       4600 non-null   int64 
 2   bathrooms      4600 non-null   int64 
 3   sqft_living    4600 non-null   int64 
 4   sqft_lot       4600 non-null   int64 
 5   floors         4600 non-null   int64 
 6   waterfront     4600 non-null   int64 
 7   view           4600 non-null   int64 
 8   condition      4600 non-null   int64 
 9   sqft_above     4600 non-null   int64 
 10  sqft_basement  4600 non-null   int64 
 11  yr_built       4600 non-null   int64 
 12  yr_renovated   4600 non-null   int64 
 13  city           4600 non-null   object
 14  statezip       4600 non-null   object
 15  country        4600 non-null   object
dtypes: int64(13), object(3)
memory usage: 575.1+ KB


In [21]:
df['city'].unique()

array(['Shoreline', 'Seattle', 'Kent', 'Bellevue', 'Redmond',
       'Maple Valley', 'North Bend', 'Lake Forest Park', 'Sammamish',
       'Auburn', 'Des Moines', 'Bothell', 'Federal Way', 'Kirkland',
       'Issaquah', 'Woodinville', 'Normandy Park', 'Fall City', 'Renton',
       'Carnation', 'Snoqualmie', 'Duvall', 'Burien', 'Covington',
       'Inglewood-Finn Hill', 'Kenmore', 'Newcastle', 'Mercer Island',
       'Black Diamond', 'Ravensdale', 'Clyde Hill', 'Algona', 'Skykomish',
       'Tukwila', 'Vashon', 'Yarrow Point', 'SeaTac', 'Medina',
       'Enumclaw', 'Snoqualmie Pass', 'Pacific', 'Beaux Arts Village',
       'Preston', 'Milton'], dtype=object)

In [22]:
df['statezip'].unique()

array(['WA 98133', 'WA 98119', 'WA 98042', 'WA 98008', 'WA 98052',
       'WA 98115', 'WA 98038', 'WA 98045', 'WA 98155', 'WA 98105',
       'WA 98074', 'WA 98106', 'WA 98007', 'WA 98092', 'WA 98198',
       'WA 98006', 'WA 98102', 'WA 98011', 'WA 98125', 'WA 98003',
       'WA 98136', 'WA 98033', 'WA 98029', 'WA 98117', 'WA 98034',
       'WA 98072', 'WA 98023', 'WA 98107', 'WA 98166', 'WA 98116',
       'WA 98024', 'WA 98055', 'WA 98077', 'WA 98027', 'WA 98059',
       'WA 98075', 'WA 98014', 'WA 98065', 'WA 98199', 'WA 98053',
       'WA 98058', 'WA 98122', 'WA 98103', 'WA 98112', 'WA 98005',
       'WA 98118', 'WA 98177', 'WA 98004', 'WA 98019', 'WA 98144',
       'WA 98168', 'WA 98001', 'WA 98056', 'WA 98146', 'WA 98028',
       'WA 98148', 'WA 98057', 'WA 98040', 'WA 98010', 'WA 98051',
       'WA 98031', 'WA 98109', 'WA 98030', 'WA 98126', 'WA 98032',
       'WA 98178', 'WA 98288', 'WA 98108', 'WA 98070', 'WA 98188',
       'WA 98002', 'WA 98039', 'WA 98022', 'WA 98068', 'WA 980

In [23]:
df['country'].unique()

array(['USA'], dtype=object)

In [24]:
df_encoded = df.copy()

In [25]:
def encodeCols(cols):
    for i in cols:
        data = pd.DataFrame({i:df[i].unique()})
        data_label_encoder = LabelEncoder()
        data_label_encoder.fit(np.ravel(data))
        df_encoded[i] = data_label_encoder.transform(df[i]) 

columns = ['city','statezip','country']
encodeCols(columns)

In [30]:
# Labels
country = pd.DataFrame({'country':df['country'].unique()})

# Initializing Label Encoders
country_label_encoder = LabelEncoder()

# Training Label Encoder
country_label_encoder.fit(np.ravel(country))

In [31]:
df_encoded['country'] = country_label_encoder.transform(df['country']) 

In [32]:
# All the Attributes are Numerical 
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   price          4600 non-null   int64
 1   bedrooms       4600 non-null   int64
 2   bathrooms      4600 non-null   int64
 3   sqft_living    4600 non-null   int64
 4   sqft_lot       4600 non-null   int64
 5   floors         4600 non-null   int64
 6   waterfront     4600 non-null   int64
 7   view           4600 non-null   int64
 8   condition      4600 non-null   int64
 9   sqft_above     4600 non-null   int64
 10  sqft_basement  4600 non-null   int64
 11  yr_built       4600 non-null   int64
 12  yr_renovated   4600 non-null   int64
 13  city           4600 non-null   int32
 14  statezip       4600 non-null   int32
 15  country        4600 non-null   int32
dtypes: int32(3), int64(13)
memory usage: 521.2 KB


In [33]:
df_encoded.to_csv(r'encoded-data.csv', index = False, header = True)

In [34]:
traindata, testdata = train_test_split(df_encoded, test_size=0.2, shuffle=False)

In [35]:
traindata.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,statezip,country
0,313000,3,1,1340,7912,1,0,0,3,1340,0,1955,2005,36,62,0
1,2384000,5,2,3650,9050,2,0,4,5,3370,280,1921,0,35,58,0


In [36]:
testdata.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,statezip,country
3680,500000,4,3,3720,15048,3,0,0,3,3720,0,1979,2014,32,37,0
3681,570000,3,2,1890,29185,1,0,0,3,1470,420,1949,2013,3,5,0


In [37]:
X = df_encoded.drop('price', axis=1)
y = df_encoded['price']

In [38]:
train_x = traindata.iloc[:, 1:]
train_x.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,statezip,country
0,3,1,1340,7912,1,0,0,3,1340,0,1955,2005,36,62,0
1,5,2,3650,9050,2,0,4,5,3370,280,1921,0,35,58,0
2,3,2,1930,11947,1,0,0,4,1930,0,1966,0,18,26,0
3,3,2,2000,8030,1,0,0,4,1000,1000,1963,0,3,7,0
4,4,2,1940,10500,1,0,0,4,1140,800,1976,1992,31,31,0


In [39]:
train_y = traindata.iloc[:, 0]
train_y.head()

0     313000
1    2384000
2     342000
3     420000
4     550000
Name: price, dtype: int64

In [40]:
test_x = testdata.iloc[:, 1:]
test_x.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,statezip,country
3680,4,3,3720,15048,3,0,0,3,3720,0,1979,2014,32,37,0
3681,3,2,1890,29185,1,0,0,3,1470,420,1949,2013,3,5,0
3682,4,2,2680,12215,1,1,4,3,1590,1090,1956,2001,26,68,0
3683,3,2,1460,1613,2,0,0,3,1180,280,2007,0,35,56,0
3684,3,2,3180,7904,1,0,0,3,1810,1370,2006,0,32,34,0


In [41]:
test_y = testdata.iloc[:, 0]
test_y.head()

3680     500000
3681     570000
3682    1309500
3683     544000
3684     439000
Name: price, dtype: int64

In [42]:
# Splitting data with shuffle for randomness
train_data, test_data = train_test_split(df_encoded, test_size=0.2, random_state=42, shuffle=True)

# Preparing training and testing datasets
train_x = train_data.drop('price', axis=1)  # Features
train_y = train_data['price']              # Target variable
test_x = test_data.drop('price', axis=1)
test_y = test_data['price']

# Initialize and train Gradient Boosting Regressor
model_gbr = GradientBoostingRegressor(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3)
model_gbr.fit(train_x, train_y)

# Predictions on test data
predictions = model_gbr.predict(test_x)

# Evaluate the model
mae = mean_absolute_error(test_y, predictions)
mse = mean_squared_error(test_y, predictions)
r2 = r2_score(test_y, predictions)

print("Gradient Boosting Regressor Model Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")


Gradient Boosting Regressor Model Performance:
Mean Absolute Error (MAE): 168904.46
Mean Squared Error (MSE): 966675644365.72
R-squared Score: 0.05
