### Import packages

In [264]:
import numpy as np 
#handle the datasets
import pandas as pd 
#preprocessing 
from sklearn.preprocessing import StandardScaler
#feature selection
from sklearn.linear_model import LassoCV
#splitting the dataset
from sklearn.model_selection import train_test_split
#Linear Regression
from sklearn.linear_model import LinearRegression
#Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
#regression evaluation metrics (MAE, MSE, R^2)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor
#lasso model
from sklearn.linear_model import Lasso

### Data loading and preprocessing

In [265]:
# Load the raw housing dataset from the CSV file in the working directory
df  = pd.read_csv("data.csv")

# Print column names, non-null counts and dtypes to spot missing values or mistyped columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [266]:
# Preview the first five rows of the raw data
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


Remove the outliers from important features

In [267]:
# Columns whose extreme values are removed by the IQR filter
columns_to_filter = [
    'price',
    'sqft_living',
    'sqft_lot',
]

In [268]:
# Remove outliers using the IQR method (Tukey's fences: keep values within
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]).
# The columns are filtered one after another, so the quartiles of each column
# are computed on the rows that survived the filters of the previous columns.
for column in columns_to_filter:
    Q1 = df[column].quantile(0.25)         # 1st quartile
    Q3 = df[column].quantile(0.75)         # 3rd quartile
    IQR = Q3 - Q1                          # interquartile range
    lower_bound = Q1 - 1.5 * IQR           # lower fence for outlier detection
    upper_bound = Q3 + 1.5 * IQR           # upper fence for outlier detection

    # Keep only the rows whose value lies inside the fences (bounds inclusive).
    # .copy() turns the filtered result into an independent frame so that the
    # column assignments made in later cells do not raise SettingWithCopyWarning.
    in_range = (df[column] >= lower_bound) & (df[column] <= upper_bound)
    df = df[in_range].copy()

Create features [Feature Engineering Part]

In [269]:
# Year against which the building age is measured.
# NOTE(review): the previewed rows are sales dated 2014, so this is not the age
# at the time of sale; it only shifts `age` by a constant.
REFERENCE_YEAR = 2024

# Create [age] feature based on the year built
df['age'] = REFERENCE_YEAR - df['yr_built']

# Create a binary [rennovated] feature: 0 when yr_renovated is 0, otherwise 1.
# Vectorized comparison instead of a row-wise apply; int64 matches the dtype
# produced by the original lambda.
df['rennovated'] = (df['yr_renovated'] != 0).astype('int64')

In [270]:
# Exploratory checks on the dataset columns (kept commented out; uncomment to run)

#print(df['street'].unique()) # this column is not needed for model training [a lot of unique values]
#print(df['statezip'].unique())
#print(df['city'].unique())
#print(df['country'].unique())

# number of unique values per column
#print(df['street'].nunique())
#print(df['statezip'].nunique())
#print(df['city'].nunique())
#print(df['country'].nunique())
#print(df['waterfront'].nunique())

#print(df['waterfront'].value_counts()) # this column is imbalanced
#print(df['sqft_basement'].value_counts()) # this column is imbalanced



In [271]:
# Standardize the numerical features (StandardScaler: zero mean, unit variance).
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split made later in the notebook, so test rows contribute to the scaling
# statistics.
numeric_columns = [
    'bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'condition',
    'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'age',
]
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [272]:
# Inspect the first ten rows after outlier removal and standardization
df.head(10)

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country,age,rennovated
0,2014-05-02 00:00:00,313000.0,-0.367351,-0.794692,-0.833261,0.203026,0.01914,-0.039778,-0.253753,-0.670496,-0.465458,-0.683069,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA,0.490665,1
2,2014-05-02 00:00:00,342000.0,-0.367351,-0.075721,-0.003645,1.351127,-0.901053,-0.039778,-0.253753,0.814369,0.400965,-0.683069,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA,0.128492,0
3,2014-05-02 00:00:00,420000.0,-0.367351,0.283765,0.094784,0.236601,-0.901053,-0.039778,-0.253753,0.814369,-0.964753,1.795109,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA,0.227266,0
4,2014-05-02 00:00:00,550000.0,0.771587,0.64325,0.010416,0.939404,-0.901053,-0.039778,-0.253753,0.814369,-0.759161,1.299474,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA,-0.200757,1
5,2014-05-02 00:00:00,490000.0,-1.506289,-1.513663,-1.480081,-0.232882,-0.901053,-0.039778,-0.253753,-0.670496,-1.140974,-0.683069,1938,1994,522 NE 88th St,Seattle,WA 98115,USA,1.050387,1
6,2014-05-02 00:00:00,335000.0,-1.506289,-0.075721,-0.8192,-1.319808,-0.901053,-0.039778,-0.253753,-0.670496,-0.450773,-0.683069,1976,0,2616 174th Ave NE,Redmond,WA 98052,USA,-0.200757,0
9,2014-05-02 00:00:00,640000.0,0.771587,-0.075721,-0.580158,-0.284099,0.01914,-0.039778,-0.253753,-0.670496,-0.201126,-0.683069,1945,2010,6811 55th Ave NE,Seattle,WA 98115,USA,0.819914,1
10,2014-05-02 00:00:00,463000.0,-0.367351,-0.435207,-0.312993,0.034581,-0.901053,-0.039778,-0.253753,-0.670496,0.077892,-0.683069,1948,1994,Burke-Gilman Trail,Lake Forest Park,WA 98155,USA,0.721139,1
12,2014-05-02 00:00:00,588500.0,-0.367351,-0.435207,0.558807,2.189083,-0.901053,-0.039778,-0.253753,-0.670496,0.459705,0.209075,1980,0,1833 220th Pl NE,Sammamish,WA 98074,USA,-0.332456,0
13,2014-05-02 00:00:00,365000.0,-0.367351,-1.513663,-1.184794,-0.217233,-0.901053,-0.039778,-0.253753,0.814369,-0.832586,-0.683069,1955,2009,2504 SW Portland Ct,Seattle,WA 98106,USA,0.490665,1


In [273]:
# Extract the state code (first run of capital letters) from statezip.
# expand=False makes str.extract return a Series rather than a one-column
# DataFrame, which is the natural type to assign to a single column.
df['state'] = df['statezip'].str.extract(r'([A-Z]+)', expand=False)

In [274]:
# Check that the new `state` column was populated
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,sqft_basement,yr_built,yr_renovated,street,city,statezip,country,age,rennovated,state
0,2014-05-02 00:00:00,313000.0,-0.367351,-0.794692,-0.833261,0.203026,0.01914,-0.039778,-0.253753,-0.670496,...,-0.683069,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA,0.490665,1,WA
2,2014-05-02 00:00:00,342000.0,-0.367351,-0.075721,-0.003645,1.351127,-0.901053,-0.039778,-0.253753,0.814369,...,-0.683069,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA,0.128492,0,WA
3,2014-05-02 00:00:00,420000.0,-0.367351,0.283765,0.094784,0.236601,-0.901053,-0.039778,-0.253753,0.814369,...,1.795109,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA,0.227266,0,WA
4,2014-05-02 00:00:00,550000.0,0.771587,0.64325,0.010416,0.939404,-0.901053,-0.039778,-0.253753,0.814369,...,1.299474,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA,-0.200757,1,WA
5,2014-05-02 00:00:00,490000.0,-1.506289,-1.513663,-1.480081,-0.232882,-0.901053,-0.039778,-0.253753,-0.670496,...,-0.683069,1938,1994,522 NE 88th St,Seattle,WA 98115,USA,1.050387,1,WA


In [275]:
# Discard the columns that are not used as model inputs
unused_columns = ['street', 'statezip', 'yr_built', 'yr_renovated', 'date', 'waterfront', 'sqft_lot']
df = df.drop(columns=unused_columns)

In [276]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of every column so the dummy columns are not perfectly collinear
categorical_columns = ['city', 'state', 'country']
df = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
)

In [277]:
# Names of the boolean columns of the encoded frame
boolColumns = df.select_dtypes(include=np.bool_).columns

# Re-encode those columns as integers (False -> 0, True -> 1)
df = df.astype(dict.fromkeys(boolColumns, int))

In [278]:
# Guarantee a frame without missing values and without duplicated rows
# (df.info() on the raw data reported no nulls, so dropna is a safeguard).
df = df.dropna().drop_duplicates()


In [279]:
# Renumber the rows 0..n-1 (earlier row filtering left gaps in the index);
# drop=True discards the old index instead of keeping it as a column
df = df.reset_index(drop=True)

In [280]:
# Inspect the fully preprocessed frame (scaled numerics plus city dummies)
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,view,condition,sqft_above,sqft_basement,age,...,city_SeaTac,city_Seattle,city_Shoreline,city_Skykomish,city_Snoqualmie,city_Snoqualmie Pass,city_Tukwila,city_Vashon,city_Woodinville,city_Yarrow Point
0,313000.0,-0.367351,-0.794692,-0.833261,0.01914,-0.253753,-0.670496,-0.465458,-0.683069,0.490665,...,0,0,1,0,0,0,0,0,0,0
1,342000.0,-0.367351,-0.075721,-0.003645,-0.901053,-0.253753,0.814369,0.400965,-0.683069,0.128492,...,0,0,0,0,0,0,0,0,0,0
2,420000.0,-0.367351,0.283765,0.094784,-0.901053,-0.253753,0.814369,-0.964753,1.795109,0.227266,...,0,0,0,0,0,0,0,0,0,0
3,550000.0,0.771587,0.64325,0.010416,-0.901053,-0.253753,0.814369,-0.759161,1.299474,-0.200757,...,0,0,0,0,0,0,0,0,0,0
4,490000.0,-1.506289,-1.513663,-1.480081,-0.901053,-0.253753,-0.670496,-1.140974,-0.683069,1.050387,...,0,1,0,0,0,0,0,0,0,0


In [304]:
# Pairwise correlation matrix of all remaining columns, including the target `price`
print(df.corr())
# NOTE(review): the line below looks like a recorded correlation value for sqft_lot
# (presumably with price, noted before that column was dropped) — confirm
#sqft_lot                 -0.004268


                             price  bedrooms  bathrooms  sqft_living  \
price                     1.000000  0.265977   0.416876     0.580886   
bedrooms                  0.265977  1.000000   0.507908     0.618635   
bathrooms                 0.416876  0.507908   1.000000     0.707323   
sqft_living               0.580886  0.618635   0.707323     1.000000   
floors                    0.253062  0.145497   0.501151     0.330480   
view                      0.207945  0.067440   0.119355     0.185361   
condition                 0.056003  0.018195  -0.146757    -0.075186   
sqft_above                0.470193  0.468839   0.621533     0.832736   
sqft_basement             0.230014  0.298804   0.197404     0.356664   
age                      -0.020208 -0.153869  -0.521909    -0.323901   
rennovated               -0.066672 -0.066922  -0.250470    -0.144190   
city_Auburn              -0.189562  0.022850   0.002466    -0.023627   
city_Beaux Arts Village   0.021147 -0.005980  -0.007074    -0.01

In [282]:
# Separate the target from the predictors

# target: the sale price column
y = df['price']

# features: every column except 'price'
x = df.drop(columns='price')

In [283]:
# Label the output, then display the first five rows of the feature matrix
print("Features (X):")
x.head()

Features (X):


Unnamed: 0,bedrooms,bathrooms,sqft_living,floors,view,condition,sqft_above,sqft_basement,age,rennovated,...,city_SeaTac,city_Seattle,city_Shoreline,city_Skykomish,city_Snoqualmie,city_Snoqualmie Pass,city_Tukwila,city_Vashon,city_Woodinville,city_Yarrow Point
0,-0.367351,-0.794692,-0.833261,0.01914,-0.253753,-0.670496,-0.465458,-0.683069,0.490665,1,...,0,0,1,0,0,0,0,0,0,0
1,-0.367351,-0.075721,-0.003645,-0.901053,-0.253753,0.814369,0.400965,-0.683069,0.128492,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.367351,0.283765,0.094784,-0.901053,-0.253753,0.814369,-0.964753,1.795109,0.227266,0,...,0,0,0,0,0,0,0,0,0,0
3,0.771587,0.64325,0.010416,-0.901053,-0.253753,0.814369,-0.759161,1.299474,-0.200757,1,...,0,0,0,0,0,0,0,0,0,0
4,-1.506289,-1.513663,-1.480081,-0.901053,-0.253753,-0.670496,-1.140974,-0.683069,1.050387,1,...,0,1,0,0,0,0,0,0,0,0


In [284]:
# Label the output, then display the first five target values (sale prices)
print("\nTarget (y):")
y.head()


Target (y):


0    313000.0
1    342000.0
2    420000.0
3    550000.0
4    490000.0
Name: price, dtype: float64

In [285]:
# Split the data into train and test sets: 20% of the rows are held out for
# testing; random_state=42 makes the shuffle reproducible
x_train, x_test, y_train, y_test = train_test_split(x , y, test_size=0.2, random_state=42)

In [286]:
# Lasso whose regularization strength is picked by 4-fold cross-validation,
# fitted on the training split only
lasso = LassoCV(cv=4)
lasso.fit(x_train, y_train)

In [287]:
# Label every Lasso coefficient with the name of its feature
coef = pd.Series(lasso.coef_, index=x_train.columns)

# Features with a non-zero coefficient are the ones Lasso kept
selected_features = coef[coef != 0].index
print('Selected features by Lasso:', selected_features)

Selected features by Lasso: Index(['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'view', 'condition',
       'sqft_above', 'age', 'city_Auburn', 'city_Bellevue', 'city_Bothell',
       'city_Burien', 'city_Clyde Hill', 'city_Covington', 'city_Des Moines',
       'city_Duvall', 'city_Enumclaw', 'city_Federal Way', 'city_Issaquah',
       'city_Kenmore', 'city_Kent', 'city_Kirkland', 'city_Maple Valley',
       'city_Mercer Island', 'city_Newcastle', 'city_Redmond', 'city_Renton',
       'city_Sammamish', 'city_SeaTac', 'city_Seattle', 'city_Shoreline',
       'city_Snoqualmie', 'city_Tukwila', 'city_Woodinville'],
      dtype='object')


In [288]:
# Restrict the frame to the features whose Lasso coefficient is non-zero
nonzero_mask = coef != 0
newDf = df[coef[nonzero_mask].index]

In [289]:
# Split again, this time using only the Lasso-selected features in newDf;
# test_size and random_state are the same as in the first split
x_train, x_test, y_train, y_test = train_test_split(newDf, y, test_size=0.2, random_state=42)

---

### models 

We selected the following models:
* gradient boosting regressor
* Random Forest Regressor
* Linear Regression
* Decision Tree Regressor
---

Linear Regression model

In [291]:
# Build the linear regression and fit it on the training split
# (fit returns the estimator itself, so LG is the trained model)
LG = LinearRegression().fit(x_train, y_train)

print("Model trained successfully :) ")

Model trained successfully :) 


In [292]:
# Predict prices for the held-out test features
y_pred = LG.predict(x_test)

# Heading and predictions on separate lines
print("Predicted values:", y_pred, sep="\n")

Predicted values:
[ 617236.2353002   416531.94806458  494406.6130823   694069.56099316
  453434.14486832  446985.62088124  353082.77262339  314358.74887997
  481353.75516211  496121.2106152   748434.6199629   345456.30273042
  455580.47268066  483029.65476131  439081.46246845  218919.5238046
  521780.46344112  519449.42450242  516391.23780411  213523.33261034
  691404.76466782  355923.2088915   295045.55295961  148553.01402313
  811950.83544669  697155.12086896  468288.18297892  810603.72479941
  335058.85625306  717558.08403083  236461.92469804  395282.34544232
  414204.24488349  249265.13898223  758843.64340473  730209.35675156
  390893.07269278  647699.15372154  489950.66976076  752493.31750147
  485772.94878021  414215.16842524  548629.28533885  517025.35111975
  162040.03536562  264276.52474504  301213.06584706  392672.46555805
  871421.30027111  741758.07425646  345309.27375425  465291.34497152
  373215.68209484  227245.03262826  699165.6698329   138260.07479689
  424926.73781853

In [293]:
print("<< Linear Regression model >>")

# Score the predictions against the held-out targets
mae = mean_absolute_error(y_test, y_pred)   # mean absolute error
mse = mean_squared_error(y_test, y_pred)    # mean squared error
r2 = r2_score(y_test, y_pred)               # coefficient of determination

# Report the three metrics, separated by a ruler line
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print("\n================================================\n")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print("\n================================================\n")
print(f"R² Score: {r2:.2f}")

<< Linear Regression model >>
Mean Absolute Error (MAE): 86798.94


Mean Squared Error (MSE): 16435374512.86


R² Score: 0.63


Decision Tree Regressor

In [294]:
# Build the decision tree (seeded for reproducibility) and fit it on the
# training split; fit returns the estimator itself
DTR = DecisionTreeRegressor(random_state=42).fit(x_train, y_train)

print("Model trained successfully :) ")

Model trained successfully :) 


In [295]:
# Predict prices for the held-out test features
y_pred = DTR.predict(x_test)

# Heading and predictions on separate lines
print("Predicted values:", y_pred, sep="\n")

Predicted values:
[ 941000.        412000.        317000.        805000.
  442500.        415000.        289950.        270000.
  565000.        488000.        819995.        445000.
  433111.111111  442500.        299900.        174500.
  550000.        785200.        578000.        194000.
  610000.        510000.        225000.        172500.
  899950.        585000.        710000.        549900.
  160000.        723243.75      370000.        525000.
  374000.        250500.        930000.        805000.
  600000.        640000.        575000.        720000.
  612500.        374000.        471001.        430000.
  270000.        300000.        215000.        225000.
 1030000.        571000.        300000.        360000.
  490000.        280000.        760000.        149500.
  594000.        337000.        550000.        175000.
  312000.        349000.        255000.        395000.
  330000.       1035000.        555000.        605000.
  540833.333333  470000.        661500.        

In [296]:
print("<< Decision Tree Regressor model >>")

# Score the predictions against the held-out targets
mae = mean_absolute_error(y_test, y_pred)   # mean absolute error
mse = mean_squared_error(y_test, y_pred)    # mean squared error
r2 = r2_score(y_test, y_pred)               # coefficient of determination

# Report the three metrics, separated by a ruler line
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print("\n================================================\n")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print("\n================================================\n")
print(f"R² Score: {r2:.2f}")

<< Decision Tree Regressor model >>
Mean Absolute Error (MAE): 117788.56


Mean Squared Error (MSE): 29973731902.11


R² Score: 0.33


Random Forest Regressor

In [297]:
# Build the random forest (seeded for reproducibility) and fit it on the
# training split; fit returns the estimator itself
RFR = RandomForestRegressor(random_state=42).fit(x_train, y_train)

print("Model trained successfully :) ")

Model trained successfully :) 


In [298]:
# Predict prices for the held-out test features
y_pred = RFR.predict(x_test)

# Heading and predictions on separate lines
print("Predicted values:", y_pred, sep="\n")

Predicted values:
[761888.50000002 466334.91640212 398530.75       808476.54
 434303.27       406167.81       321794.6        309790.625
 500749.10000001 459775.16666668 738948.59       361804.48214286
 456510.49999999 457652.215      362157.47780219 214932.5
 563821.73333333 537298.88571428 551526.49       213884.29666663
 665438.7        438979.3        294668.56       198486.3
 940251.92       659498.41       497592.1040171  766079.82
 209898.748      722656.1725     316202.26333333 485011.08547009
 364495.255      268693.72333333 837602.8        837207.
 403685.06       707636.91       499372.69735043 794209.88
 503940.95333334 353511.86333333 554891.68846154 567240.42
 240310.9230769  286407.325      242594.66666667 360172.62888889
 963701.38       716718.         324950.         374760.86333333
 266401.70000003 244818.595      715681.58       171452.
 582511.77000001 340424.56666667 519531.09333334 283933.365
 442207.67       459765.49444444 296046.22857143 537200.
 326564.857142

In [299]:
print("<< Random Forest Regressor model >>")

# Score the predictions against the held-out targets
mae = mean_absolute_error(y_test, y_pred)   # mean absolute error
mse = mean_squared_error(y_test, y_pred)    # mean squared error
r2 = r2_score(y_test, y_pred)               # coefficient of determination

# Report the three metrics, separated by a ruler line
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print("\n================================================\n")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print("\n================================================\n")
print(f"R² Score: {r2:.2f}")

<< Random Forest Regressor model >>
Mean Absolute Error (MAE): 88470.35


Mean Squared Error (MSE): 16894782652.29


R² Score: 0.62


Gradient Boosting Regressor

In [300]:
# Build the gradient boosting regressor (seeded for reproducibility) and fit
# it on the training split; fit returns the estimator itself
GBR = GradientBoostingRegressor(random_state=42).fit(x_train, y_train)

print("Model trained successfully :) ")

Model trained successfully :) 


In [301]:
# Predict prices for the held-out test features
y_pred = GBR.predict(x_test)

# Heading and predictions on separate lines
print("Predicted values:", y_pred, sep="\n")

Predicted values:
[665490.19986608 411781.45596921 453092.03981786 720020.43783864
 412700.78357655 389977.05834171 358215.99685108 342920.60393902
 453454.63313012 480014.5481387  717158.60672446 350371.98005977
 438314.10964703 479764.13180289 440470.97017481 194142.64367046
 577588.56202027 569733.93696942 519834.0695886  250999.85002366
 703486.64684062 301509.22395902 279939.67748632 198183.38496667
 847552.56154065 688007.25789627 449175.941874   784890.5455373
 385237.84433003 753395.15461243 250999.85002366 376954.06308791
 412797.32855325 288734.00544518 719564.34639387 766865.89705805
 383309.15156282 666483.47540593 468413.41352946 718652.2636479
 515376.23836895 412797.32855325 498327.38810426 530668.52131729
 224508.14149967 247455.51523367 314401.44807865 362376.44475299
 904820.37923237 777012.42321237 337514.53281896 388832.6083487
 361852.8714103  288257.5853575  659939.44919258 209999.03484503
 426995.77759436 393334.82843296 503718.9823147  307704.77096088
 439762.51

In [302]:
print("<< Gradient Boosting Regressor model >>")

# Score the predictions against the held-out targets
mae = mean_absolute_error(y_test, y_pred)   # mean absolute error
mse = mean_squared_error(y_test, y_pred)    # mean squared error
r2 = r2_score(y_test, y_pred)               # coefficient of determination

# Report the three metrics, separated by a ruler line
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print("\n================================================\n")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print("\n================================================\n")
print(f"R² Score: {r2:.2f}")

<< Gradient Boosting Regressor model >>
Mean Absolute Error (MAE): 85528.75


Mean Squared Error (MSE): 15292367665.87


R² Score: 0.66
