In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


In [2]:
df = pd.read_csv("kc_house_data.csv")
df.head()


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [4]:
target = "price"


In [5]:
numerical_features = df.select_dtypes(include=["int64", "float64"]).columns
categorical_features = ["waterfront", "view", "condition", "zipcode"]

numerical_features, categorical_features


(Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
        'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
        'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
        'sqft_living15', 'sqft_lot15'],
       dtype='object'),
 ['waterfront', 'view', 'condition', 'zipcode'])

In [6]:
imputer = SimpleImputer(strategy="median")
df[numerical_features] = imputer.fit_transform(df[numerical_features])


In [7]:
df_encoded = pd.get_dummies(
    df, columns=categorical_features, drop_first=True
)
df_encoded.head()


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,grade,sqft_above,...,zipcode_98146.0,zipcode_98148.0,zipcode_98155.0,zipcode_98166.0,zipcode_98168.0,zipcode_98177.0,zipcode_98178.0,zipcode_98188.0,zipcode_98198.0,zipcode_98199.0
0,7129301000.0,20141013T000000,221900.0,3.0,1.0,1180.0,5650.0,1.0,7.0,1180.0,...,False,False,False,False,False,False,True,False,False,False
1,6414100000.0,20141209T000000,538000.0,3.0,2.25,2570.0,7242.0,2.0,7.0,2170.0,...,False,False,False,False,False,False,False,False,False,False
2,5631500000.0,20150225T000000,180000.0,2.0,1.0,770.0,10000.0,1.0,6.0,770.0,...,False,False,False,False,False,False,False,False,False,False
3,2487201000.0,20141209T000000,604000.0,4.0,3.0,1960.0,5000.0,1.0,7.0,1050.0,...,False,False,False,False,False,False,False,False,False,False
4,1954401000.0,20150218T000000,510000.0,3.0,2.0,1680.0,8080.0,1.0,8.0,1680.0,...,False,False,False,False,False,False,False,False,False,False


In [12]:
df_encoded = df_encoded.drop("date", axis=1)


In [13]:
X = df_encoded.drop("price", axis=1)
y = df_encoded["price"]


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [15]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
lr_r2 = r2_score(y_test, y_pred_lr)

lr_r2


0.8088453985177121

In [16]:
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
dt_r2 = r2_score(y_test, y_pred_dt)

dt_r2


0.7185676679908

In [17]:
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)
knn_r2 = r2_score(y_test, y_pred_knn)

knn_r2


0.33098281638388627

In [18]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
rf_r2 = r2_score(y_test, y_pred_rf)

rf_r2


0.8514268235511361

In [19]:
svr = SVR()
svr.fit(X_train, y_train)

y_pred_svr = svr.predict(X_test)
svr_r2 = r2_score(y_test, y_pred_svr)

svr_r2


-0.06530729941344515

In [20]:
results = pd.DataFrame({
    "Model": [
        "Linear Regression",
        "Decision Tree",
        "KNN",
        "Random Forest",
        "SVR"
    ],
    "R2 Score": [
        lr_r2,
        dt_r2,
        knn_r2,
        rf_r2,
        svr_r2
    ]
})

results.sort_values(by="R2 Score", ascending=False)


Unnamed: 0,Model,R2 Score
3,Random Forest,0.851427
0,Linear Regression,0.808845
1,Decision Tree,0.718568
2,KNN,0.330983
4,SVR,-0.065307


In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


In [22]:
base_models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}


In [23]:
train_predictions = pd.DataFrame()
test_predictions = pd.DataFrame()

for name, model in base_models.items():
    model.fit(X_train, y_train)
    
    train_predictions[name] = model.predict(X_train)
    test_predictions[name] = model.predict(X_test)


In [24]:
train_predictions.head()


Unnamed: 0,Linear Regression,Decision Tree,KNN,Random Forest
0,404464.803996,325000.0,363370.0,348904.5
1,237999.907182,257000.0,450280.0,275942.24
2,123826.479464,228500.0,426300.0,217083.5
3,302232.01903,288000.0,252500.0,288355.9
4,456622.096839,479000.0,553568.4,496111.7


In [25]:
meta_model = LinearRegression()
meta_model.fit(train_predictions, y_train)


0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [26]:
stacked_pred = meta_model.predict(test_predictions)
stacked_r2 = r2_score(y_test, stacked_pred)

stacked_r2


0.7185676679907993

In [27]:
comparison = pd.DataFrame({
    "Model": ["Random Forest", "Stacked Model"],
    "R2 Score": [rf_r2, stacked_r2]
})

comparison


Unnamed: 0,Model,R2 Score
0,Random Forest,0.851427
1,Stacked Model,0.718568


In [28]:
comparison = pd.DataFrame({
    "Model": ["Best Base Model (Random Forest)", "Stacking Ensemble"],
    "R2 Score": [rf_r2, stacked_r2]
})

comparison


Unnamed: 0,Model,R2 Score
0,Best Base Model (Random Forest),0.851427
1,Stacking Ensemble,0.718568


In [29]:
if stacked_r2 > rf_r2:
    print("Stacking model performs better than the best individual model.")
else:
    print("Best individual model performs better than stacking.")


Best individual model performs better than stacking.
