In [42]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR


In [43]:


data = pd.read_csv("housing.csv")
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [44]:
label_encoder = LabelEncoder()

data['ocean_proximity_encoded'] = label_encoder.fit_transform(data['ocean_proximity'])
data.drop('ocean_proximity',axis=1,inplace=True)
data['total_bedrooms'] = data['total_bedrooms'].ffill()
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_encoded
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,3
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,3
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,3
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,3


In [45]:
X = data.drop("median_house_value", axis=1)
y = data["median_house_value"]

# For LinearRegression()

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [47]:
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [48]:
linreg_score = cross_val_score(LinearRegression(), X, y, cv=kf)
linreg_score


for i, result in enumerate(linreg_score, 1):
    print(f"Fold {i}: {result * 100:.2f}%")
    
print(f'Mean Accuracy: {linreg_score.mean()* 100:.2f}%')

Fold 1: 61.15%
Fold 2: 64.26%
Fold 3: 63.83%
Fold 4: 66.55%
Fold 5: 60.57%
Mean Accuracy: 63.27%


# For Decision Tree

In [49]:
decreg_score = cross_val_score(DecisionTreeRegressor(), X, y, cv=kf)

for i, result in enumerate(decreg_score, 1):
    print(f"Fold {i}: {result * 100:.2f}%")
    
print(f'Mean Accuracy: {decreg_score.mean()* 100:.2f}%')


Fold 1: 65.29%
Fold 2: 65.36%
Fold 3: 64.18%
Fold 4: 68.20%
Fold 5: 62.35%
Mean Accuracy: 65.08%


In [50]:
decreg_score = cross_val_score(DecisionTreeRegressor(max_depth=5), X, y, cv=kf)

for i, result in enumerate(decreg_score, 1):
    print(f"Fold {i}: {result * 100:.2f}%")
    
print(f'Mean Accuracy: {decreg_score.mean()* 100:.2f}%')

Fold 1: 61.66%
Fold 2: 63.79%
Fold 3: 61.87%
Fold 4: 64.99%
Fold 5: 62.07%
Mean Accuracy: 62.88%


In [51]:
decreg_score = cross_val_score(DecisionTreeRegressor(max_depth=10), X, y, cv=kf)

for i, result in enumerate(decreg_score, 1):
    print(f"Fold {i}: {result * 100:.2f}%")
    
print(f'Mean Accuracy: {decreg_score.mean()* 100:.2f}%')

Fold 1: 70.42%
Fold 2: 70.67%
Fold 3: 70.87%
Fold 4: 75.34%
Fold 5: 70.34%
Mean Accuracy: 71.53%


# For Random Forest Regressor

In [52]:
rfreg_score = cross_val_score(DecisionTreeRegressor(max_depth=10), X, y, cv=kf)

for i, result in enumerate(rfreg_score, 1):
    print(f"Fold {i}: {result * 100:.2f}%")
    
print(f'Mean Accuracy: {rfreg_score.mean()* 100:.2f}%')

Fold 1: 70.93%
Fold 2: 71.32%
Fold 3: 70.95%
Fold 4: 75.67%
Fold 5: 69.94%
Mean Accuracy: 71.76%


In [53]:
rfreg_score = cross_val_score(DecisionTreeRegressor(max_depth=5), X, y, cv=kf)

for i, result in enumerate(rfreg_score, 1):
    print(f"Fold {i}: {result * 100:.2f}%")
    
print(f'Mean Accuracy: {rfreg_score.mean()* 100:.2f}%')

Fold 1: 61.66%
Fold 2: 63.79%
Fold 3: 61.87%
Fold 4: 64.99%
Fold 5: 62.06%
Mean Accuracy: 62.87%


In [54]:
rfreg_score = cross_val_score(DecisionTreeRegressor(), X, y, cv=kf)

for i, result in enumerate(rfreg_score, 1):
    print(f"Fold {i}: {result * 100:.2f}%")
    
print(f'Mean Accuracy: {rfreg_score.mean()* 100:.2f}%')

Fold 1: 64.07%
Fold 2: 65.41%
Fold 3: 66.25%
Fold 4: 68.63%
Fold 5: 62.54%
Mean Accuracy: 65.38%


# For XGBoost

In [55]:
xgref = cross_val_score(XGBRegressor(max_depth=10, n_estimators=100), X, y, cv=kf)

for i, result in enumerate(xgref, 1):
    print(f"Fold {i}: {result * 100:.2f}%")
    
print(f'Mean Accuracy: {xgref.mean()* 100:.2f}%')

Fold 1: 81.78%
Fold 2: 82.57%
Fold 3: 81.34%
Fold 4: 84.17%
Fold 5: 81.11%
Mean Accuracy: 82.19%


In [56]:
xgref = cross_val_score(XGBRegressor(max_depth=5), X, y, cv=kf)

for i, result in enumerate(xgref, 1):
    print(f"Fold {i}: {result * 100:.2f}%")
    
print(f'Mean Accuracy: {xgref.mean()* 100:.2f}%')

Fold 1: 82.31%
Fold 2: 83.20%
Fold 3: 82.11%
Fold 4: 84.49%
Fold 5: 82.15%
Mean Accuracy: 82.85%


In [57]:
xgref = cross_val_score(XGBRegressor(), X, y, cv=kf)

for i, result in enumerate(xgref, 1):
    print(f"Fold {i}: {result * 100:.2f}%")
    
print(f'Mean Accuracy: {xgref.mean()* 100:.2f}%')

Fold 1: 82.01%
Fold 2: 83.35%
Fold 3: 81.80%
Fold 4: 84.04%
Fold 5: 82.24%
Mean Accuracy: 82.69%
