# Standard imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib import colors
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

# California Dataset

## Exploratory Data Analysis

### Storing and reading the data

In [None]:
# Load the housing table from the CSV in the working directory and preview
# its first five rows.
california_data = pd.read_csv(filepath_or_buffer='California_Houses.csv')
california_data.head(n=5)

In [None]:
# Remove the raw coordinate columns in place, then summarise the remaining
# columns (dtypes and non-null counts).
california_data.drop(columns=['Latitude', 'Longitude'], inplace=True)
california_data.info()

In [None]:
# Summary statistics, transposed so that each row describes one column.
california_data.describe().T

In [None]:
# Pairwise Pearson correlation between all columns.
california_data.corr(method='pearson')

In [None]:
# Number of missing values in each column.
california_data.isna().sum()

### Representing initial data

In [None]:
# Scatter every feature against the target, one figure per feature.
for feature in california_data:
    if feature == 'Median_House_Value':
        # The target is the y-axis of every plot, so it gets no plot of its
        # own (and no plt.show() call for a figure that was never created).
        continue
    plt.figure(figsize=(5, 4))
    plt.scatter(california_data[feature], california_data['Median_House_Value'])
    plt.ylabel('Median_House_Value')
    plt.xlabel(feature)
    plt.show()

### Separating the features from the target column

In [None]:
# Target vector first, then the feature table (every column but the target).
y_california = california_data['Median_House_Value']
X_california = california_data.drop(columns='Median_House_Value')

## Feature Scaling and PCA

In [None]:
# Standardise every feature before PCA.
# NOTE(review): the scaler and the PCA are fitted on the full table before the
# train/test split, so test rows influence the transform -- confirm this is
# acceptable for the comparison being made.
X_california = StandardScaler().fit(X_california).transform(X_california)

# Pass 1: keep every component, only to inspect the per-component variance ratios.
pca = PCA().fit(X_california)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

# Pass 2: project onto the first five components and report the variance they retain.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_california)
explained_variance = pca.explained_variance_ratio_
print(sum(explained_variance))

### Splitting the data into training and testing sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X_pca, y_california, test_size=0.2, random_state=20
)
print('Training Shape: ', X_train.shape)
print('Testing  Shape: ', X_test.shape)

## Evaluation Score Function

In [None]:
def eval(y_test, predictions, results=None):
    """Print and return the MAE, RMSE and R^2 of a set of predictions.

    Args:
        y_test: True target values.
        predictions: Predicted values, aligned with ``y_test``.
        results: Accepted only for backward compatibility with existing
            callers. It is never read or modified; a fresh list is built and
            returned on every call.

    Returns:
        list: ``[mae, rmse, r2]``.
    """
    # NOTE(review): this name shadows the builtin ``eval``; it is kept because
    # every caller in the notebook uses it.
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    print("Mean Absolute Error: ", round(mae, 5))
    print("Root Mean Squared Error: ", round(rmse, 5))
    print("R-squared (R2) Score:", round(r2, 5))
    return [mae, rmse, r2]

## Performing Linear Regression

### Model fitting and predictions

In [None]:
# Fit ordinary least squares on the training split (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows, then report R^2 on both splits.
predictions = model.predict(X_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Training R^2 Score:", train_score)
print("Testing R^2 Score:", test_score)

### Data representation

In [None]:
# Least-squares line through the (actual, predicted) points.
m, b = np.polyfit(y_test, predictions, deg=1)

# NOTE(review): the title says "$1000's"; confirm the unit of Median_House_Value.
plt.figure(figsize=(8, 5))
plt.title("Actual Prices vs Predicted Prices ($1000's)")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.scatter(y_test, predictions)
plt.plot(y_test, m * y_test + b, color='red')
plt.show()

In [None]:
# Side-by-side bars of actual vs predicted values for the first 15 test rows.
result = pd.DataFrame({'Actual': y_test, 'Predicted': predictions}).head(15)
result.plot.bar()
plt.xlabel("Data Index")
plt.ylabel("Median_House_Value")

### Model Evaluation

In [None]:
# `results` is handed to eval() by every evaluation cell below.
results = list()
# Metrics of the baseline linear model: [MAE, RMSE, R2].
lm = eval(y_test, predictions, results=results)

## Performing Random Forest

### Model fitting and predictions

In [None]:
# Fit a random forest on the training split. The seed is pinned (to the same
# value used for train_test_split) so that the scores below are reproducible
# from run to run.
model = RandomForestRegressor(random_state=20)
model.fit(X_train, y_train)

# Predict the held-out rows, then report R^2 on both splits.
predictions = model.predict(X_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Training R^2 Score:", train_score)
print("Testing R^2 Score:", test_score)

### Data representation

In [None]:
# Actual vs predicted values for the random forest, with a least-squares
# trend line.
plt.figure(figsize=(8, 5))
plt.scatter(y_test, predictions)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual Prices vs Predicted Prices ($1000's)")
m, b = np.polyfit(y_test, predictions, 1)
# Draw the trend line in red, as in the linear-regression plot, so it stands
# out from the scatter points instead of sharing their default colour.
plt.plot(y_test, m * y_test + b, color='red')
plt.show()

In [None]:
# Side-by-side bars of actual vs predicted values for the first 15 test rows.
result = pd.DataFrame({'Actual': y_test, 'Predicted': predictions}).head(15)
result.plot.bar()
plt.xlabel("Data Index")
plt.ylabel("Median_House_Value")

### Model Evaluation

In [None]:
# Metrics of the random forest: [MAE, RMSE, R2].
rfr = eval(y_test, predictions, results=results)

We observed a lower RMSE with the Random Forest Regressor than with Linear Regression.

## Performing Gradient Boost

### Model fitting and predictions

In [None]:
# Fit gradient boosting on the training split. The seed is pinned (to the same
# value used for train_test_split) so that the scores below are reproducible
# from run to run.
model = GradientBoostingRegressor(random_state=20)
model.fit(X_train, y_train)

# Predict the held-out rows, then report R^2 on both splits.
predictions = model.predict(X_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Training R^2 Score:", train_score)
print("Testing R^2 Score:", test_score)

### Data representation

In [None]:
# Actual vs predicted values for gradient boosting, with a least-squares
# trend line.
plt.figure(figsize=(8, 5))
plt.scatter(y_test, predictions)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual Prices vs Predicted Prices ($1000's)")
m, b = np.polyfit(y_test, predictions, 1)
# Draw the trend line in red, as in the linear-regression plot, so it stands
# out from the scatter points instead of sharing their default colour.
plt.plot(y_test, m * y_test + b, color='red')
plt.show()

In [None]:
# Side-by-side bars of actual vs predicted values for the first 15 test rows.
result = pd.DataFrame({'Actual': y_test, 'Predicted': predictions}).head(15)
result.plot.bar()
plt.xlabel("Data Index")
plt.ylabel("Median_House_Value")

### Model Evaluation

In [None]:
# Metrics of gradient boosting: [MAE, RMSE, R2].
gbr = eval(y_test, predictions, results=results)

We observed an even lower RMSE with Gradient Boosting.

## Dealing with Outliers

### IQR Method

In [None]:
# Columns screened for outliers by the methods below (the target is included).
column_names = [
    "Median_House_Value",
    "Median_Income",
    "Median_Age",
    "Tot_Rooms",
    "Tot_Bedrooms",
    "Population",
    "Households",
    "Distance_to_coast",
    "Distance_to_LA",
    "Distance_to_SanDiego",
    "Distance_to_SanJose",
    "Distance_to_SanFrancisco",
]

In [None]:
# Per-column first and third quartiles of the screened columns.
Q1, Q3 = (california_data[column_names].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles (reused by the capping method below).
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Flag each cell that lies outside its column's fences, then drop every row
# that has at least one flagged cell.
outliers = (
    california_data[column_names].lt(lower_bound)
    | california_data[column_names].gt(upper_bound)
)
california_data_no_outliers = california_data.loc[~outliers.any(axis=1)]
print(california_data_no_outliers.shape)

In [None]:
# Target vector first, then the feature table, both from the IQR-filtered rows.
y_california = california_data_no_outliers['Median_House_Value']
X_california = california_data_no_outliers.drop(columns='Median_House_Value')

In [None]:
# Standardise every feature before PCA.
# NOTE(review): the scaler and the PCA are fitted before the train/test split,
# so test rows influence the transform -- confirm this is acceptable.
X_california = StandardScaler().fit(X_california).transform(X_california)

# Pass 1: keep every component, only to inspect the per-component variance ratios.
pca = PCA().fit(X_california)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

# Pass 2: project onto the first five components and report the variance they retain.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_california)
explained_variance = pca.explained_variance_ratio_
print(sum(explained_variance))

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X_pca, y_california, test_size=0.2, random_state=20
)
print('Training Shape: ', X_train.shape)
print('Testing  Shape: ', X_test.shape)

In [None]:
# Linear regression on the IQR-filtered data (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# R^2 on both splits; stored but not printed in this cell.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

lm_iqr = eval(y_test, predictions, results=results)

In [None]:
# Random forest on the IQR-filtered data. The seed is pinned (to the same
# value used for train_test_split) so the reported metrics are reproducible.
model = RandomForestRegressor(random_state=20)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
# R^2 on both splits; stored but not printed in this cell.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

rfr_iqr = eval(y_test, predictions, results)

In [None]:
# Gradient boosting on the IQR-filtered data. The seed is pinned (to the same
# value used for train_test_split) so the reported metrics are reproducible.
model = GradientBoostingRegressor(random_state=20)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
# R^2 on both splits; stored but not printed in this cell.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

gbr_iqr = eval(y_test, predictions, results)

### Outlier Capping Method

In [None]:
# Work on a copy so the original table stays untouched, then clamp each
# screened column to the IQR fences computed in the IQR cell. No rows are
# dropped; only the values are clamped.
california_data_no_outliers = california_data.copy()
for col in column_names:
    california_data_no_outliers[col] = california_data[col].clip(
        lower=lower_bound[col],
        upper=upper_bound[col],
    )
print(california_data_no_outliers.shape)

In [None]:
# Target vector first, then the feature table, both from the capped data.
y_california = california_data_no_outliers['Median_House_Value']
X_california = california_data_no_outliers.drop(columns='Median_House_Value')

In [None]:
# Standardise every feature before PCA.
# NOTE(review): the scaler and the PCA are fitted before the train/test split,
# so test rows influence the transform -- confirm this is acceptable.
X_california = StandardScaler().fit(X_california).transform(X_california)

# Pass 1: keep every component, only to inspect the per-component variance ratios.
pca = PCA().fit(X_california)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

# Pass 2: project onto the first five components and report the variance they retain.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_california)
explained_variance = pca.explained_variance_ratio_
print(sum(explained_variance))

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X_pca, y_california, test_size=0.2, random_state=20
)
print('Training Shape: ', X_train.shape)
print('Testing  Shape: ', X_test.shape)

In [None]:
# Linear regression on the capped data (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# R^2 on both splits; stored but not printed in this cell.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

lm_cap = eval(y_test, predictions, results=results)

In [None]:
# Random forest on the capped data. The seed is pinned (to the same value
# used for train_test_split) so the reported metrics are reproducible.
model = RandomForestRegressor(random_state=20)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
# R^2 on both splits; stored but not printed in this cell.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

rfr_cap = eval(y_test, predictions, results)

In [None]:
# Gradient boosting on the capped data. The seed is pinned (to the same value
# used for train_test_split) so the reported metrics are reproducible.
model = GradientBoostingRegressor(random_state=20)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
# R^2 on both splits; stored but not printed in this cell.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

gbr_cap = eval(y_test, predictions, results)

### Z Score Method

In [None]:
# Standardise each screened column: (value - column mean) / column std.
z_scores = (
    california_data[column_names] - california_data[column_names].mean()
) / california_data[column_names].std()

# A cell is an outlier when it lies more than `threshold` standard deviations
# from its column mean, in either direction.
threshold = 3
outliers = z_scores.abs() > threshold

# Keep only the rows without any outlying cell. Boolean indexing already
# returns a new frame, so no prior copy of the table is needed.
california_data_no_outliers = california_data[~outliers.any(axis=1)]
print(california_data_no_outliers.shape)

In [None]:
# Target vector first, then the feature table, both from the z-score-filtered rows.
y_california = california_data_no_outliers['Median_House_Value']
X_california = california_data_no_outliers.drop(columns='Median_House_Value')

In [None]:
# Standardise every feature before PCA.
# NOTE(review): the scaler and the PCA are fitted before the train/test split,
# so test rows influence the transform -- confirm this is acceptable.
X_california = StandardScaler().fit(X_california).transform(X_california)

# Pass 1: keep every component, only to inspect the per-component variance ratios.
pca = PCA().fit(X_california)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

# Pass 2: project onto the first five components and report the variance they retain.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_california)
explained_variance = pca.explained_variance_ratio_
print(sum(explained_variance))

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X_pca, y_california, test_size=0.2, random_state=20
)
print('Training Shape: ', X_train.shape)
print('Testing  Shape: ', X_test.shape)

In [None]:
# Linear regression on the z-score-filtered data (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# R^2 on both splits; stored but not printed in this cell.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

lm_z = eval(y_test, predictions, results=results)

In [None]:
# Random forest on the z-score-filtered data. The seed is pinned (to the same
# value used for train_test_split) so the reported metrics are reproducible.
model = RandomForestRegressor(random_state=20)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
# R^2 on both splits; stored but not printed in this cell.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

rfr_z = eval(y_test, predictions, results)

In [None]:
# Gradient boosting on the z-score-filtered data. The seed is pinned (to the
# same value used for train_test_split) so the reported metrics are reproducible.
model = GradientBoostingRegressor(random_state=20)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
# R^2 on both splits; stored but not printed in this cell.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

gbr_z = eval(y_test, predictions, results)

## Hyperparameter Tuning

## Summary

In [None]:
# The [MAE, RMSE, R2] triple of every experiment, keyed by its display name.
model_metrics_dict = {
    'Linear': lm,
    'Random Forest': rfr,
    'Gradient Boost': gbr,
    'Linear(IQR)': lm_iqr,
    'Random Forest(IQR)': rfr_iqr,
    'Gradient Boost(IQR)': gbr_iqr,
    'Linear(Cap)': lm_cap,
    'Random Forest(Cap)': rfr_cap,
    'Gradient Boost(Cap)': gbr_cap,
    'Linear(Z-Score)': lm_z,
    'Random Forest(Z-Score)': rfr_z,
    'Gradient Boost(Z-Score)': gbr_z,
}

# Split the triples into one list per metric; dict insertion order keeps the
# four lists aligned.
model_names = list(model_metrics_dict)
mae_values = [scores[0] for scores in model_metrics_dict.values()]
rmse_values = [scores[1] for scores in model_metrics_dict.values()]
r2_values = [scores[2] for scores in model_metrics_dict.values()]

# One row per experiment, one column per metric.
model_metrics = pd.DataFrame(
    {
        'Model': model_names,
        'MAE': mae_values,
        'RMSE': rmse_values,
        'R2': r2_values,
    }
)

print(model_metrics)