
Optimization methods - find the parameters for the prediction function:
    OLS:
    https://towardsdatascience.com/understanding-the-ols-method-for-simple-linear-regression-e0a4e8f692cc

    Gradient Descent:
    https://towardsdatascience.com/linear-regression-using-gradient-descent-97a6c8700931
    https://medium.com/@shuklapratik22/linear-regression-with-gradient-descent-from-scratch-d03dfa90d04c

With regularization:
https://medium.com/@vigneshmadanan/linear-regression-basics-and-regularization-methods-b40359b0aea5

A complete guide:
https://www.analyticsvidhya.com/blog/2017/06/a-comprehensive-guide-for-linear-ridge-and-lasso-regression/
https://www.keboola.com/blog/linear-regression-machine-learning

Linear regression examples:
https://medium.com/datadriveninvestor/simple-linear-regression-with-python-1b028386e5cd
https://jakevdp.github.io/PythonDataScienceHandbook/05.06-linear-regression.html
https://stackabuse.com/linear-regression-in-python-with-scikit-learn/

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input
# directory, in the order os.walk visits them.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

In [None]:
df = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')

In [None]:
df.info() #target is median_house_value

In [None]:
df['ocean_proximity'].value_counts()

In [None]:
df['total_bedrooms'].isna().sum()

In [None]:
df[df['total_bedrooms'].isna()]

In [None]:
df[df['total_bedrooms'].notna()]

In [None]:
# Overlay the raw total_bedrooms distribution (gold) with the distribution
# obtained after filling the missing values with the column mean (green).
bedrooms_raw = df['total_bedrooms']
# Series.min()/max() skip NaN. The builtin min()/max() compare element by
# element, so with NaN present the result depends on row order and becomes
# NaN whenever the first value is missing.
bins = np.linspace(bedrooms_raw.min(), bedrooms_raw.max(), num=20)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# confirm the installed version before upgrading the environment.
ax1 = sns.distplot(bedrooms_raw, bins=bins, color='gold', kde=True, hist_kws=dict(edgecolor='k', lw=1))
ax2 = sns.distplot(bedrooms_raw.fillna(value=bedrooms_raw.mean()), bins=bins, color='green', kde=True, hist_kws=dict(edgecolor='k', lw=1))

In [None]:
# Overlay the raw total_bedrooms distribution (gold) with the distribution
# obtained after filling the missing values with the column median (red).
bedrooms_raw = df['total_bedrooms']
# Series.min()/max() skip NaN. The builtin min()/max() compare element by
# element, so with NaN present the result depends on row order and becomes
# NaN whenever the first value is missing.
bins = np.linspace(bedrooms_raw.min(), bedrooms_raw.max(), num=20)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# confirm the installed version before upgrading the environment.
ax1 = sns.distplot(bedrooms_raw, bins=bins, color='gold', kde=True, hist_kws=dict(edgecolor='k', lw=1))
ax2 = sns.distplot(bedrooms_raw.fillna(value=bedrooms_raw.median()), bins=bins, color='red', kde=True, hist_kws=dict(edgecolor='k', lw=1))

In [None]:
# Separate the predictors from the regression target (median_house_value).
x = df.drop(columns='median_house_value')  # every column except the target
y = df['median_house_value']  # the target column on its own

In [None]:
# Hold out 30% of the rows for testing. The fixed random_state makes the split
# reproducible, so the metrics and the hyper-parameters hand-copied into later
# cells stay valid when the notebook is re-run.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Re-attach the target column to each split so that the exploration and
# feature-engineering steps below can work on one frame per split.
trainset = xtrain.join(ytrain)
testset = xtest.join(ytest)

Target variable

In [None]:
trainset['median_house_value'].describe()

In [None]:
trainset['median_house_value'].hist(bins=20)

In [None]:
trainset[trainset['median_house_value']>=475000]

In [None]:
sns.distplot(trainset['median_house_value'])

In [None]:
# Shape statistics of the target in the training set.
print("Skewness: %f" % trainset['median_house_value'].skew()) # normal distribution: 0
print("Kurtosis: %f" % trainset['median_house_value'].kurt()) # pandas reports Fisher (excess) kurtosis, so normal distribution: 0, not 3

Exploratory data analysis (EDA)

In [None]:
attributes = ["median_house_value", "median_income", "total_rooms", "total_bedrooms",
              "housing_median_age", "population"]
scatter_matrix(trainset[attributes], figsize=(20, 15))

In [None]:
plt.figure(figsize=(12, 10))
sns.scatterplot(data=trainset, x="longitude", y="latitude", hue="median_house_value", alpha=0.1, s=80)

![california_map](https://i.pinimg.com/originals/35/63/9a/35639a247a078b89f72d848c305f7efe.jpg)

In [None]:
plt.figure(figsize=(12, 10))
sns.scatterplot(data=trainset, x="longitude", y="latitude", hue="median_income", alpha=0.1, s=80)

In [None]:
trainset['housing_median_age'].describe()

In [None]:
sns.scatterplot(data=trainset, x="housing_median_age", y="median_income", alpha=0.5, s=80)

In [None]:
sns.scatterplot(data=trainset, x="housing_median_age", y="median_house_value", alpha=0.5, s=80)

In [None]:
trainset['housing_median_age'].hist(bins=20)

In [None]:
plt.figure(figsize=(12, 10))
sns.scatterplot(data=trainset[trainset['housing_median_age']>=52], x="longitude", y="latitude", hue="median_house_value", alpha=0.5, s=80)  # two main clusters (San Francisco & Los Angeles)

Impute Nulls

In [None]:
median = trainset['total_bedrooms'].median()

In [None]:
# Fill missing total_bedrooms in both splits with the median computed on the
# training set, so the test set is imputed with the same training statistic.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that form is chained assignment,
# which pandas warns about and which leaves the frame unchanged when
# Copy-on-Write is enabled.
trainset['total_bedrooms'] = trainset['total_bedrooms'].fillna(value=median)
testset['total_bedrooms'] = testset['total_bedrooms'].fillna(value=median)

New features

In [None]:
# Add two ratio features to each split (training set first, then test set):
# bedrooms per household and bedrooms per room.
for frame in (trainset, testset):
    bedroom_counts = frame['total_bedrooms']
    frame['bedrooms_per_household'] = bedroom_counts / frame['households']
    frame['bedrooms_per_room'] = bedroom_counts / frame['total_rooms']

Encode categorical variables

In [None]:
# Remove, in place, every row whose ocean_proximity is 'ISLAND' from both the
# training and the test frame.
for frame in (trainset, testset):
    island_rows = frame.index[frame['ocean_proximity'] == 'ISLAND']
    frame.drop(island_rows, inplace=True)

In [None]:
trainset.info()

In [None]:
# One-hot encode ocean_proximity. The list of levels is taken from the training
# set and imposed on both frames, so train and test always receive the same
# dummy columns in the same order and drop_first removes the same reference
# level in both, even if a level happens to be absent from one split.
# A test value that never occurs in the training set becomes an all-zero row.
proximity_levels = sorted(trainset['ocean_proximity'].dropna().unique())
trainset = pd.get_dummies(
    trainset.assign(
        ocean_proximity=pd.Categorical(trainset['ocean_proximity'], categories=proximity_levels)
    ),
    columns=['ocean_proximity'], sparse=False, drop_first=True,
)
testset = pd.get_dummies(
    testset.assign(
        ocean_proximity=pd.Categorical(testset['ocean_proximity'], categories=proximity_levels)
    ),
    columns=['ocean_proximity'], sparse=False, drop_first=True,
)

In [None]:
trainset.info()

New feature

In [None]:
# New feature: people per household, computed for the training set and then
# for the test set with the same formula.
for frame in (trainset, testset):
    frame['population_per_households'] = frame['population'].div(frame['households'])

Remove multicollinearity

In [None]:
# Annotated heatmap of the pairwise correlations between all training columns.
# seaborn is already imported as sns in an earlier cell; this import repeats it.
import seaborn as sns
sns.heatmap(trainset.corr(), annot=True)

In [None]:
# List every pair of distinct columns whose absolute correlation exceeds 0.5.
# Only positions above the diagonal are kept, so each pair appears once and no
# column is paired with itself.
corr_matrix = trainset.corr().abs()
row_pos, col_pos = np.where(corr_matrix > 0.5)
labels = corr_matrix.columns
high_corr_var = [(labels[i], labels[j]) for i, j in zip(row_pos, col_pos) if i < j]

In [None]:
high_corr_var

In [None]:
# Hand-written list of columns to inspect in the focused heatmap below.
# It rebinds high_corr_var, replacing the list of pairs computed above.
high_corr_var = ['total_rooms', 'total_bedrooms', 'population', 'median_income', 'households',  'median_house_value', 'bedrooms_per_room',
                'bedrooms_per_household']

In [None]:
plt.figure(figsize=(14,14))
sns.heatmap(trainset[high_corr_var].corr(), annot=True)

In [None]:
# Drop the same four columns, in place, from the training and the test frame.
redundant_columns = ['total_bedrooms', 'population', 'bedrooms_per_household', 'households']
for frame in (trainset, testset):
    frame.drop(columns=redundant_columns, inplace=True)

In [None]:
trainset.info()

In [None]:
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age", "bedrooms_per_room", "population_per_households"]
scatter_matrix(trainset[attributes], figsize=(20, 15))

In [None]:
trainset.describe()

In [None]:
def get_iqr_results(num_series, k=1.5):
    """Return the IQR-based outlier limits of a numeric sample.

    The limits are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        num_series: One-dimensional array-like of numbers (list, NumPy array
            or pandas Series). NaN entries are ignored.
        k: Multiplier applied to the IQR to obtain the cut-off distance.

    Returns:
        dict with the keys ``'lower'`` and ``'upper'`` holding the two limits.
    """
    # nanpercentile skips missing values. np.percentile returns NaN for both
    # quartiles as soon as one NaN is present, and NaN limits make every later
    # "value < lower" / "value > upper" comparison False, silently disabling
    # the capping.
    q25, q75 = np.nanpercentile(num_series, [25, 75])

    # Distance from each quartile beyond which a value is treated as an outlier.
    cutoff = (q75 - q25) * k

    return {'lower': q25 - cutoff, 'upper': q75 + cutoff}

In [None]:
trainset.info()

In [None]:
numerical_columns = ['housing_median_age', 'total_rooms', 'median_income', 'bedrooms_per_room', 'population_per_households']

In [None]:
column_limits = {}

In [None]:
# Store the IQR limits of every selected numeric column, computed on the
# training set, under the column's name.
column_limits.update(
    (name, get_iqr_results(trainset[name])) for name in numerical_columns
)

In [None]:
column_limits

In [None]:
trainset.shape

In [None]:
# Cap every selected column at its training-set IQR limits: values below the
# lower limit are raised to it and values above the upper limit are lowered to
# it. The same training limits are applied to the test set.
for column in numerical_columns:
    low = column_limits[column]['lower']
    high = column_limits[column]['upper']
    trainset[column] = trainset[column].clip(lower=low, upper=high)
    testset[column] = testset[column].clip(lower=low, upper=high)

In [None]:
trainset.describe()

In [None]:
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age", "bedrooms_per_room", "population_per_households"]
scatter_matrix(trainset[attributes], figsize=(20, 15))

Model development

In [None]:
# Split the engineered frames back into predictors and target for modelling.
# These assignments rebind xtrain/ytrain/xtest/ytest from the earlier raw split.
xtrain = trainset.drop(columns=["median_house_value"])
ytrain = trainset["median_house_value"]

xtest = testset.drop(columns=["median_house_value"])
ytest = testset["median_house_value"]

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training predictors only, then apply
# that same fitted transformation to the test predictors.
xtrain_scaled = scaler.fit_transform(xtrain)
xtest_scaled = scaler.transform(xtest)

In [None]:
xtrain_scaled = pd.DataFrame(xtrain_scaled, index=xtrain.index, columns=xtrain.columns)
xtest_scaled = pd.DataFrame(xtest_scaled, index=xtest.index, columns=xtest.columns)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
lin_reg=LinearRegression()
lin_reg.fit(xtrain_scaled,ytrain)

In [None]:
ypred = lin_reg.predict(xtest_scaled)

In [None]:
mse_test = mean_squared_error(ytest,ypred)
rmse_test  = np.sqrt(mse_test)

In [None]:
rmse_test

In [None]:
ytpred = lin_reg.predict(xtrain_scaled)

In [None]:
mse_train = mean_squared_error(ytrain,ytpred)
rmse_train = np.sqrt(mse_train)

In [None]:
rmse_train

In [None]:
# Baseline random forest with default hyper-parameters. The seed is fixed, as
# for every other forest in this notebook, so that the reported train/test
# RMSE does not change from run to run.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(xtrain_scaled, ytrain)

In [None]:
ypred = forest_reg.predict(xtest_scaled)
ytpred = forest_reg.predict(xtrain_scaled)

In [None]:
rf_rmse_train = np.sqrt(mean_squared_error(ytrain,ytpred))
rf_rmse_test = np.sqrt(mean_squared_error(ytest,ypred))

In [None]:
rf_rmse_train

In [None]:
rf_rmse_test #overfitting

In [None]:
# Support vector regressor with a linear kernel, fitted on the scaled training
# predictors; predictions are produced for both the training and the test set.
svm_reg = SVR(kernel="linear")
svm_reg.fit(xtrain_scaled, ytrain)
ytpred = svm_reg.predict(xtrain_scaled)  # predictions on the training data
ypred = svm_reg.predict(xtest_scaled)  # predictions on the held-out data

In [None]:
svm_mse_train = mean_squared_error(ytrain, ytpred)
svm_mse_test = mean_squared_error(ytest, ypred)
svm_rmse_train = np.sqrt(svm_mse_train)
svm_rmse_test = np.sqrt(svm_mse_test)

In [None]:
svm_rmse_train

In [None]:
svm_rmse_test

In [None]:
xtrain_scaled.shape

In [None]:
forest_reg = RandomForestRegressor(n_estimators=15, max_features=4, max_depth=8, random_state=42)
forest_reg.fit(xtrain_scaled, ytrain)
ypred = forest_reg.predict(xtest_scaled)
ytpred = forest_reg.predict(xtrain_scaled)
rf_rmse_train = np.sqrt(mean_squared_error(ytrain,ytpred))
rf_rmse_test = np.sqrt(mean_squared_error(ytest,ypred))

In [None]:
rf_rmse_train

In [None]:
rf_rmse_test

In [None]:
# First exhaustive search over random-forest hyper-parameters, scored by
# negated RMSE with 5-fold cross-validation on the training set.
params = [
    {'n_estimators': [10, 15, 20, 30], 'max_features': [2, 4, 6, 8], 'min_samples_split': [2, 4],
     'min_samples_leaf': [1, 2, 3], 'bootstrap': [True, False]}
]

forest_reg = RandomForestRegressor(random_state=42)
# n_jobs=-1 spreads the 192 candidates x 5 folds (960 fits) over all CPU cores.
# The estimator is seeded, so the scores are the same as in a sequential run.
grid_search = GridSearchCV(forest_reg, params, cv=5, scoring='neg_root_mean_squared_error',
                           return_train_score=True, n_jobs=-1)
grid_search.fit(xtrain_scaled, ytrain)

In [None]:
grid_search.best_params_

In [None]:
grid_search.cv_results_

In [None]:
# Refit a forest with explicitly written hyper-parameters and measure RMSE on
# the training and test sets.
# NOTE(review): these values all lie inside the first search grid and look like
# a manual copy of grid_search.best_params_ - confirm they still match.
forest_reg = RandomForestRegressor(n_estimators=30, max_features=2, bootstrap=False, min_samples_leaf=1, 
                                   min_samples_split=2, random_state=42)
forest_reg.fit(xtrain_scaled, ytrain)
ypred = forest_reg.predict(xtest_scaled)  # predictions on the held-out data
ytpred = forest_reg.predict(xtrain_scaled)  # predictions on the training data
rf_rmse_train = np.sqrt(mean_squared_error(ytrain,ytpred))
rf_rmse_test = np.sqrt(mean_squared_error(ytest,ypred))

In [None]:
rf_rmse_train #overfitting a lot

In [None]:
rf_rmse_test

In [None]:
# Pair each predictor name with the importance the fitted forest assigned to it.
[(feature, importance) for feature, importance in zip(xtrain_scaled.columns, forest_reg.feature_importances_)]

In [None]:
# Second search: the grid now also limits tree depth (max_depth) and uses
# larger minimum split/leaf sizes; scored by negated RMSE with 5-fold
# cross-validation on the training set.
params = [
    {'n_estimators': [15, 20, 30], 'max_features': [2, 4], 'min_samples_split': [3, 4],
     'min_samples_leaf': [2, 3], 'bootstrap': [True, False], 'max_depth': [3, 4, 6]}
]

forest_reg = RandomForestRegressor(random_state=42)
# n_jobs=-1 spreads the 144 candidates x 5 folds (720 fits) over all CPU cores.
# The estimator is seeded, so the scores are the same as in a sequential run.
grid_search = GridSearchCV(forest_reg, params, cv=5, scoring='neg_root_mean_squared_error',
                           return_train_score=True, n_jobs=-1)
grid_search.fit(xtrain_scaled, ytrain)

In [None]:
grid_search.best_params_

In [None]:
# Hand-tuned forest: limited depth and leaf count, each tree fitted on 70% of
# the rows (max_samples=0.7). RMSE is measured on the training and test sets.
forest_reg = RandomForestRegressor(n_estimators=10, max_features=4, max_depth=14, bootstrap=True, random_state=42, max_samples=0.7,  max_leaf_nodes=35)
# Settings tried earlier and currently disabled: min_samples_split=10, min_samples_leaf=14
forest_reg.fit(xtrain_scaled, ytrain)
ypred = forest_reg.predict(xtest_scaled)  # predictions on the held-out data
ytpred = forest_reg.predict(xtrain_scaled)  # predictions on the training data
rf_rmse_train = np.sqrt(mean_squared_error(ytrain,ytpred))
rf_rmse_test = np.sqrt(mean_squared_error(ytest,ypred))

In [None]:
rf_rmse_train

In [None]:
rf_rmse_test

In [None]:
# Third search: larger minimum split/leaf sizes plus a cap on the number of
# leaves (max_leaf_nodes); scored by negated RMSE with 3-fold cross-validation
# on the training set.
params = [
    {'n_estimators': [10, 15, 20], 'max_features': [4, 6], 'min_samples_split': [8, 10, 12],
     'min_samples_leaf': [2, 4, 8], 'bootstrap': [True, False], 'max_depth': [10, 12, 14],
     'max_leaf_nodes': [20, 25, 30, 35]}
]

forest_reg = RandomForestRegressor(random_state=42)
# n_jobs=-1 spreads the 1296 candidates x 3 folds (3888 fits) over all CPU
# cores. The estimator is seeded, so the scores are the same as in a
# sequential run.
grid_search = GridSearchCV(forest_reg, params, cv=3, scoring='neg_root_mean_squared_error',
                           return_train_score=True, n_jobs=-1)
grid_search.fit(xtrain_scaled, ytrain)

In [None]:
grid_search.best_params_

In [None]:
# Refit a forest with explicitly written hyper-parameters and measure RMSE on
# the training and test sets.
# NOTE(review): these values all lie inside the third search grid and look like
# a manual copy of grid_search.best_params_ - confirm they still match.
forest_reg = RandomForestRegressor(n_estimators=20, max_features=6, max_depth=12, min_samples_leaf=8, min_samples_split=8,
                                   bootstrap=True, random_state=42, max_leaf_nodes=35)
# Leftover from an earlier experiment (not the values used above): min_samples_split=10, min_samples_leaf=14
forest_reg.fit(xtrain_scaled, ytrain)
ypred = forest_reg.predict(xtest_scaled)  # predictions on the held-out data
ytpred = forest_reg.predict(xtrain_scaled)  # predictions on the training data
rf_rmse_train = np.sqrt(mean_squared_error(ytrain,ytpred))
rf_rmse_test = np.sqrt(mean_squared_error(ytest,ypred))

In [None]:
rf_rmse_train

In [None]:
rf_rmse_test

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score(ytrain,ytpred)

In [None]:
r2_score(ytest,ypred)

In [None]:
forest_reg_basic = RandomForestRegressor(random_state=42)
forest_reg_basic.fit(xtrain_scaled, ytrain)
ypredb = forest_reg_basic.predict(xtest_scaled)
ytpredb = forest_reg_basic.predict(xtrain_scaled)
rf_rmse_trainb = np.sqrt(mean_squared_error(ytrain,ytpredb))
rf_rmse_testb = np.sqrt(mean_squared_error(ytest,ypredb))

In [None]:
rf_rmse_trainb

In [None]:
rf_rmse_testb

In [None]:
r2_score(ytrain,ytpredb)

In [None]:
r2_score(ytest,ypredb)

In [None]:
from sklearn.linear_model import Ridge

In [None]:
rr = Ridge(alpha=0.1) 

In [None]:
rr.fit(xtrain_scaled, ytrain)

In [None]:
ypred_ridge = rr.predict(xtest_scaled)
ypredt_ridge = rr.predict(xtrain_scaled)

In [None]:
rr_rmse_trainb = np.sqrt(mean_squared_error(ytrain,ypredt_ridge))
rr_rmse_testb = np.sqrt(mean_squared_error(ytest, ypred_ridge))

In [None]:
rr_rmse_trainb

In [None]:
rr_rmse_testb

In [None]:
r2_score(ytrain,ypredt_ridge)

In [None]:
r2_score(ytest,ypred_ridge)