In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input directory,
# one path per line, so the dataset locations used below can be checked.
input_walk = os.walk('/kaggle/input')
for dirname, _, filenames in input_walk:
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from pandas.plotting import scatter_matrix

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_regression

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR


from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# Read the California housing CSV from the attached Kaggle dataset into `df`.
DATA_PATH = "/kaggle/input/california-housing-prices-data-extra-features/California_Houses.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

In [None]:
# Add per-household ratio columns: each new column is a raw count column
# divided by the number of households in the same row.
household_ratios = {
    'bedrooms_per_household': 'Tot_Bedrooms',
    'rooms_per_household': 'Tot_Rooms',
    'persons_per_household': 'Population',
}
for ratio_name, count_column in household_ratios.items():
    df[ratio_name] = df[count_column] / df['Households']

In [None]:
# Print the column names, dtypes and non-null counts, including the ratio
# columns added above, to check the frame before modelling.
df.info()

In [None]:
# Rank every column by the absolute value of its correlation with the target
# (the target itself appears first with a correlation of 1).
target_corr = df.corr()['Median_House_Value']
target_corr.abs().sort_values(ascending=False)

In [None]:
# Hold out a validation split. The seed pins the shuffle so that every
# cross-validation score below is computed on the same training rows after a
# kernel restart (the KFold objects are seeded, but that is moot if the rows
# they split change from run to run).
train,val=train_test_split(df, random_state=7)
# Separate the predictors from the regression target (training rows only).
X,y=train.drop('Median_House_Value',axis=1),train.Median_House_Value
# Display the resulting shapes as a quick sanity check.
train.shape,val.shape,X.shape,y.shape

# Baseline

In [None]:
# Baseline: 10-fold cross-validated KNN regressor on the unscaled features.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = KNeighborsRegressor()
scoring = 'neg_mean_absolute_error'
results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
# The scorer returns the *negated* MAE (so that higher is better); flip the
# sign before printing a value labelled "MAE". The std is unaffected by sign.
print("MAE: %.3f (%.3f)" % (-results.mean(), results.std()))

# MinMaxScaler - StandardScaler - Normalizer

In [None]:
# Shared cross-validation setup for the scaler comparison.
scoring = 'neg_mean_absolute_error'
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = KNeighborsRegressor()

# Candidate preprocessing steps, each paired with the label printed next to its score.
scalers = [
    ('MinMaxScaler', MinMaxScaler()),
    ('StandardScaler', StandardScaler()),
    ('Normalizer', Normalizer()),
]

# Score KNN behind each scaler; the printed value is the mean negated MAE
# across folds (closer to zero is better).
for name, scaler in scalers:
    pipe = Pipeline(steps=[('scaler', scaler), ('model', model)])
    results = cross_val_score(pipe, X, y, scoring=scoring, cv=kfold)
    print(name, results.mean())

# SelectKBest

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# The target (median house value) is continuous, so score features with the
# univariate regression F-test. f_classif is an ANOVA test that expects class
# labels and would treat every distinct price as its own class.
# k='all' keeps every feature; the fit is only used to obtain `skb.scores_`.
skb = SelectKBest(score_func=f_regression, k='all')
skb.fit(X,y)

In [None]:
# Horizontal bar chart of the univariate feature scores, sorted ascending.
feature_scores = pd.DataFrame(data=skb.scores_, index=X.columns, columns=['scores'])
feature_scores.sort_values(by='scores').plot(kind='barh')

In [None]:
# Sweep the number of features kept by SelectKBest in front of a KNN regressor.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = KNeighborsRegressor()
scoring = 'neg_mean_absolute_error'

# Derive the upper bound from the data instead of hard-coding the feature count,
# and rank features with the regression F-test because the target is continuous
# (f_classif is an ANOVA test intended for class labels).
for n in range(1, X.shape[1] + 1):
    skb = SelectKBest(score_func=f_regression, k=n)
    pipe=Pipeline([('skb',skb),('model',model)])
    results = cross_val_score(pipe, X, y, cv=kfold, scoring=scoring)
    # Mean negated MAE across folds (closer to zero is better).
    print(n, results.mean())

# QuantileTransformer

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Cross-validate KNN behind a QuantileTransformer.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = KNeighborsRegressor()
# Seeded so the quantile estimation is repeatable: the transformer subsamples
# rows at random when the data has more rows than its `subsample` setting.
qt=QuantileTransformer(random_state=7)
scoring = 'neg_mean_absolute_error'

# The pipeline contains only the transformer and the model. The SelectKBest
# object that used to be built here was never used and relied on the loop
# variable `n` left over from a previous cell.
pipe=Pipeline([('qt',qt),('model',model)])
results = cross_val_score(pipe, X, y, cv=kfold, scoring=scoring)
# Mean negated MAE across folds (closer to zero is better).
print(results.mean())

# PCA

In [None]:
from sklearn.decomposition import PCA

# Sweep the number of principal components fed to a KNN regressor.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = KNeighborsRegressor()
scoring = 'neg_mean_absolute_error'

# n_components cannot exceed the number of features, so take the bound from X
# rather than hard-coding it.
for n in range(1, X.shape[1] + 1):
    # Seeded because the default 'auto' solver may select a randomized SVD.
    pca = PCA(n_components=n, random_state=7)
    pipe=Pipeline([('pca',pca),('model',model)])
    results = cross_val_score(pipe, X, y, cv=kfold, scoring=scoring)
    # Mean negated MAE across folds (closer to zero is better).
    print(n, results.mean())

# Values of k 

In [None]:
# Shared cross-validation setup for the neighbour-count sweep.
scoring = 'neg_mean_absolute_error'
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

# Score KNN for n_neighbors = 1..20; each line shows k and the mean negated
# MAE across folds (closer to zero is better).
for k in range(1, 21):
    model = KNeighborsRegressor(n_neighbors=k)
    results = cross_val_score(model, X, y, scoring=scoring, cv=kfold)
    print(k, results.mean())

# Ensemble

In [None]:
# Candidate ensemble regressors, each paired with a short label.
# Every estimator is seeded: they are all stochastic (bootstrapping, random
# splits, subsampling), so without a seed the comparison changes on each run
# even though the folds themselves are fixed.
ensembles = []
ensembles.append(('AB', AdaBoostRegressor(random_state=0)))
ensembles.append(('GBM', GradientBoostingRegressor(random_state=0)))
ensembles.append(('RF', RandomForestRegressor(n_estimators=10, random_state=0)))
ensembles.append(('ET', ExtraTreesRegressor(n_estimators=10, random_state=0)))

# The fold definition does not depend on the model, so build it once.
kfold = KFold(n_splits=10, random_state=0, shuffle=True)

# Per-model fold scores and labels, kept in matching order.
results = []
names = []
for name, model in ensembles:
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_absolute_error')
    results.append(cv_results)
    names.append(name)
    # Mean and std of the negated MAE across folds (closer to zero is better).
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)