In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import matplotlib.gridspec as gridspec
import seaborn as sns 
import math 
import re
from IPython.display import display
from PIL import Image
import warnings
warnings.filterwarnings("ignore")

from scipy import stats
from scipy.stats import norm,skew
import folium 
pd.options.display.float_format = '{:.2f}'.format

sns.set_style("whitegrid")
%matplotlib inline

In [None]:
# Cover image of the notebook. Opening the file in a `with` block releases the
# file handle as soon as the image has been rendered.
path = '../input/images/House.jpg'
with Image.open(path) as img:
    display(img)

> # **Thanks a lot for passing by..**.  
- In this notebook I analyze the **California Housing dataset**. I apply some techniques I learned a few weeks ago, such as **GridSearchCV, RandomizedSearchCV, outlier detection**, ***StratifiedShuffleSplit*** and others, on a simple data set.
- There were only about 200 NaN values; by looping over them and building a dictionary I was able to compute an average (within a specific range) to fill in the missing values.
- I evaluate Linear Regression, XGBoost Regressor, Decision Tree Regressor, Support Vector Machine Regressor and Random Forest Regressor, and I will use a Neural Network soon!
- I am using the OSEMN Methodology 
> - LET'S START

In [None]:
# Section banner image. Opening the file in a `with` block releases the file
# handle as soon as the image has been rendered.
path = '../input/images-2/data.png'
with Image.open(path) as img:
    display(img)

# 1. Obtaining Data  

In [None]:
# Load the housing CSV into the working frame `df` (relative input path).
df = pd.read_csv("../input/california-housing-prices/housing.csv")
# Show the last 10 rows as the cell output.
df.tail(10)

In [None]:
# Summary statistics of `df`, shown as the cell output.
df.describe()

# 2. Scrubbing - Cleaning, Filling and Formatting  

### 2.1. Filling 
- In this section I first locate the missing values. I noticed a relationship between total_bedrooms and total_rooms (please check the correlation chart), so for each row with a missing value I look up the other rows that have the same total_rooms and take the average of their total_bedrooms. I then build a dictionary (total_rooms → average total_bedrooms) and use it to replace the missing values. The 15 NaN values left in the dictionary correspond to 15 total_rooms values for which no known total_bedrooms was found.

In [None]:
# Heatmap of the null mask of `df`: highlighted cells are missing values.
fig, ax1 = plt.subplots(1, figsize=(10, 5))

null_mask = df.isnull()
heat = sns.heatmap(null_mask, yticklabels=False, cmap='plasma', ax=ax1)
heat.set_title("Missing Values")
print("Mssing Values")

In [None]:
# ---> Finding missing values
# total_rooms of every row that has at least one NaN, in ascending order.
rows_with_nan = df.isna().any(axis=1)
Missing_values = df.loc[rows_with_nan].sort_values(by='total_rooms')['total_rooms'].values

# --> For each of those total_rooms values, the mean total_bedrooms (rounded to
#     one decimal) over all rows sharing the same total_rooms.
TB = list(Missing_values)  # <-- total_rooms values
MV = [
    round(df.loc[df['total_rooms'] == n_rooms, 'total_bedrooms'].mean(), 1)
    for n_rooms in Missing_values
]  # <-- mean values

# --> Dictionary total_rooms -> mean total_bedrooms (may still contain NaN means)
Key = TB
VAL = MV
dic = dict(zip(Key, VAL))

# --> Keep only the entries whose mean is a real number
new_dic = {rooms_key: mean_val for rooms_key, mean_val in dic.items() if pd.notna(mean_val)}
T_nan_values = len(dic) - len(new_dic)

# Total NaN values
print ("Total Nan Values in dict =",T_nan_values)

In [None]:
# --> Replacing values
# For every total_rooms value with a known bedroom average, fill ONLY the rows
# whose total_bedrooms is missing. (The previous mask `total_bedrooms != i`
# matched almost every row and overwrote real bedroom counts with the average.)
for rooms_count, mean_bedrooms in new_dic.items():
    needs_fill = (df['total_rooms'] == rooms_count) & df['total_bedrooms'].isna()
    df.loc[needs_fill, 'total_bedrooms'] = mean_bedrooms

# Rows that still contain a NaN: no average was available for their total_rooms.
df[df.isnull().any(axis=1)]

In [None]:
# Fill the remaining gaps with the overall column mean. The result is assigned
# back explicitly: an in-place fillna on `df.total_bedrooms` is a chained
# assignment and is not guaranteed to modify `df` on recent pandas versions.
value = df['total_bedrooms'].mean()
df['total_bedrooms'] = df['total_bedrooms'].fillna(value)

### 2.2. Formatting 
-  Encoding the categorical variable (N categories) into one column per category name, using OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode `ocean_proximity`: one 0/1 column per category, each named
# "Ocean_<category>", appended to the right of the original columns.
ohc = OneHotEncoder()
proximity_column = df.ocean_proximity.values.reshape(-1, 1)
ohe = ohc.fit_transform(proximity_column).toarray()

ocean_labels = ["Ocean_" + str(category) for category in ohc.categories_[0]]
dfOneHot = pd.DataFrame(ohe, columns=ocean_labels)

data = pd.concat([df, dfOneHot], axis=1)


data.tail(3)

# 3. Explore - Finding significant Pattern and Trends

### 3.1. Folium - Real Location 
-  Using the Folium library to locate the houses, together with their age and price (only the first 5000 houses) 

In [None]:
# Creating Map
USA = folium.Map(location=[37.880, -122.230], tiles='OpenStreetMap',
                 min_zoom=6, max_zoom=13, zoom_start=7)

# Adding Position: one circle per row for the first 5000 rows of `data`.
map_columns = ['latitude', 'longitude', 'housing_median_age', 'median_house_value']
for lat, lon, age, price in data[map_columns].head(5000).itertuples(index=False, name=None):
    folium.Circle(
        radius=int(price) / 10000,  # circle size grows with the house value
        location=[lat, lon],
        popup='House Age ' + str(age),
        color='crimson',
        # The value is stored in full units, so divide by 1000 before adding
        # the "K" suffix; <b> is the valid HTML tag for bold text.
        tooltip=f'<b>Price:</b> {price / 1000:.1f}K',
        fill=True,
        fill_color='#ccfa00',
    ).add_to(USA)

display(USA)

### 3.2. Heatmap - Correlation
- As you might notice, total_rooms, total_bedrooms and population are strongly correlated. The best approach would have been to find the strongest relations among them and compare them with the target variable "median_house_value" in order to reduce dimensionality (the number of variables). I did not do that here because the data set has only a few columns (variables); instead I will create other feature combinations to see whether I can get better results. You can also check my notebook "House price - advanced regression", where I have an example.

In [None]:
# Correlation
# Restrict to numeric columns explicitly: `data` still holds the text column
# `ocean_proximity`, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently skipping it.
correlation = data.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(15, 10))
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(correlation)
sns.heatmap(correlation, annot=True, mask=mask, ax=ax,
            linewidths=4, cmap='viridis', square=True).set_title("Correlation")
# Widen the y-limits by half a cell on each side.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
print("Heatmap - Correlation")

* Now I am interested in the average number of rooms per household, the average population per household and the ratio of bedrooms to rooms. ***The following code was adapted from the book "Hands-On Machine Learning with Scikit-Learn"***

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions used by CombiAttri.transform to index the input matrix.
rooms, bedrooms, population, house = 3, 4, 5, 6


class CombiAttri(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D value matrix.

    Two columns are always appended: rooms per household and population per
    household. When ``add_bedroom_per_room`` is true, a third column with the
    bedrooms-per-room ratio is appended as well.
    """

    def __init__(self, add_bedroom_per_room=True):
        self.add_bedroom_per_room = add_bedroom_per_room

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, house]
        total_rooms = X[:, rooms]

        extra_columns = [
            total_rooms / households,        # rooms per household
            X[:, population] / households,   # population per household
        ]
        if self.add_bedroom_per_room:
            extra_columns.append(X[:, bedrooms] / total_rooms)  # bedrooms per room

        return np.c_[(X, *extra_columns)]
        
# Running: append the ratio columns to the raw value matrix of `data`.
others = CombiAttri()
extradata = others.transform(data.values)

# Showing: wrap the result in a frame that keeps the original index and adds
# names for the three new columns.
new_columns = ["rooms_per_household", "population_per_household", "bedrrom_per_room"]
fdata = pd.DataFrame(extradata,
                     index=data.index,
                     columns=list(data.columns) + new_columns)

fdata.head()

In [None]:
# Drop the text column (already one-hot encoded) and cast every remaining
# column to float.
fdata = fdata.drop(columns='ocean_proximity').astype(float)

### 3.3. Analyzing the Target Variable "median_house_value"
- Here we can see that the distribution has a long tail to the right and several values above 500K, which will bias the model. The best options I can think of are: 1. normalize the variable, 2. drop some values. 

TARGET 

In [None]:
def mul_plot(df, feature):
    """Plot the distribution of one column and print its skewness and kurtosis.

    Draws three panels on one figure: a box plot (left, full height), a
    histogram (top right) and a QQ-plot (bottom right).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to inspect.
    feature : str
        Name of the column of ``df`` to plot and summarise.
    """
    fig = plt.figure(constrained_layout=True, figsize=(12, 8))
    grid = gridspec.GridSpec(ncols=3, nrows=2, figure=fig)
    values = df.loc[:, feature]

    ax1 = fig.add_subplot(grid[0, 1:3])
    ax1.set_title("Histogram")
    sns.distplot(values, norm_hist=True, ax=ax1)

    ax2 = fig.add_subplot(grid[1, 1:3])
    ax2.set_title("QQ_plot")
    stats.probplot(values, plot=ax2)

    ax3 = fig.add_subplot(grid[:2, 0])
    ax3.set_title("Box Plot")
    sns.boxplot(values, orient="v", ax=ax3)

    # Summarise the column that was actually plotted; the statistics used to be
    # read from the global `fdata['median_house_value']` regardless of the
    # arguments passed in.
    print("Skewness: " + str(round(values.skew(), 3)))
    print("Kurtosis: " + str(round(values.kurt(), 3)))

mul_plot (fdata,'median_house_value')

In [None]:

# Row labels to discard. Each rule below flags one group of atypical rows.
house_value = fdata['median_house_value']
income = fdata['median_income']

indexes_1 = fdata.loc[house_value > 500000].index.to_list()
indexes_2 = fdata.loc[(house_value > 100000) & (income >= 10)].index.to_list()
indexes_3 = fdata.loc[(house_value > 280000) & (income <= 2)].index.to_list()
# NOTE(review): the next two rules were both assigned to `indexes_4`, so the
# first one was silently discarded. They now have distinct names and both are
# applied — confirm that this is the intended filter.
indexes_4 = fdata.loc[(house_value < 280000) & (income >= 8)].index.to_list()
indexes_5 = fdata.loc[income >= 9].index.to_list()

# A row can match several rules; keep each label once.
total_drop = sorted(set(indexes_1 + indexes_2 + indexes_3 + indexes_4 + indexes_5))

In [None]:
# Remove the flagged rows and renumber the index from 0. The result has to be
# assigned back: reset_index returns a new frame, so the bare calls used before
# left the index untouched.
fdata = fdata.drop(index=total_drop).reset_index(drop=True)
# The labels in `total_drop` referred to the old index; empty the list so that
# re-running this cell on its own cannot drop unrelated rows.
total_drop = []

fdata.head()

OUTLIERS 

- ***General Outliers***
- **LocalOutlierFactor** which use Knn to find outliers ... here you can have a great explanation https://www.youtube.com/watch?v=vnoBkTa7arI&t=14s

In [None]:
from sklearn.neighbors import LocalOutlierFactor

def outliers(x, y, top=5, plot=True, n_neighbors=20, contamination=0.1):
    """Rank the values of ``x`` by Local Outlier Factor and return the worst ones.

    Only ``x`` is scored (as a single feature); ``y`` is used for the scatter
    plot alone.

    Parameters
    ----------
    x : array-like
        One-dimensional values to score.
    y : array-like
        Values plotted against ``x``; same length as ``x``.
    top : int, default 5
        Number of most outlying observations to return.
    plot : bool, default True
        When true, draw a scatter of ``x`` vs ``y`` coloured by ``exp(score)``.
    n_neighbors : int, default 20
        Passed to ``LocalOutlierFactor``.
    contamination : float, default 0.1
        Passed to ``LocalOutlierFactor``.

    Returns
    -------
    pandas.Index
        Index labels of the ``top`` observations with the lowest
        ``negative_outlier_factor_``. When ``x`` is a pandas object its own
        labels are returned, otherwise the positions.
    """
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    x_ = np.asarray(x).reshape(-1, 1)
    # Only the fitted scores are needed, not the inlier/outlier predictions.
    lof.fit(x_)
    lof_scr = lof.negative_outlier_factor_
    out_idx = pd.Series(lof_scr).sort_values()[:top].index

    # Translate positions into the caller's labels, so the result can be used
    # with .loc/.drop even when the index of `x` is not 0..n-1.
    if isinstance(x, (pd.Series, pd.DataFrame)):
        out_idx = x.index[out_idx]

    if plot:
        f, ax = plt.subplots(figsize=(9, 6))
        ax.scatter(x=x, y=y, c=np.exp(lof_scr), cmap='RdBu')
    return out_idx

outs = outliers(fdata['median_house_value'], fdata['median_income'],top=5)
print("Outliers detected:",outs)
plt.show()

DROPPING VALUES AND NORMALIZING

In [None]:
''' Normalizing '''

# np.log1p computes log(1 + x) with the natural logarithm; it is applied to the
# house value column to reduce the right skew seen in the plots above.
fdata = fdata.assign(median_house_value=np.log1p(fdata['median_house_value']))

# Re-draw the diagnostic plots on the transformed column.
mul_plot(fdata, 'median_house_value')

In [None]:
# reset_index returns a new frame; assign it back so the index really is
# renumbered from 0 after the rows removed earlier.
fdata = fdata.reset_index(drop=True)
fdata.head()

# 4. Model - Machine Learning 

In [None]:
# Section banner image. Opening the file in a `with` block releases the file
# handle as soon as the image has been rendered.
path = '../input/images/machine.gif'
with Image.open(path) as img:
    display(img)

MODELS & SPLITTING

In [None]:
#----> Applying Machine Learning 
from sklearn import preprocessing 
from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import StratifiedShuffleSplit,cross_val_score
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score

In [None]:
# ------> Choosing the best model
def Evaluating(model, X, Y, CV, Criteria, sqrt=True):
    """Cross-validate ``model`` and record one summary score.

    The score is appended to a module-level list, which must exist before the
    call: ``MSE`` when ``sqrt`` is true, ``R2`` otherwise.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X, Y : array-like
        Features and target.
    CV : int or cross-validation splitter
        Forwarded as ``cv``.
    Criteria : str
        Forwarded as ``scoring``.
    sqrt : bool, default True
        When true, the mean score is negated and square-rooted (for negated
        error scores such as 'neg_mean_squared_error', this gives an RMSE).
        When false, the plain mean score is used.

    Returns
    -------
    float
        The value that was appended, rounded to two decimals.
    """
    scores = cross_val_score(model, X, Y, cv=CV, scoring=Criteria)

    if sqrt:
        final = round(np.sqrt(-scores.mean()), 2)
        MSE.append(final)
    else:
        final = round(scores.mean(), 2)
        R2.append(final)
    return final

# Machine Learning Algorithms + Cross_Validation

In [None]:
#-----> Categorizing
# Bin median_income into 5 categories; used only to stratify the split.
fdata['income_cat'] = pd.cut(fdata["median_income"],
                             bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                             labels=[1, 2, 3, 4, 5])

#------> Stratify according to income category to keep a proportional distribution
# NOTE(review): the narrative above treats median_house_value as the target, but
# this cell predicts median_income — confirm which one is intended.
criteria = fdata['income_cat']
# income_cat is derived from the target, so it must not be part of the features
# (it previously stayed in `X` and leaked into the cross-validation below).
X = fdata.drop(["median_income", "income_cat"], axis=1)
y = fdata["median_income"]


# -----> Splitting Data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=criteria)

# ------> MinMaxScaler
# Fit the scaler on the training split only and reuse it for the test split;
# re-fitting on the test data would scale the two splits differently.
MX = MinMaxScaler()
X_train = MX.fit_transform(X_train)
X_test = MX.transform(X_test)

X = X.values
y = y.values

In [None]:
# ------> Algorithms before tuning (all with their default settings)
LN = LinearRegression()
SGDR = SGDRegressor()
DT = DecisionTreeRegressor()
RDF = RandomForestRegressor()
SVR_rbf = SVR()
XR = XGBRegressor()

# Result lists filled by Evaluating(); the leading NaN is the placeholder for
# the 'Neural_Network' row of the table below.
MSE = [np.nan]  # root of the mean squared error
R2 = [np.nan]

# ------> Running models: first the error metric for every model, then R2.
# NOTE(review): X is passed unscaled here — confirm that is intended for the
# scale-sensitive estimators (SGD, SVR).
baseline_models = (LN, SGDR, DT, RDF, SVR_rbf, XR)
for scoring_name, take_root in (('neg_mean_squared_error', True), ('r2', False)):
    for estimator in baseline_models:
        Evaluating(estimator, X, y, 4, scoring_name, sqrt=take_root)

row_labels = ['Neural_Network', 'Linear Regression', 'Stochastic Gradient Descent',
              'DecisionTreeRegressor', 'RandomForestRegressor',
              'Support Vector Machine', 'XGB Regressor']
Best_model = pd.DataFrame(data={'MSE': MSE, 'R2': R2}, index=row_labels)
Best_model.sort_values(by='MSE', ascending=True)

# **Hyperparameter** + **Cross_Validation** 

In [None]:
# Rebuild the feature matrix and target as plain arrays, leaving out the target
# itself and the helper column income_cat.
# NOTE(review): the narrative above treats median_house_value as the target, but
# this cell predicts median_income — confirm which one is intended.
feature_frame = fdata.drop(columns=["median_income", "income_cat"])

X = feature_frame.values
y = fdata["median_income"].values

In [None]:
## -----------------------------> Support Vector Machine
SVR_rbf = SVR()
# C must be strictly positive for SVR, so the grid starts at 0.001 instead of
# 0.00 (every candidate drawn with C=0 would fail to fit).
parameters = {'kernel': ('linear', 'rbf', 'poly'),
              'C': [0.001, 0.01, 0.1, 1],
              'gamma': [0.001, 0.01, 0.1, 1],
              'epsilon': [0.1, 0.2, 0.3]}
SVR_RAND = RandomizedSearchCV(SVR_rbf, parameters, cv=5, n_iter=50,
                              scoring='neg_mean_absolute_error', n_jobs=-1,
                              verbose=5, return_train_score=True, random_state=42)

## -----------------------------> Stochastic Gradient Descent
SGDR = SGDRegressor()
params = {'alpha': [0.1, 0.01, 0.001, 0.0001, 0.00001],
          'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
          'max_iter': [100, 300, 600, 1000, 1200, 1500, 2000],
          'penalty': ['l2', 'l1', 'elasticnet']}
SGDR_random_cv = RandomizedSearchCV(estimator=SGDR, param_distributions=params, cv=5, n_iter=50,
                                    scoring='neg_mean_absolute_error', n_jobs=3,
                                    verbose=5, return_train_score=True, random_state=42)

## -----------------------------> XGB Regressor

# ------> Tuning XGBRegressor
XR = XGBRegressor()

# ------> Hyper Parameter Optimization
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster = ['gbtree', 'gblinear']
learning_rate = [0.05, 0.1, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]
base_score = [0.25, 0.5, 0.75, 1]

# ------> Define the grid of Hyperparameters to search
hyperparameter_grid = {'n_estimators': n_estimators, 'max_depth': max_depth, 'booster': booster,
                       'learning_rate': learning_rate, 'min_child_weight': min_child_weight,
                       'base_score': base_score}

XR_random_cv = RandomizedSearchCV(estimator=XR, param_distributions=hyperparameter_grid, cv=5,
                                  n_iter=50, scoring='neg_mean_absolute_error', n_jobs=3,
                                  verbose=5, return_train_score=True, random_state=42)

## -----------------------------> RandomForestRegressor
RDF = RandomForestRegressor()
# The grid used to be ONE dict with 'n_estimators' and 'max_features' written
# twice; in a dict literal the later key wins, so the first two lists were
# silently ignored. GridSearchCV accepts a list of dicts, each explored as its
# own grid.
# NOTE(review): [3, 10, 30] replaces the original [30, 10, 30], presumably a
# typo — confirm.
parameters = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
clf_RDF = GridSearchCV(RDF, parameters, cv=5, scoring='neg_mean_squared_error',
                       return_train_score=True)


In [None]:
## ----> Best estimators
# Hard-coded estimator configurations evaluated in the next cell.
# NOTE(review): presumably copied from the `best_estimator_` of the searches
# defined above; none of those searches is fitted in the visible cells — confirm.
BSVR = SVR(C=1, gamma=1, kernel='poly')
BSGDR= SGDRegressor(alpha=0.1, learning_rate='adaptive')
BRF = RandomForestRegressor(bootstrap=False, max_features=4, n_estimators=10)
# NOTE(review): this looks like a pasted estimator repr; whether every keyword
# below is accepted depends on the installed xgboost version — verify.
BXR = XGBRegressor(base_score=1, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.15, max_delta_step=0, max_depth=5,
             min_child_weight=2, missing=None, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# Tuned models to score; BSVR is left out of the run.
Model = [BSGDR, BRF, BXR]

# Result lists filled by Evaluating(); the two leading NaNs are placeholders
# for the 'Neural_Network' and 'Support Vector Machine' rows of the table.
MSE = [np.nan, np.nan]  # root of the mean squared error
R2 = [np.nan, np.nan]   # R2

# First the error metric for every model, then R2.
for scoring_name, take_root in (('neg_mean_squared_error', True), ('r2', False)):
    for estimator in Model:
        Evaluating(estimator, X, y, 4, scoring_name, sqrt=take_root)

row_labels = ['Neural_Network', 'Support Vector Machine', 'Stochastic Gradient Descent',
              'RandomForestRegressor', 'XGB Regressor']
Best_model_2 = pd.DataFrame(data={'MSE': MSE, 'R2': R2}, index=row_labels)
Best_model_2.sort_values(by='MSE', ascending=True)

 # **Neural Network** 

In [None]:
pip install livelossplot

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Dropout
#from tensorflow.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD
from livelossplot import PlotLossesKeras


# Modeling: two hidden ReLU layers followed by a single output unit.
# The input width is taken from the training matrix instead of being hard-coded.
model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(4, kernel_initializer='normal', activation='relu'))

# Output layer: this is a regression, so the output must be unbounded. A
# sigmoid would restrict every prediction to (0, 1), far below the range of the
# target.
model.add(Dense(1, kernel_initializer='normal', activation='linear'))

# Compiling
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=100, batch_size=1000, callbacks=[PlotLossesKeras()], verbose=0)

In [None]:
# Predictions of the network on the test split.
NN_model = model.predict(X_test)
# The square root of the MSE is reported, so label it as RMSE.
nn_rmse = np.sqrt(mean_squared_error(y_test, NN_model))
print("The Root Mean Squared Error using NN is ", round(nn_rmse, 4))

 # ***Neural Network + K Fold***

In [None]:
from sklearn.model_selection import KFold
from sklearn import metrics

def build_regressor(n_features):
    """Return a freshly compiled network: 16 -> 4 -> 1 units, MSE loss, adam."""
    net = Sequential()
    net.add(Dense(16, input_dim=n_features, kernel_initializer='normal', activation='relu'))
    net.add(Dense(4, kernel_initializer='normal', activation='relu'))
    # Linear output: a sigmoid would restrict the regression output to (0, 1).
    net.add(Dense(1, kernel_initializer='normal', activation='linear'))
    net.compile(loss='mse', optimizer='adam', metrics=['mse'])
    return net


# Keep 10% of the data aside as a hold-out set; seeded so the split is repeatable.
x_main, x_holdout, y_main, y_holdout = train_test_split(X, y, test_size=0.10, random_state=42)

# Cross-validate
kf = KFold(5)

oos_y = []      # true targets of every validation fold
oos_pred = []   # out-of-sample predictions of every validation fold
hist = []       # per-fold training history (one DataFrame per fold)
fold = 0

for train, test in kf.split(x_main):
    fold += 1
    print(f"Fold #{fold}")

    x_train = x_main[train]
    y_train = y_main[train]
    x_test = x_main[test]
    y_test = y_main[test]

    # A new model per fold, so no weights carry over between folds.
    model = build_regressor(x_main.shape[1])

    model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=100, batch_size=1000)
    history = pd.DataFrame(model.history.history)
    hist.append(history)
    pred = model.predict(x_test)

    oos_y.append(y_test)
    oos_pred.append(pred)

    # Measure accuracy
    score = np.sqrt(metrics.mean_squared_error(pred, y_test))
    print(f"Fold score (RMSE): {score}")

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
# Stored under its own name so it is not overwritten by the hold-out score.
cv_score = np.sqrt(metrics.mean_squared_error(oos_pred, oos_y))
print()
print(f"Cross-validated score (RMSE): {cv_score}")

# Hold-out prediction (from the last neural network trained in the loop)
holdout_pred = model.predict(x_holdout)

holdout_score = np.sqrt(metrics.mean_squared_error(holdout_pred, y_holdout))
print(f"Holdout score (RMSE): {holdout_score}")

# `score` ends up holding the hold-out RMSE, as it did before.
score = holdout_score

In [None]:
# Recompute both scores from the stored predictions: by this point `score`
# holds only the hold-out value, so printing it twice reported the same number
# under both labels.
cv_rmse = np.sqrt(metrics.mean_squared_error(oos_y, oos_pred))
holdout_rmse = np.sqrt(metrics.mean_squared_error(y_holdout, holdout_pred))
print(f"Cross-validated score (RMSE): {cv_rmse}")
print(f"Holdout score (RMSE): {holdout_rmse}")

In [None]:
# One panel per fold, two panels per row. Every fold stored in `hist` is drawn
# (the fixed 2x2 grid left the last fold out), and each history column gets its
# own labelled line so the curves can be told apart.
n_folds = len(hist)
n_cols = 2
n_rows = max(1, -(-n_folds // n_cols))  # ceiling division, at least one row

fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 15), squeeze=False)
panels = list(axs.flat)

for fold_no, (ax, fold_history) in enumerate(zip(panels, hist), start=1):
    for column in fold_history.columns:
        ax.plot(fold_history.index, fold_history[column], label=column)
    ax.set_title(f'K_Fold_{fold_no}')
    ax.set(xlabel='EPOCHS', ylabel='MSE')
    ax.legend()

# Hide the panels that have no fold to show (odd number of folds).
for ax in panels[n_folds:]:
    ax.set_visible(False)

# ***Conclusion***

### This "Kernel" is for practice purposes only:
* After hyperparameter tuning the model seems to be overfitted 
* XGBoost Regressor and the NN regressor have the best performance
* The K-Fold + NN code is from @Jeff Heaton
* You can find the code in his video: https://www.youtube.com/watch?v=maiQf8ray_s
