In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import seaborn as sns

import itertools

import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.linear_model import Lasso, LogisticRegression # For LASSO & Logistic 
from sklearn.metrics import mean_squared_error # For evaluation
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor   #Decision Tree Regressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier

import warnings # Suppress warnings because they are annoying
warnings.filterwarnings('ignore') 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
# Load the California housing data and drop every row that has a missing value.
data = pd.read_csv('../input/california-housing-prices/housing.csv')
data = data.dropna(axis=0)
print('Final data has ',data.shape[0],'rows and ',data.shape[1],' columns') 

# One-hot encode ocean_proximity; drop_first=True drops the first level so it acts as the baseline.
# dtype=float keeps the dummy columns numeric: recent pandas versions produce bool dummies by
# default, which leaves the design matrix with mixed dtypes when it is handed to statsmodels below.
data = pd.get_dummies(data, columns=['ocean_proximity'], drop_first=True, dtype=float)

# Define X & y for model building: every column except the target is a predictor.
X = data.copy().drop(['median_house_value'], axis=1)
y_cont = data['median_house_value']

print('######################### LINEAR REGRESSION #########################')
# statsmodels OLS does not add an intercept on its own, so append a constant column first.
X_int = sm.add_constant(X)
linreg = sm.OLS(y_cont, X_int).fit()
print(linreg.summary())

# Linear regression model analysis (author's reading of the summary from an earlier run):
# - R-squared is about 0.64, an acceptable fit.
# - All non-categorical variables are significant. Among the dummy variables, only
#   ocean_proximity_NEAR BAY has a high p-value (0.039), implying that this level is not
#   significant when compared to the ocean_proximity base case (<1H OCEAN).
#   NOTE(review): 0.039 is below the usual 0.05 threshold - confirm which significance level is meant.
# - housing_median_age has the biggest impact on median_house_value, followed by
#   total_bedrooms, households and population in decreasing order.
#   NOTE(review): the predictors are unscaled here, so coefficient sizes depend on units - confirm.

In [None]:
# Model assessment: pairwise correlations, then residual diagnostics for the OLS fit.
correlation_matrix = data.corr()
print(correlation_matrix)

# Residuals vs fitted values: a random, patternless cloud would support the linear model.
fitted_values, residuals = linreg.fittedvalues, linreg.resid
plt.scatter(fitted_values, residuals)
plt.title('Residuals vs Fitted Values')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.show()

# Q-Q plot of the residuals against the normal distribution.
qqplot = sm.qqplot(residuals)
plt.show()

#There's a high correlation between longitude and latitude, and between longitude and ocean_proximity_NEAR BAY.
Clearly, the residual plot has a non-random pattern, which shows that the relationship in the data is not linear.
This can be further confirmed by looking at its QQ plot, which is not a straight line.
Since the residual plot is concentrated in the top right, it indicates a non-linear relationship between the variables. According to Tukey's transformation of variables, this residual pattern suggests
transforming X (preferable): raising the variable to the nth power might result in a better-looking residual plot. The value of n is decided by trying various values.
Let's check which variables do not have a linear relationship and need transformation, using the plots below.


In [None]:
# Binarise the target: 0 when the median house value is below the overall median, 1 otherwise.
data['median_house_cat'] = np.where(data['median_house_value'] < np.median(data['median_house_value']), 0, 1)
y_cat = data['median_house_cat']
# Predictors exclude both the continuous target and the binary label derived from it.
X = data.copy().drop(['median_house_value', 'median_house_cat'], axis=1)

print('######################### LOGISTIC REGRESSION #########################')

# NOTE(review): the predictors are unscaled and warnings are silenced at the top of the
# notebook, so a convergence warning from this fit would not be visible - confirm it converged.
logit = LogisticRegression()
logit.fit(X, y_cat)
print(logit.intercept_)
# For a binary target coef_ is 2-D with a single row (one row per fitted class boundary).
# Zipping coef_ itself would pair that whole row with the first column name only, so take
# row 0 to get one (coefficient, feature) pair per predictor.
print(pd.DataFrame(zip(logit.coef_[0], X.columns)))


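Interpreting coefficients (in a logistic regression, a coefficient β multiplies the odds by exp(β) for each one-unit increase in the variable):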

 longitude: A one-unit increase in longitude is associated with a decrease in the odds of the median house value being greater than the median value (coefficient -2.681e+04).

housing median age: A one-unit increase in housing median age is associated with an increase in the odds of the median house value being greater than the median value (coefficient 1072.5200).

population: A one-unit increase in population is associated with a decrease in the odds of the median house value being greater than the median value (coefficient -37.9691).


In [None]:
print('######################### PREDICTIONS #########################')
# Hold out 20% of the rows as the final test set; the remaining 80% is used for fitting and tuning.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y_cont, test_size=0.2, random_state=283)

# Split the remaining rows again: 37% of them form the validation set used to pick hyperparameters.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.37, random_state=283)

print('######################### LASSO MODEL #########################')
# Standardise the predictors before the Lasso fit. The scaler is fitted on the
# training rows only and then applied unchanged to the validation rows.
ss = StandardScaler().fit(X_train)
X_train_lasso = pd.DataFrame(ss.transform(X_train))
X_valid_lasso = pd.DataFrame(ss.transform(X_valid))

# Candidate regularisation strengths: the 21 powers of ten from 1e-10 to 1e10.
alphas = np.logspace(-10, 10, 21)
Validation_Scores = []
for alpha in alphas:
    lm = linear_model.Lasso(alpha=alpha)
    lm.fit(X_train_lasso, y_train)  # fit on the training set
    # Score on the validation set; arguments follow scikit-learn's (y_true, y_pred) order.
    Validation_Scores.append(metrics.mean_squared_error(y_valid, lm.predict(X_valid_lasso)))

# The alpha with the smallest validation MSE wins.
min_alpha = alphas[np.argmin(Validation_Scores)]
print('Best Alpha value is ', min_alpha)

# Refit on train + validation with the selected alpha; the scaler is refitted on that
# combined set and applied unchanged to the test rows.
ss = StandardScaler().fit(X_train_valid)
X_train_valid_lasso = pd.DataFrame(ss.transform(X_train_valid))
X_test_lasso = pd.DataFrame(ss.transform(X_test))
lm = linear_model.Lasso(alpha=min_alpha)
lm.fit(X_train_valid_lasso, y_train_valid)
print(pd.DataFrame(zip(lm.coef_,X.columns)))
print("The MSE on the test set is", metrics.mean_squared_error(y_test, lm.predict(X_test_lasso)))


In [None]:
print('######################### RANDOM FOREST #########################')
# Tune the number of trees and the maximum depth with a manual grid search that is
# scored on the validation split (max_features is not tuned).
n_estimators = [50,100,150,200,250,300]
max_depth = range(10,20)
# Every (n_estimators, max_depth) combination. The "triplet" names are historical:
# each entry is a 2-tuple.
hyperparameter_triplets = list(itertools.product(n_estimators, max_depth))
validation_scores = []  # validation MSE, in the same order as hyperparameter_triplets
for n_trees, depth in hyperparameter_triplets:
    # A fixed random_state makes the search repeatable from one run to the next.
    rf = RandomForestRegressor(n_estimators=n_trees, max_depth=depth, random_state=283)
    rf.fit(X_train, y_train)  # fit on the training set
    validation_scores.append(metrics.mean_squared_error(y_valid, rf.predict(X_valid)))

# Refit the best combination on train + validation and evaluate once on the test set.
best_triplet = hyperparameter_triplets[np.argmin(validation_scores)]
print('Final Tunes Parameters are : n_estimators: ', best_triplet[0], 'max_depth: ', best_triplet[1])
rf = RandomForestRegressor(n_estimators=best_triplet[0], max_depth=best_triplet[1], random_state=283)
rf.fit(X_train_valid, y_train_valid)
rf_test_mse = mean_squared_error(y_test, rf.predict(X_test))
print('MSE of Random Forest Model on Test set is ', rf_test_mse)
print(sorted(zip(rf.feature_importances_,X.columns.values), reverse = True))

print('######################### COMPARING LASSO & RANDOM FOREST #########################')
# Only two hyperparameters are tuned, so best_triplet has no third element to report.
print('Final Tunes Parameters are : n_estimators: ', best_triplet[0], 'max_depth: ', best_triplet[1])


print('MSE of Random Forest Model on Test set is ', rf_test_mse)

Best Alpha for Lasso is 100 & MSE Lasso is 4689741469 

Best n_estimators: 150 & max_depth: 19 & MSE Random Forest is 2545876926.0074525, which is almost half of Lasso's.
According to Random Forest, Median Income is the most important variable (0.48). In Lasso it has the highest coefficient value, 74127.

Second is ocean_proximity_INLAND, in lasso it has a coef of -19856.644608

Followed by longitude and latitude, with Lasso regression coefficient values of -47738.5 and -48245 respectively. These are the most sensitive coefficients.

In [None]:

print('######################### KNN CLASSIFIER MODEL #########################')
# Binary target: 0 when the median house value is below the overall median, 1 otherwise.
data['median_house_cat'] = np.where(data['median_house_value']< np.median(data['median_house_value']),0,1)
y_cat = data['median_house_cat']
# Hold out 20% of the rows as the test set; the rest is used for cross-validated tuning.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y_cat, test_size=0.2, random_state = 283)

n_neighbors = np.arange(1,21,3)
# NOTE(review): per the scikit-learn docs leaf_size affects the speed and memory of the
# neighbour search, so this axis of the grid is expected to leave precision essentially
# unchanged - consider dropping it to shorten the search.
leaf_size = list(range(1,50))
# shuffle must be passed by keyword: current scikit-learn rejects it as a positional argument.
kfold = KFold(n_splits=5, shuffle=False)
hyperparameter_couple = list(itertools.product(n_neighbors, leaf_size))

# The folds do not depend on the hyperparameters, so split and standardise each fold once
# up front. The scaler is fitted on the fold's training part only. Fold-specific names are
# used so the X_train / X_valid frames from the regression cells are not overwritten.
scaled_folds = []
for train_index, valid_index in kfold.split(X_train_valid):
    X_fold_train, y_fold_train = X_train_valid.iloc[train_index], y_train_valid.iloc[train_index]
    X_fold_valid, y_fold_valid = X_train_valid.iloc[valid_index], y_train_valid.iloc[valid_index]
    fold_scaler = StandardScaler().fit(X_fold_train)
    scaled_folds.append((
        pd.DataFrame(fold_scaler.transform(X_fold_train)), y_fold_train,
        pd.DataFrame(fold_scaler.transform(X_fold_valid)), y_fold_valid,
    ))

valid_precision = []  # mean cross-validated precision, aligned with hyperparameter_couple
for k, leaf in hyperparameter_couple:
    fold_precisions = []
    for X_fold_train, y_fold_train, X_fold_valid, y_fold_valid in scaled_folds:
        knn = neighbors.KNeighborsClassifier(n_neighbors = k, leaf_size = leaf)
        knn.fit(X_fold_train, y_fold_train)
        fold_precisions.append(metrics.precision_score(y_fold_valid, knn.predict(X_fold_valid)))
    valid_precision.append(np.mean(fold_precisions))

# argmax already indexes hyperparameter_couple directly; adding 1 would select the
# neighbouring pair and run off the end of the list when the last pair is the best.
best_couple = hyperparameter_couple[np.argmax(valid_precision)]
bestK = best_couple[0]
best_leaf_size = best_couple[1]

# Final precision on the test set: refit the scaler and the model on all train + validation rows.
ss = StandardScaler().fit(X_train_valid)
X_train_valid_knn = pd.DataFrame(ss.transform(X_train_valid))
X_test_knn = pd.DataFrame(ss.transform(X_test))

knn = neighbors.KNeighborsClassifier(n_neighbors = bestK, leaf_size =  best_leaf_size)
knn.fit(X_train_valid_knn, y_train_valid)
y_hat = knn.predict(X_test_knn)
score = metrics.precision_score(y_test, y_hat)
print('Precision Score of KNN Classifier is ',score) # ~ 0.90 in the author's earlier run

print('######################### DECISION TREE MODEL #########################')
# Grid over tree depth and the maximum number of leaves, scored by 5-fold cross-validated precision.
max_depth = list(range(2,50))
leaf_nodes = list(range(2,50))
# shuffle must be passed by keyword: current scikit-learn rejects it as a positional argument.
kfold = KFold(n_splits=5, shuffle=False)
hyperparameter_couple = list(itertools.product(max_depth, leaf_nodes))

# The folds are identical for every hyperparameter pair, so slice them once. Fold-specific
# names are used so the X_train / X_valid frames from the regression cells are not overwritten.
cv_folds = [
    (X_train_valid.iloc[train_index], y_train_valid.iloc[train_index],
     X_train_valid.iloc[valid_index], y_train_valid.iloc[valid_index])
    for train_index, valid_index in kfold.split(X_train_valid)
]

valid_precision = []  # mean cross-validated precision, aligned with hyperparameter_couple
for depth, n_leaves in hyperparameter_couple:
    fold_precisions = []
    for X_fold_train, y_fold_train, X_fold_valid, y_fold_valid in cv_folds:
        # A fixed random_state makes the search repeatable from one run to the next.
        tree_clf = DecisionTreeClassifier(max_depth = depth, max_leaf_nodes = n_leaves, random_state = 283)
        tree_clf.fit(X_fold_train, y_fold_train)
        fold_precisions.append(metrics.precision_score(y_fold_valid, tree_clf.predict(X_fold_valid)))
    valid_precision.append(np.mean(fold_precisions))

# argmax already indexes hyperparameter_couple directly; adding 1 would select the
# neighbouring pair and run off the end of the list when the last pair is the best.
best_couple = hyperparameter_couple[np.argmax(valid_precision)]
best_max_depth = best_couple[0]
best_leaf_node = best_couple[1]

# Final precision on the test set, after refitting on all train + validation rows.
tree_clf = DecisionTreeClassifier(max_depth= best_max_depth, max_leaf_nodes= best_leaf_node, random_state = 283)
tree_clf.fit(X_train_valid, y_train_valid)
y_hat = tree_clf.predict(X_test)
score = metrics.precision_score(y_test, y_hat)
# The label previously said "KNN Classifier", a copy-paste slip from the section above.
print('Precision Score of Decision Tree Classifier is ',score) # ~ 0.91 in the author's earlier run

print('######################### BOOSTED TREE MODEL #########################')
# Grid over tree depth and number of boosting stages, scored by 5-fold cross-validated precision.
max_depth = list(range(2,6))
n_estimators =  np.linspace(100, 500, 10, dtype = int)
# shuffle must be passed by keyword: current scikit-learn rejects it as a positional argument.
kfold = KFold(n_splits=5, shuffle=False)
hyperparameter_couple = list(itertools.product(max_depth, n_estimators))
print("Size is ", len(hyperparameter_couple))

# The folds are identical for every hyperparameter pair, so slice them once. Fold-specific
# names are used so the X_train / X_valid frames from the regression cells are not overwritten.
cv_folds = [
    (X_train_valid.iloc[train_index], y_train_valid.iloc[train_index],
     X_train_valid.iloc[valid_index], y_train_valid.iloc[valid_index])
    for train_index, valid_index in kfold.split(X_train_valid)
]

valid_precision = []  # mean cross-validated precision, aligned with hyperparameter_couple
# enumerate supplies the progress counter, so no variable shadows the builtin iter().
for done_count, (depth, n_stages) in enumerate(hyperparameter_couple, start=1):
    fold_precisions = []
    for X_fold_train, y_fold_train, X_fold_valid, y_fold_valid in cv_folds:
        # A fixed random_state makes the search repeatable from one run to the next.
        gbc_clf = GradientBoostingClassifier(max_depth= depth, n_estimators = n_stages, random_state = 283)
        gbc_clf.fit(X_fold_train, y_fold_train)
        fold_precisions.append(metrics.precision_score(y_fold_valid, gbc_clf.predict(X_fold_valid)))
    valid_precision.append(np.mean(fold_precisions))
    print("Done ", done_count)

best_couple = hyperparameter_couple[np.argmax(valid_precision)]
best_max_depth = best_couple[0]
best_n_estimator = best_couple[1]

# Final precision on the test set. The model must be a GradientBoostingClassifier:
# DecisionTreeClassifier has no n_estimators parameter, so building one here raised a
# TypeError and would not have evaluated the boosted model that was tuned above.
gbc_clf = GradientBoostingClassifier(max_depth= best_max_depth, n_estimators = best_n_estimator, random_state = 283)
gbc_clf.fit(X_train_valid, y_train_valid)
y_hat = gbc_clf.predict(X_test)
score = metrics.precision_score(y_test, y_hat)
print('Precision Score of BOOSTED TREE MODEL is ',score)

# ######################### PCA #########################

# Reload the raw file: data_orig was never defined earlier in the notebook, and `data`
# has since been dummy-encoded and given an extra label column, so it cannot be reused.
data_orig = pd.read_csv('../input/california-housing-prices/housing.csv')
data = data_orig.dropna(axis = 0)
# Every ocean_proximity level is kept here (no drop_first), unlike in the regression cells.
data = pd.get_dummies(data, columns = ['ocean_proximity']) 

X = data.copy().drop(['median_house_value'], axis =1 )
y_cont = data['median_house_value']

# Standardise the predictors before PCA so each one has unit variance.
X_scale = StandardScaler().fit_transform(X)
pca = PCA(n_components=0.9) # keep enough components to capture 90% of the variability
pca.fit(X_scale) 
X_trans = pca.transform(X_scale)
print(X_trans.shape[1], ' principal components are needed to cover 90% variability for this data') 
# The author's earlier run reported 7 components.

# Cumulative explained-variance plot. The x-axis is derived from the number of components
# actually kept, so the plot still works if that number is not 7.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Proportion of Variance Explained')
plt.show()