<a href="https://colab.research.google.com/github/starkjones/NYC-CONDOMINIUM-COMPARABLE-RENTAL-INCOME-ANALYSIS/blob/main/NYC_Condo_Comp_RI__Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **NYC CONDOMINIUM COMPARABLE RENTAL INCOME ANALYSIS**

Jonathan Jones

22.05.05

In [None]:
import pandas as pd
import numpy as np

# Colab-only step: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

## **PRIMARY DATASET**

In [None]:
# Path of the Excel workbook on the mounted Drive.
# NOTE(review): absolute, user-specific path - the notebook only runs where this file exists.
data = '/content/drive/MyDrive/SI/DOF__Condominium_Comparable_Rental_Income___Manhattan___FY_2009_2010_ML.xlsx'
# Read the workbook into a DataFrame and preview the first five rows:
df = pd.read_excel(data)
df.head(5)

## **INITIAL INSPECTION**

In [None]:
# Shape of the data frame as (rows, columns):

df.shape

# Result recorded by the author: 1068 rows, 46 columns

In [None]:
# Unadulterated copy kept aside for the Machine Learning section
# (taken before any renaming, dropping or imputation is applied to df):

df_ML = df.copy()

In [None]:
# Normalise all column names to lowercase so later cells can reference them consistently:

df.columns = df.columns.str.lower()

In [None]:
# Preliminary check of column names, dtypes and non-null counts:

df.info()

## **DATA CLEANING & PREPARATION**

In [None]:
# Check for duplicated rows (count of rows that repeat an earlier row):

df.duplicated().sum()

In [None]:
# Number of distinct values per column:
df.nunique()

In [None]:
# Removal of ineffectual columns (identifiers and coarse location codes).
# NOTE(review): ' comparable rental 1 boro-block-lot' carries a leading space - presumably that
# matches the raw header; drop() raises KeyError if any label here is not an actual column.
remove = {'borough','manhattan condominium property boro-block-lot', 
          'comparable rental 2  boro-block-lot','bin', 'bbl',
          'manhattan condominium property condo section', 
          'census tract', ' comparable rental 1 boro-block-lot', 'postcode', 'community board'}
# In-place drop: re-running this cell on the same df fails because the columns are already gone.
df.drop(columns = remove, inplace = True)

In [None]:
# Count missing values per column:

df.isna().sum()

Imputation Strategy: 
  

Location columns: missing 15 / 1068 or 1.40 % of entries

`bin` and `bbl`: missing 23 / 1068 or 2.15 % of entries

postcode                                                  15
latitude                                                  15
longitude                                                 15
community board                                           15
council district                                          15
census tract                                              15
bin                                                       23
bbl                                                       23
nta                                                       15

Comparable rental 2 columns: each is missing 57 / 1068 or 5.34 % of its entries

comparable rental 2  boro-block-lot                       57
comparable rental 2  address                              57
comparable rental 2  neighborhood                         57
comparable rental 2  building classification              57
comparable rental 2  total units                          57
comparable rental 2  year built                           57
comparable rental 2  gross sqft                           57
comparable rental 2  est. gross income                    57
comparable rental 2  gross income per sqft                57
comparable rental 2  full market value                    57
comparable rental 2  market value per sqft                57
comparable rental 2  dist. from coop in miles             57


In [None]:
# import re

# column = df['comparable rental 2  gross sqft']

# for i in column:
#   if i == int:
#     'ignore'
#   else:
#     i.replace({',':'', "'":""}, regex = True)
#     # re.sub(pattern = '\w', repl = '', string = i)



In [None]:
# Impute missing values.
# Each column is re-assigned from fillna() rather than calling fillna(inplace=True) on a
# column selection: the chained in-place form is unreliable under pandas Copy-on-Write.

# Fill values:
lon = df['longitude'].mean()
lat = df['latitude'].mean()
cd = round(df['council district'].mean(), 1)

cr2_tu = df['comparable rental 2  total units'].mean()
# Median year instead of the value that happened to sit in the second row
# (the previous `.values[1]` was arbitrary and could itself be NaN).
cr2_yb = df['comparable rental 2  year built'].median()
cr2_gs = df['comparable rental 2  gross sqft'].mean()
cr2_egi = df['comparable rental 2  est. gross income'].mean()
cr2_gipsf = df['comparable rental 2  gross income per sqft'].mean()
cr2_fmv = df['comparable rental 2  full market value'].mean()
cr2_mvpsf = df['comparable rental 2  market value per sqft'].mean()
cr2_drc = df['comparable rental 2  dist. from coop in miles'].mean()

# Object imputation: flag absent categorical entries with an explicit label.
object_fill_columns = ['nta',
                       'comparable rental 2  address',
                       'comparable rental 2  neighborhood',
                       'comparable rental 2  building classification']
for column in object_fill_columns:
    df[column] = df[column].fillna('Missing')

# Numeric imputation: column name -> fill value.
numeric_fill_values = {
    'latitude': lat,
    'longitude': lon,
    'council district': cd,
    'comparable rental 2  total units': cr2_tu,
    'comparable rental 2  year built': cr2_yb,
    'comparable rental 2  gross sqft': cr2_gs,
    'comparable rental 2  est. gross income': cr2_egi,
    'comparable rental 2  gross income per sqft': cr2_gipsf,
    'comparable rental 2  full market value': cr2_fmv,
    'comparable rental 2  market value per sqft': cr2_mvpsf,
    'comparable rental 2  dist. from coop in miles': cr2_drc,
}
for column, fill_value in numeric_fill_values.items():
    df[column] = df[column].fillna(fill_value)

In [None]:
# Datatype correction: cast the listed columns to int.
# Every column is cast from ITSELF. The previous version assigned
# 'comparable rental 2  full market value' into
# 'comparable rental 2  gross income per sqft', silently overwriting that feature.
# NOTE(review): astype(int) truncates toward zero, so the 'dist. from coop in miles'
# columns lose their fractional miles - confirm that is intended.

int_columns = [
    'manhattan condominium property gross income per sqft',
    'manhattan condominium property market value per sqft',
    'comparable rental 1  gross income per sqft',
    'comparable rental 1  market value per sqft',
    'comparable rental 1  dist. from coop in miles',
    'comparable rental 2  total units',
    'council district',
    'comparable rental 2  year built',
    'comparable rental 2  gross sqft',
    'comparable rental 2  est. gross income',
    'comparable rental 2  gross income per sqft',
    'comparable rental 2  full market value',
    'comparable rental 2  market value per sqft',
    'comparable rental 2  dist. from coop in miles',
]
for column in int_columns:
    df[column] = df[column].astype(int)

In [None]:
# Final check for missing values after imputation (per-column NaN counts):

df.isna().sum()

In [None]:
# Inspect the distinct values of every object-dtype column to spot inconsistent labels.

datatypes = df.dtypes
object_cols = datatypes[datatypes =='object'].index

for column in object_cols:
  # Header followed by blank lines, then the value counts (NaN included), then spacing.
  print(f'Column Name = {column}', '\n', sep='\n')
  print(df[column].value_counts(dropna=False), '\n\n\n', sep='\n')

In [None]:
# Check for inconsistencies and odd numeric entries via summary statistics (rounded to 1 decimal):

df.describe().round(1)

## **VISUAL ANALYSIS**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation map to ascertain related features.
# Only numeric columns are correlated: df still holds object (text) columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.

correlation = df.select_dtypes(include='number').corr().round(2)

# Flatten the matrix into (feature, feature) pairs, strongest correlation first.
# Self-pairs (correlation 1.00) are included at the top of the list.
c_list = correlation.unstack().sort_values(ascending = False)

c_list.head(40)

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicit 18x18 axes.
fig, hm = plt.subplots(figsize=(18, 18))
hm = sns.heatmap(correlation, ax=hm, cmap='rocket_r', annot=True)
hm.set_title('Correlation Heatmap\n', fontsize= 24);

In [None]:
# Checking the correlation between each feature and the target.
# The first column is skipped (as before) and only numeric columns are correlated,
# because DataFrame.corr() raises on object columns in pandas >= 2.0.

correlated_features = (
    df[df.columns[1:]]
    .select_dtypes(include='number')
    .corr()['manhattan condominium property est. gross income']
)
correlated_features.round(2).sort_values(ascending = False)

## **Features that share a correlation value of 0.5 or better with the target:**

1.  manhattan condominium property est. gross income        1.00
2.  manhattan condominium property full market value        1.00
3.  manhattan condominium property gross sqft               0.96
4.  manhattan condominium property total units              0.85
5.  comparable rental 2  est. gross income                  0.74
6.  comparable rental 2  gross income per sqft              0.73
7.  comparable rental 2  full market value                  0.73
8.  comparable rental 2  gross sqft                         0.71
9.  comparable rental 2  total units                        0.69
10. comparable rental 1 full market value                   0.64
11. comparable rental 1  est. gross income                  0.64
12. comparable rental 1  gross sqft                         0.55
13. comparable rental 1  total units                        0.53



In [None]:
# Scatter of condo locations coloured by gross income per sqft.
# Longitude goes on the x-axis and latitude on the y-axis so the points are laid out
# like a map (north up); the previous version had the two axes swapped.
fig, axes = plt.subplots(nrows=1, ncols =1, figsize=(10,10));
sns.scatterplot(data =df, x='longitude', y='latitude',
                hue = 'manhattan condominium property gross income per sqft', ax=axes);
axes.set_title('Condo Gross Income by Location\n', color= '#081d58', alpha= .8, fontsize =24);
axes.set_xlabel('\nLongitude', fontsize = 16);
axes.set_ylabel('Latitude\n', fontsize = 16);
# Explicit boolean: plt.grid('both') passed 'both' as the visibility flag, not as `which`.
axes.grid(True);


In [None]:
import plotly.express as px

# Interactive map: one point per row placed by latitude/longitude on OpenStreetMap tiles,
# coloured by the condo's market value per sqft.
px.scatter_mapbox(df, lat='latitude',lon='longitude',color='manhattan condominium property market value per sqft',
                  mapbox_style="open-street-map")


## Analysis:

A color-coded map makes distinctions between areas clear and easy to interpret. The highest market values (represented by hues warming from purple to yellow) are concentrated towards the center of the island, between the mid-30s streets and the mid-60s. Prices appear to decrease linearly as we move further north, beyond Central Park's northern boundary (Central Park North).

In [None]:
# Mean gross income per sqft for each condominium neighbourhood, bars in alphabetical order.
fig, axes = plt.subplots(nrows=1, ncols =1, figsize=(14,12));
# Alphabetical bar order is requested through `order=` instead of passing a re-sorted
# Series as x, which depends on how seaborn aligns vectors with `data`.
neighbors = sorted(df['manhattan condominium property neighborhood'].unique())
condoloc = sns.barplot(data =df, x= 'manhattan condominium property neighborhood',
                       y= 'manhattan condominium property gross income per sqft',
                       order= neighbors, ax= axes);
plt.title('Neighborhood by Gross Income per Square Foot\n', color= '#081d58', fontsize =24)
plt.xlabel('\nCondominum Neighborhood', fontsize = 16)
plt.ylabel('Gross Income per Square Foot\n', fontsize =16)
# Numeric rotation: current matplotlib rejects the string '90'.
plt.xticks(rotation= 90);

## Analysis:

The graph above provides a summary of the most and least expensive neighborhoods on the island of Manhattan. We can see that the Midtown Central Business District (CBD) is the most expensive, and Inwood, at the northernmost point of the island, is the least expensive.

In [None]:

# Side-by-side histograms (10 bins each) of full market value for the condo and its two comparables.
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (18,8))
market_value_panels = [
    ('manhattan condominium property full market value', "#6a51a3"),
    ('comparable rental 1 full market value', "#807dba"),
    ('comparable rental 2  full market value', "#bcbddc"),
]
for panel_ax, (panel_column, panel_color) in zip(axes, market_value_panels):
    sns.histplot(df[panel_column], ax=panel_ax, bins=10, color=panel_color,
                 legend=True, linestyle='solid',
                 line_kws={'lw': 2, 'color': 'black'})


In [None]:

# Side-by-side histograms (10 bins each) of estimated gross income for the condo and its two comparables.
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (18,8))
gross_income_panels = [
    ('manhattan condominium property est. gross income', "#ec7014"),
    ('comparable rental 1  est. gross income', "#fe9929"),
    ('comparable rental 2  est. gross income', "#fec44f"),
]
for panel_ax, (panel_column, panel_color) in zip(axes, gross_income_panels):
    sns.histplot(df[panel_column], ax=panel_ax, bins=10, color=panel_color,
                 legend=True, linestyle='solid',
                 line_kws={'lw': 2, 'color': 'black'})

## **MACHINE LEARNING**

In [None]:
# Preliminary check of the unadulterated dataframe copy.
# Column names are lowercased first so they match the names used throughout the notebook:

df_ML.columns = df_ML.columns.str.lower()

df_ML.info()

In [None]:
# Count of fully duplicated rows in the ML copy:
df_ML.duplicated().sum()

In [None]:
# Datatype correction on the ML copy: cast the comparable-rental-1 / condo ratio columns to int.
# The comparable-rental-2 columns and council district are not cast in this cell.

ml_int_columns = (
    'manhattan condominium property gross income per sqft',
    'manhattan condominium property market value per sqft',
    'comparable rental 1  gross income per sqft',
    'comparable rental 1  market value per sqft',
    'comparable rental 1  dist. from coop in miles',
)
for ml_column in ml_int_columns:
    df_ML[ml_column] = df_ML[ml_column].astype(int)

In [None]:
# Removal of ineffectual columns from the ML copy (same labels as dropped from df earlier).
# NOTE(review): ' comparable rental 1 boro-block-lot' carries a leading space - presumably that
# matches the raw header; drop() raises KeyError if any label is not an actual column.
remove = {'borough','manhattan condominium property boro-block-lot', 
          'comparable rental 2  boro-block-lot','bin', 'bbl',
          'manhattan condominium property condo section', 
          'census tract', ' comparable rental 1 boro-block-lot', 'postcode', 'community board'}
# In-place drop: re-running this cell on the same df_ML fails because the columns are already gone.
df_ML.drop(columns = remove, inplace = True)

In [None]:
# Re-check columns, dtypes and non-null counts after the drop:
df_ML.info()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display='diagram')

# Validation split: separate the target (condo est. gross income) from the features,
# then hold out a test set (train_test_split defaults; fixed seed for reproducibility).

X = df_ML.drop(columns='manhattan condominium property est. gross income')
y = df_ML['manhattan condominium property est. gross income']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [None]:
from pandas.core.arrays.sparse import dtype
# Column separation for encoding: a selector for the object-dtype columns,
# called on X_train to list which columns it picks up.

ord_col = make_column_selector(dtype_include='object')
ord_col(X_train)

**ENCODING STRATEGY:**

**All of the categorical columns for this data set are considered to be nominal type by the Department of Finance, as none of their entries are ordered or ranked.**


* manhattan condominium property address

* nta

* manhattan condominium property neighborhood

* manhattan condominium property building classification

* comparable rental 1 address

* comparable rental 1  neighborhood

* comparable rental 1  building classification

* comparable rental 2  address

* comparable rental 2  neighborhood

* comparable rental 2  building classification


In [None]:
# Scaler and One Hot Encoder:
scaler = StandardScaler()

# OHE: dense output, unseen categories encoded as all zeros.
# Newer scikit-learn releases name the dense-output switch `sparse_output`; older ones
# only know `sparse`. Try the current name first and fall back for old installs.
try:
    OHE = OneHotEncoder(handle_unknown= 'ignore', sparse_output = False)
except TypeError:
    OHE = OneHotEncoder(handle_unknown= 'ignore', sparse = False)

**IMPUTATION**

In [None]:
# Imputation strategies: most frequent value (paired with the nominal columns below)
# and column mean (paired with the numeric columns below).
frequent_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy = 'mean')

In [None]:
# Column / feature separation.
# Nominal (unordered categorical) columns are listed explicitly:
nominal_columns = ['manhattan condominium property address',
 'nta',
 'manhattan condominium property neighborhood',
 'manhattan condominium property building classification',
 'comparable rental 1 address',
 'comparable rental 1  neighborhood',
 'comparable rental 1  building classification',
 'comparable rental 2  address',
 'comparable rental 2  neighborhood',
 'comparable rental 2  building classification']
# Numeric columns are picked by dtype when the transformer is fitted:
numeric_columns = make_column_selector(dtype_include = 'number')

In [None]:
# Creating pipelines: impute then encode (nominal) / impute then scale (numeric).
nominal_pipe = make_pipeline(frequent_imputer, OHE)
numeric_pipe = make_pipeline(mean_imputer, scaler)

# (transformer, columns) tuples in the form make_column_transformer expects:
nominal_tuple = (nominal_pipe, nominal_columns)
numeric_tuple = (numeric_pipe, numeric_columns)

In [None]:
# Column Transformer: applies each pipeline to its columns; any other column is passed through unchanged.
preprocessor = make_column_transformer(nominal_tuple, numeric_tuple, remainder = 'passthrough')

# Fit on the training split only, so no statistics are learned from the test split:

preprocessor.fit(X_train, y_train)

# Transform data.
# NOTE: X_train / X_test are overwritten with the transformed output, so this cell cannot be
# re-run on its own - re-run the train/test split cell first.

X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

## **MODELING**

Our target, Manhattan Condominium Property est. Gross Income, is not limited to a finite number of predetermined classes, therefore this is a regression problem. 

In [None]:
# Import libraries for linear regression models:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn import set_config
set_config(display='diagram')


In [None]:
# Constructing a function that prints various test metrics when called:
# Adapted from Josh Johnson:

def model_metrics(y_true, y_pred):
  """Print and return MAE, MSE, RMSE and R2 for a set of predictions.

  Parameters:
    y_true: array-like of true target values.
    y_pred: array-like of predicted target values.

  Returns:
    dict with keys 'MAE', 'MSE', 'RMSE' and 'R2'. Earlier versions returned None,
    which left every `*_score = model_metrics(...)` assignment empty.
  """

  MAE = mean_absolute_error(y_true, y_pred)
  MSE = mean_squared_error(y_true, y_pred)
  RMSE = np.sqrt(MSE)
  R2 = r2_score(y_true, y_pred)

  print(f'SCORES:\n \nMAE: {MAE:,.2f} \nMSE: {MSE:,.2f} \nRMSE: {RMSE:,.2f} \nR2: {R2:.2f}\n')

  return {'MAE': MAE, 'MSE': MSE, 'RMSE': RMSE, 'R2': R2}

In [None]:
# Custom function to plot predictions and evaluate the model:

def plot_r2(X, y, y_pred):
  """Scatter true and predicted targets against X, show the figure, then print the R2 score."""
  # True values first, predictions second, so the legend order matches.
  for values, series_label in ((y, 'True Values'), (y_pred, 'Predicted Values')):
    plt.scatter(X, values, label = series_label)

  plt.legend()
  plt.xlabel('X')
  plt.ylabel('y')
  plt.show()

  # R2 of the predictions against the true values
  score = r2_score(y, y_pred)
  print('\nR2 Score', score)

## **BASELINE REGRESSION MODEL**

In [None]:
# Baseline: a DummyRegressor that always predicts the training-set mean of the target.
# It serves as the reference point for every later model's performance.

DR = DummyRegressor(strategy='mean')

# Model fitting:

DR.fit(X_train, y_train)

# Prediction arrays for the training and test splits:

dum_train_prediction = DR.predict(X_train)
dum_test_prediction = DR.predict(X_test)

# Evaluation of the baseline model (train first, then test):

DR_train_score = model_metrics(y_train, dum_train_prediction)
DR_test_score = model_metrics(y_test, dum_test_prediction)

## **DECISION TREE MODEL**

In [None]:
# Instantiate Decision Tree Model with default hyperparameters (seeded for reproducibility):

DT = DecisionTreeRegressor(random_state = 42)

# Model fit:

DT.fit(X_train, y_train)

# Prediction arrays for the training and test splits:

DT_train_pred = DT.predict(X_train)
DT_test_pred = DT.predict(X_test)


# Evaluation of the default Decision Tree model (train first, then test):

DT_train_score = model_metrics( y_train, DT_train_pred)
DT_test_score = model_metrics(y_test, DT_test_pred)

In [None]:
# A look at the tunable hyperparameters and their current values:

DT.get_params()

#### **Decision Tree Analysis:**

The default decision tree exhibits high variance. We can test a range of max depths to see which value produces the highest test score and smallest errors. 


## **DECISION TREE MODEL 2**

For loop to ascertain optimal max depth :

In [None]:
# Finding the optimal max_depth for the Decision Tree:
# fit one tree per depth from 1 up to the depth the unconstrained tree (DT) reached,
# and record the train / test R2 of each.
# NOTE(review): depths are compared on the test split, so the chosen depth is tuned to the
# test data; a validation split or cross-validation would give an unbiased choice.

depths = range(1, DT.get_depth()+1)
depth_records = []

# Loop over values in depths:

for depth in depths:
  DTo = DecisionTreeRegressor(max_depth = depth, random_state = 42)
  DTo.fit(X_train, y_train)

  # Prediction arrays:
  DTo_train__prediction = DTo.predict(X_train)
  DTo_test__prediction = DTo.predict(X_test)

  # Model evaluation:
  DTo_R2_train = r2_score(y_train, DTo_train__prediction)
  DTo_R2_test = r2_score(y_test, DTo_test__prediction)

  depth_records.append({'Train': DTo_R2_train, 'Test': DTo_R2_test, 'max_depth': depth})

# Build the score table once, indexed by depth, so the columns are numeric
# (filling an empty frame cell by cell left them as object dtype).
DTscores = pd.DataFrame(depth_records, index=depths)

# Display the table (as the last expression of the cell, not inside the loop).
DTscores

In [None]:
# Score table sorted by test R2 from highest to lowest (best depth on top):

DTscores.sort_values(by ='Test', ascending = False)

In [None]:
import matplotlib.patches as mpatches

# Plot to visualize train / test R2 against max depth.
# The legend is generated from the plotted lines themselves, so its colours always match
# (the hand-made 'blue'/'orange' patches did not match matplotlib's default line colours).
fig, axes = plt.subplots(nrows=1, ncols =1, figsize=(8,8));

axes.plot(depths, DTscores['Train'], label = 'Training Scores')
axes.plot(depths, DTscores['Test'], label = 'Test Scores')

# Reference lines read off the sorted score table by the author:
# depth 19 (used for the second tree) and a test R2 of about 0.97.
axes.axvline(x=19, color = 'red');
axes.axhline(y=.97, color = 'purple', alpha = .5, ls = 'dashed');

axes.set_title('Max Depth vs R2 Score\n')
axes.set_xlabel('\nMax Depth');
axes.set_ylabel('R2 Score\n');

axes.legend(loc ='lower center');

In [None]:
# Instantiate a Decision Tree with max_depth fixed at 19 (not the default hyperparameters):

DT2 = DecisionTreeRegressor(max_depth = 19, random_state= 42)

# Model fit:

DT2.fit(X_train, y_train)

# Prediction arrays for the training and test splits:

DT2_trainpred = DT2.predict(X_train)
DT2_testpred = DT2.predict(X_test)


# Evaluation of the depth-limited Decision Tree (train first, then test):

DT2_train_score = model_metrics( y_train, DT2_trainpred)
DT2_test_score = model_metrics(y_test, DT2_testpred)

In [None]:
# Plotting the final Decision Tree, DT2, on a large canvas:

plt.figure(figsize = (30, 20))
plot_tree(DT2);

#### **Decision Tree 2 Analysis:**

The final model improved significantly. The Mean Absolute Error decreased from 319,372.25 dollars to 232,283.00 dollars and the R2 value increased from 0.89 to 0.97. The two models are also much closer in fit, suggesting that we were able to eliminate some of the variance from the previous iteration.



## **RANDOM FOREST MODEL**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Instantiate Random Forest Regressor with default hyperparameters.
# Seeded like the decision trees, so the scores are reproducible across runs:
RF = RandomForestRegressor(random_state = 42)

# Fit the model on training data:
RF.fit(X_train, y_train)

# Predictions for the training and test splits:
RF_train_pred = RF.predict(X_train)
RF_test_pred = RF.predict(X_test)

# Model Evaluation (train first, then test):
RF_train_score = model_metrics(y_train, RF_train_pred)
RF_test_score = model_metrics(y_test, RF_test_pred)

In [None]:
# Tunable hyperparameters of the random forest and their current values:
RF.get_params()

#### **Random Forest Analysis:**

Similar to the initial Decision Tree model the default Random Forest model is a bit overfit. I will use a GridSearch parameter grid to find the best max_depth.  


## **RANDOM FOREST MODEL (GridSearch)**

In [None]:
from sklearn.model_selection import GridSearchCV

# Finding the optimal max depth using GridSearch Cross Validation.
# range(1, 30, 10) yields the candidate depths 1, 11 and 21.

param_grid = {'max_depth': list(range(1,30, 10))}

# Instantiate GridSearch model around the RF estimator defined earlier:

RF_grid_search = GridSearchCV(RF, param_grid)

# Fit model (cross-validated on the training split only):
RF_grid_search.fit(X_train, y_train)

# Find best hyperparameter combination:
RF_grid_search.best_params_

In [None]:
# Retrieve the best version of the model.
# GridSearchCV refits the winning estimator on the entire training set by default
# (refit=True), so best_estimator_ is already trained; fitting it again was redundant and,
# without a fixed seed, replaced it with a different forest.
best_model = RF_grid_search.best_estimator_

# Evaluate model on the test set (R2):
best_model.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Instantiate a Random Forest Regressor with the hyperparameters selected by the grid search.
# (n_estimators=100 alone is the library default and merely repeated the first forest.)
# Seeded so the scores are reproducible.
RFb = RandomForestRegressor(n_estimators=100, random_state=42,
                            **RF_grid_search.best_params_)

# Fit the model on training data:
RFb.fit(X_train, y_train)

# Predictions for the training and test splits:
RFb_train_pred = RFb.predict(X_train)
RFb_test_pred = RFb.predict(X_test)

# Model Evaluation (train first, then test):
RFb_train_score = model_metrics(y_train, RFb_train_pred)
RFb_test_score = model_metrics(y_test, RFb_test_pred)

## **XGBOOST MODEL (300 ESTIMATORS)**

In [None]:
from xgboost import XGBRegressor

# Instantiate XGBoost model with 300 estimators or "weak learner" models;
# max_depth = 1 makes every tree a single-split stump:

XGB = XGBRegressor(n_estimators= 300, max_depth = 1)

# Fit the model onto training data:
XGB.fit(X_train, y_train)

# Test predictions (only the test split is evaluated for this model):
XGB_test_pred = XGB.predict(X_test) 
# Model Evaluation:
XGB_test_score = model_metrics(y_test, XGB_test_pred)

## **LINEAR REGRESSION MODEL (DEFAULT)**

In [None]:
# Instantiate Linear Regression Model with default hyperparameters:

LR = LinearRegression()

# Fit the model on the preprocessed training data (no pipeline is involved here):

LR.fit(X_train, y_train)

# Prediction arrays for the training and test splits:

LR_train_pred = LR.predict(X_train)
LR_test_pred = LR.predict(X_test)

# Evaluation of the Linear Regression model (train first, then test):

LR_train_score = model_metrics(y_train, LR_train_pred)
LR_test_score = model_metrics(y_test, LR_test_pred)

## **NEURAL NETWORK MODELING**

In [None]:
# Import Libraries:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras import metrics
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Learning history plotting function:

def plot_history(history):
  """Plot each training metric of a Keras History, one figure per metric.

  Parameters:
    history: object whose `.history` attribute maps metric names to per-epoch values
      (as returned by `model.fit`).

  For every metric whose name does not contain 'val', the training curve is drawn and,
  when a matching 'val_<metric>' entry exists, the validation curve is drawn on the same
  figure. Every figure gets an x-label, title and legend and is shown - previously that
  only happened when a validation twin existed, so other metrics were left unlabelled
  and leaked into the next figure.
  """

  recorded = history.history  # local name chosen so it does not shadow the imported `metrics`

  for metric in recorded:
    if 'val' in metric:
      continue  # validation curves are drawn together with their training metric

    plt.plot(recorded[metric], label = f'{metric}')

    val_key = f'val_{metric}'
    if val_key in recorded:
      plt.plot(recorded[val_key], label = val_key)

    plt.xlabel('\nEpochs')
    plt.title(f'{metric}')
    plt.legend()
    plt.show()

In [None]:
# Number of columns (features) in the preprocessed training matrix; used as the network's input width:

inputshape = X_train.shape[1]
inputshape

In [None]:
# Model instantiation: fully connected regression network.
sm = Sequential()

# First hidden layer (also declares the input width):
sm.add(Dense(1684, activation = 'relu', input_dim = inputshape))

# Second to seventh hidden layers, roughly halving in width each time:
for units in (842, 420, 210, 100, 50, 25):
  sm.add(Dense(units, activation = 'relu'))

# Output layer: a single linear unit for the continuous target.
sm.add(Dense(1, activation = 'linear'))

# Network Summary:
sm.summary()

# Compile model. `metrics` is passed as a list: newer Keras releases reject a bare string.
sm.compile(optimizer= 'adam', loss = 'mse', metrics= ['mae'])

In [None]:
# Model fit: 100 epochs, silent.
# NOTE(review): the test split doubles as validation data here, so the validation curves
# are not an independent check of the final test score.

history = sm.fit(X_train, y_train, 
                 validation_data = (X_test, y_test),
                 epochs = 100,
                 verbose = 0)

In [None]:
# Plot history for the first network (sm):

plot_history(history)

In [None]:
# Evaluation of the first network (sm) on the test split:

nnpredict = sm.predict(X_test)

nn_eval = model_metrics(y_test, nnpredict)

In [None]:
from tensorflow.keras.layers import Dropout


# Model instantiation: second, deeper fully connected regression network.
sm2 = Sequential()

# First hidden layer (also declares the input width):
sm2.add(Dense(1684, activation = 'relu', input_dim = inputshape))

# Second to ninth hidden layers, roughly halving in width each time:
for units in (900, 450, 220, 110, 55, 25, 12, 6):
  sm2.add(Dense(units, activation = 'relu'))

# Output layer: a single linear unit for the continuous target.
sm2.add(Dense(1, activation = 'linear'))

# Network Summary:
sm2.summary()

# Compile model. `metrics` is passed as a list: newer Keras releases reject a bare string.
sm2.compile(optimizer= 'adam', loss = 'mse', metrics= ['mae'])

In [None]:
# Model 2 fit: 250 epochs, silent.
# NOTE(review): the test split doubles as validation data here, so the validation curves
# are not an independent check of the final test score.

history2 = sm2.fit(X_train, y_train, 
                 validation_data = (X_test, y_test),
                 epochs = 250,
                 verbose = 0)

In [None]:
# Plot the learning history of the second network (sm2):

plot_history(history2)

In [None]:
# Model 2 evaluation: predictions of sm2 on the test split, scored with model_metrics.

nn2predict = sm2.predict(X_test)

nn2_eval = model_metrics(y_test, nn2predict)

In [None]:
# Model 2 predictions on the test split.
# These come from sm2 (the second network); the previous version called sm, i.e. the
# FIRST network, while labelling the result as model 2.
nn2_test_predicitions = sm2.predict(X_test)

# Rounded to whole dollars. The target is a continuous dollar amount, so the earlier
# cast to bool (which turned every non-zero prediction into True) is dropped.
nn2_test_predictions = nn2_test_predicitions.round()

In [None]:
# Data Frame of Sequential-model test predictions:

nn2_pred_df = pd.DataFrame(nn2_test_predicitions, columns= ['Estimated Gross Income'])

# Overall scores of the second network, repeated on every row:
nn2_pred_df['Test Scores'] = r2_score(y_test, nn2predict)

nn2_pred_df['MAE'] = mean_absolute_error(y_test, nn2predict)

# 1-based row counter derived from the frame's length, so it keeps working if the size
# of the test split changes (it was hard-coded to 267 rows).
nn2_pred_df['Index'] = range(1, len(nn2_pred_df) + 1)

# NOTE(review): this shows the FIRST five rows, sorted among themselves - not the five
# largest predictions. Swap to .sort_values(...).head() if the latter was intended.
nn2_pred_df.head().sort_values(by= 'Estimated Gross Income', ascending = False)

In [None]:
# (rows, columns) of the prediction table:
nn2_pred_df.shape