<a href="https://colab.research.google.com/github/sayarghoshroy/place2crash/blob/main/regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Importing required ML model related functionalities

from sklearn import metrics
from sklearn import neighbors
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, VotingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import KFold

In [3]:
# Directory prefix that is concatenated with each JSON file name opened in this
# notebook; it is a relative path, so it resolves against the working directory.
# NOTE(review): looks like a Google Drive folder mounted in Colab, but no mount
# step is visible in this notebook; confirm before running on a fresh kernel.
path = 'drive/My Drive/place2crash_data/'

# Building raw datasets

def remove_nans(data):
  """Drop every row that contains a NaN cell.

  Args:
    data: list of rows, each row an iterable of cells. Cells whose type is
      exactly `str` are never tested; every other cell is passed to `np.isnan`.

  Returns:
    (count, data): the number of rows removed, and the very same list object
    that was passed in, now holding only the NaN-free rows in original order.
  """
  def has_nan(row):
    # `any` stops at the first NaN, like the original early `break`
    return any(type(item) is not str and np.isnan(item) for item in row)

  kept = [row for row in data if not has_nan(row)]
  count = len(data) - len(kept)

  # Rebuild the list once instead of `del`-ing rows one at a time: each `del`
  # shifts the whole tail, which made the original scan quadratic.
  # Slice assignment keeps the in-place update that callers may rely on.
  data[:] = kept

  return count, data

# The files are only read, so open them read-only: 'r+' also demands write
# permission and fails on read-only storage for no benefit.
with open(path + 'train_data.json', 'r') as f:
  train_data = json.load(f)

with open(path + 'test_data.json', 'r') as f:
  test_data = json.load(f)

# Filter NaN-containing rows out of each split and keep the removal counts
train_nans, train_data = remove_nans(train_data)
test_nans, test_data = remove_nans(test_data)

# Train rows followed by test rows
raw_data = train_data + test_data

train = np.asarray(train_data)
test = np.asarray(test_data)
raw = np.asarray(raw_data)

print('Number of rows with NaNs in raw train: ' + str(train_nans))
print('Number of rows with NaNs in raw test: ' + str(test_nans))

Number of rows with NaNs in raw train: 9075
Number of rows with NaNs in raw test: 983


In [4]:
# Maps feature name to column ID
# Opened read-only: the file is never written, and 'r+' would needlessly
# require write permission.
with open(path + 'head_to_col_id.json', 'r') as f:
  mapping = json.load(f)

In [5]:
# Selects the categorical encoding: truthy -> one-hot, falsy -> integer labels
one_hot_flag = True

def get_categorizer(feature):
  """Fit and return an encoder for one categorical feature column.

  Args:
    feature: array of category values for a single feature. It is reshaped to
      one column for the one-hot encoder, or converted to a list for the
      label encoder.

  Returns:
    A fitted `OneHotEncoder` producing dense output when `one_hot_flag` is
    truthy, otherwise a fitted `LabelEncoder`.
  """
  if one_hot_flag:
    try:
      # scikit-learn 1.2 renamed the dense-output switch to `sparse_output`;
      # the old `sparse` keyword was removed in 1.4.
      le = preprocessing.OneHotEncoder(sparse_output = False)
    except TypeError:
      # Releases before 1.2 only accept the original keyword
      le = preprocessing.OneHotEncoder(sparse = False)
    le.fit(feature.reshape(-1, 1))

  else:
    le = preprocessing.LabelEncoder()
    le.fit(np.ndarray.tolist(feature))

  return le

def categorize(model, data):
  """Encode one categorical column with an already-fitted encoder.

  Args:
    model: fitted encoder exposing `transform`.
    data: array of category values for a single feature.

  Returns:
    When `one_hot_flag` is truthy, the encoder output for the column cast to
    float32; otherwise the encoder output reshaped to a single column.
  """
  if not one_hot_flag:
    # Label path: the encoder is fed a plain list of values
    labels = model.transform(np.ndarray.tolist(data))
    return labels.reshape(-1, 1)

  # One-hot path: the encoder is fed the values as a single column
  column = data.reshape(-1, 1)
  encoded = model.transform(column)
  return encoded.astype(np.float32)

def get_scaler(feature):
  """Fit a `StandardScaler` on one feature and return the fitted scaler.

  Args:
    feature: array of values for a single feature; it is reshaped to one
      column before fitting.
  """
  column = feature.reshape(-1, 1)
  scaler = preprocessing.StandardScaler()
  scaler.fit(column)
  return scaler

def scale(model, data):
  """Transform one feature with a fitted scaler, returning a single column.

  Args:
    model: object exposing `transform`, applied to the column-shaped data.
    data: array of values for a single feature.

  Returns:
    The transformed values reshaped to one column.
  """
  column = data.reshape(-1, 1)
  transformed = model.transform(column)
  return transformed.reshape(-1, 1)

# Fitted encoders / scalers, index-aligned with the feature lists in make_data
categorizers = []
scalers = []

def make_data():
  """Fit the preprocessors and build per-feature column blocks for both splits.

  Categorical encoders are fitted on `raw` (train and test rows together);
  scalers are fitted on `train` only. Both fitted lists are stored in the
  module-level `categorizers` and `scalers`.

  Returns:
    (train_cols, test_cols): two lists of 2-D arrays, one entry per feature,
    categorical features first and scaled features after them. 'price' is the
    last scaled feature, so it ends up as the final block.
  """
  categorizable = ['neighbourhood_group', 'neighbourhood', 'room_type']
  scalable = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'availability_365', 'price']

  # Saving categorizers and scalers
  # Replace the list contents instead of appending: appending made a second
  # call keep the first call's fitted objects at indices 0..k, so the
  # transforms below silently reused stale preprocessors.
  categorizers[:] = [get_categorizer(raw[:, mapping[item]]) for item in categorizable]
  scalers[:] = [get_scaler(train[:, mapping[item]]) for item in scalable]

  # Building features
  train_cols = []
  test_cols = []

  for encoder, item in zip(categorizers, categorizable):
    train_cols.append(categorize(encoder, train[:, mapping[item]]))
    test_cols.append(categorize(encoder, test[:, mapping[item]]))

  for scaler, item in zip(scalers, scalable):
    train_cols.append(scale(scaler, train[:, mapping[item]]))
    test_cols.append(scale(scaler, test[:, mapping[item]]))

  return train_cols, test_cols

In [6]:
# Defining data matrices
train_cols, test_cols = make_data()

# Place the per-feature blocks side by side: one wide matrix per split
train_mat, test_mat = np.hstack(train_cols), np.hstack(test_cols)

# The final column is the regression target; all preceding columns are inputs
X_train, Y_train = train_mat[:, :-1], train_mat[:, -1]
X_test, Y_test = test_mat[:, :-1], test_mat[:, -1]

In [7]:
possible_models = ['mlp', 'knn', 'lin', 'sgdreg', 'ridge', 'lasso', 'dtree', 'extree', 'rf', 'adaboost', 'bag', 'extrees', 'gradboost', 'histgradboost']

def get_regression_model(select = 'knn', train_src = np.zeros(1)):
  """Construct an unfitted regressor for a model code.

  Args:
    select: model code; the accepted values are those in `possible_models`.
    train_src: training feature array. Only `shape[0]` is read, and only for
      the 'mlp' code, where it caps the mini-batch size.

  Returns:
    A new, unfitted estimator. Every estimator except 'knn' and 'lin' is
    built with `random_state = 2022`.

  Raises:
    ValueError: if `select` is not a known model code. Previously this fell
      through every branch and failed on an unbound local variable.
  """
  seed = 2022

  def build_mlp():
    # Batch size is capped by the samples left after the validation holdout
    val_frac = 0.2
    N_samples = train_src.shape[0]
    val_size = int(N_samples * val_frac)
    train_size = N_samples - val_size

    return MLPRegressor(random_state = seed,
                        max_iter = int(4 * 1e2),
                        learning_rate_init = 1e-2,
                        hidden_layer_sizes = (256, 512, 1024, 2048, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2),
                        batch_size = min(train_size, 8192),
                        activation = 'relu',
                        early_stopping = True,
                        validation_fraction = val_frac,
                        solver = 'adam',
                        verbose = False)

  # Each entry is a zero-argument factory, so only the requested estimator
  # is ever constructed.
  builders = {
    'mlp': build_mlp,
    'knn': lambda: neighbors.KNeighborsRegressor(n_neighbors = 60, weights = 'distance'),
    'lin': lambda: linear_model.LinearRegression(),
    'sgdreg': lambda: linear_model.SGDRegressor(random_state = seed),
    'ridge': lambda: linear_model.Ridge(random_state = seed),
    'lasso': lambda: linear_model.Lasso(random_state = seed),
    'dtree': lambda: DecisionTreeRegressor(random_state = seed),
    'extree': lambda: ExtraTreeRegressor(random_state = seed),
    'rf': lambda: RandomForestRegressor(random_state = seed),
    'adaboost': lambda: AdaBoostRegressor(random_state = seed),
    'bag': lambda: BaggingRegressor(random_state = seed),
    'extrees': lambda: ExtraTreesRegressor(random_state = seed),
    'gradboost': lambda: GradientBoostingRegressor(random_state = seed),
    'histgradboost': lambda: HistGradientBoostingRegressor(random_state = seed),
  }

  if select not in builders:
    raise ValueError('Unknown model code ' + repr(select) + '; expected one of: ' + ', '.join(builders))

  return builders[select]()

In [8]:
# Training
def train_model(model, train_src, train_tgt):
  """Fit `model` on the given features and targets.

  Args:
    model: estimator exposing `fit(features, targets)`.
    train_src: training features.
    train_tgt: training targets.

  Returns:
    The same `model` object that was passed in; whatever `fit` itself
    returns is ignored.
  """
  fitted = model
  fitted.fit(train_src, train_tgt)
  return fitted

# Viewing Results
def results(model, src, tgt, display = False):
  """Score a fitted model on one split.

  Args:
    model: fitted estimator exposing `predict` (and `score`, which is only
      called when `display` is true).
    src: feature matrix to predict on.
    tgt: true targets for `src`.
    display: when true, print the model, its RMSE and `model.score(src, tgt)`.

  Returns:
    The root-mean-squared error of the predictions against `tgt`.
  """
  predictions = model.predict(src)

  # Take the root of the plain MSE rather than passing `squared = False`:
  # that keyword is deprecated since scikit-learn 1.4 and removed in 1.6.
  # For single-output targets, as sliced out with `[:, -1]` in this notebook,
  # both forms give the same value.
  rmse = np.sqrt(metrics.mean_squared_error(tgt, predictions))

  if display:
    print('Model Details: ', model)
    print('RMSE: ', rmse)
    print('R2:', model.score(src, tgt))
    print()

  return rmse

# Combined
def experiment(select = 'knn', folds = 5):
  """Cross-validate one model code on the training split, then score it on test.

  Args:
    select: model code forwarded to `get_regression_model`.
    folds: number of shuffled K-fold splits (seeded with 2022).

  Prints the mean and sample standard deviation (ddof = 1) of the per-fold
  validation RMSEs, then refits a fresh model on all of `X_train`/`Y_train`
  and prints its results on `X_test`/`Y_test`. Returns nothing.
  """
  splitter = KFold(n_splits = folds, random_state = 2022, shuffle = True)

  fold_rmses = []
  for fit_rows, held_rows in splitter.split(Y_train):
    fold_src, fold_tgt = X_train[fit_rows], Y_train[fit_rows]

    # A fresh estimator per fold, fitted on that fold's training rows only
    fold_model = train_model(get_regression_model(select, fold_src), fold_src, fold_tgt)
    fold_rmses.append(results(fold_model, X_train[held_rows], Y_train[held_rows]))

  fold_label = str(folds) + '-fold cross validation: '
  print('Model Code:', select)
  print('Mean RMSE from ' + fold_label + str(np.mean(fold_rmses)))
  print('Standard Deviation of RMSEs from ' + fold_label + str(np.std(fold_rmses, ddof = 1)))
  print('Results on held-out test set: ')

  # Final model: refit on the complete training split, report on the test split
  final_model = train_model(get_regression_model(select, X_train), X_train, Y_train)
  results(final_model, X_test, Y_test, display = True)

In [9]:
# Model codes to evaluate in this run
run_for_codes = ['knn', 'lin', 'sgdreg', 'ridge', 'lasso', 'dtree', 'rf', 'bag', 'extrees', 'histgradboost']

# Test the flag by truthiness, exactly as get_categorizer and categorize do,
# so the banner cannot disagree with the encoding actually applied
# (`== True` reported "Label" for truthy non-boolean values).
if one_hot_flag:
  print('Using One-Hot encoding for categorical features.')
else:
  print('Using Label encoding for categorical features.')
print()

for code in run_for_codes:
  experiment(select = code)

Using One-Hot encoding for categorical features.

Model Code: knn
Mean RMSE from 5-fold cross validation: 0.9083392832528187
Standard Deviation of RMSEs from 5-fold cross validation: 0.14338917645424418
Results on held-out test set: 
Model Details:  KNeighborsRegressor(n_neighbors=60, weights='distance')
RMSE:  0.6778848471023335
R2: 0.26929851311157926

Model Code: lin
Mean RMSE from 5-fold cross validation: 3908759992.213685
Standard Deviation of RMSEs from 5-fold cross validation: 3938992190.0232973
Results on held-out test set: 
Model Details:  LinearRegression()
RMSE:  0.7091506407124613
R2: 0.20034043193805762

Model Code: sgdreg
Mean RMSE from 5-fold cross validation: 0.9269155457547138
Standard Deviation of RMSEs from 5-fold cross validation: 0.1464003095362948
Results on held-out test set: 
Model Details:  SGDRegressor(random_state=2022)
RMSE:  0.7113010786671684
R2: 0.19548328175315133

Model Code: ridge
Mean RMSE from 5-fold cross validation: 0.9256795342872772
Standard Devi

In [10]:
# Done