In [1]:
import pandas as pd
from random import random
import math

In [2]:
# Read in data and remove weird Unnamed column
df = pd.read_csv("wrangledNYCdata.csv")
df.drop("Unnamed: 0", axis=1, inplace=True)
df.info()

FileNotFoundError: [Errno 2] No such file or directory: 'wrangledNYCdata.csv'

In [None]:
# Test/Train Split
length = len(df.index)
splitHere = int(length * 0.7)
train = df[0:splitHere]
test = df[splitHere:length]

In [None]:
# Training Algorithm:  
# Reminder --> Standard equation for 3 features: ax + by + cz + d = 0
def trainMRModel(trainDF, iterations):
  '''
  Pseudocode Algorithm:
  1. Naively choose starting weights for initial model. This is model A.
  2. Calculate the distance between data and model A.
  3. Randomly generate steps to take when generating model B from current model A.
    - if distance of model B is less than model A, replace model A with model B.
    - elif distance of model B is NOT less than model A after 50 tries of randomly stepping, keep model A.
  4. Repeat step 3 for desired number of iterations.
  '''
  
  # Initialize naive model A
  modelA = [0.5, 0.5, 0.5, 0.5] # [a, b, c, d]

  distanceA = calculateDistance(trainDF, modelA)
  print("First model - weights: ", modelA, " distance: ", distanceA)
  
  modelB = modelA[:] # make a copy of model A -> 'modelB = modelA' just references original model A
  
  # Randomly take steps for desired iterations
  for i in range(iterations):
    modelB = take_best_step(trainDF, modelB)
  
  # Calculate distance of best model found
  distanceB = calculateDistance(trainDF, modelB)
  print("Final model - weights: ", modelB, " distance: ", distanceB)

In [None]:
# Helper functions
def take_random_step(model):
  return [
      model[0] + (random() - 0.5), # Subtract off 0.5 to generate negative and positive steps
      model[1] + (random() - 0.5),
      model[2] + (random() - 0.5),
      model[3] + (random() - 0.5)
  ]

def take_best_step(trainDF, model):
  old_distance = calculateDistance(trainDF, model)
  for i in range(50): # try 50 times; if we fail to improve, return the old model
    new_model = take_random_step(model)
    new_distance = calculateDistance(trainDF, new_model)
    if new_distance < old_distance:
      return new_model
  return model # by default, return the old model

def calculateDistance(trainDF, model):
  sum = 0
  for i in range(len(trainDF.index[:10])):
    # Get data points
    bed = trainDF.iat[i, 1]
    bath = trainDF.iat[i, 2]
    sqft = trainDF.iat[i, 3]

    # Get coefficients
    a = model[0]
    b = model[1]
    c = model[2]
    d = model[3]

    # Calculate distance using formula: https://i.ytimg.com/vi/zWMTTRJ0l4w/maxresdefault.jpg
    numerator = abs((a * bed) + (b * bath) + (c * sqft) + d)
    denominator = math.sqrt((a * a) + (b * b) + (c * c ) + (d * d))
    distance = numerator / denominator
    sum = sum + distance
 
  return sum

In [None]:
trainMRModel(train, 1000)

First model - weights:  [0.5, 0.5, 0.5, 0.5]  distance:  8.77252747254
Final model - weights:  [-1.13446396842222, 2.2953306841560037, -0.2885874294868114, -0.19222099305053797]  distance:  0.42135591119927723
