In [13]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from sklearn.tree import DecisionTreeRegressor
import pickle
import boto3

In [14]:
##HELPER FUNCTIONS##

#read data file
def readCSV(path='housing_data.csv'):
    """Load the raw housing data from a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to 'housing_data.csv',
            resolved relative to the current working directory.

    Returns:
        pandas.DataFrame with one row per CSV record, exactly as parsed by
        ``pd.read_csv`` (no cleaning is applied here).
    """
    return pd.read_csv(path)

#clean data (code copied from helperclean file)
def cleanData(data):
    """Return a cleaned, fully numeric copy of the raw housing data.

    Steps, in order:
      1. Drop the identifier / address / date columns that are not used as
         model features.
      2. Treat +/-inf as missing and drop every row with any missing value.
      3. Convert Bathrooms, Bedrooms, Listing Price and Selling Price to
         integers (fractional values are truncated by the integer cast).
      4. Encode 'Property Type' as an integer code (House=1, Condo=2,
         Townhouse=3). Matching is by regex search, so a cell containing one
         of those words is replaced by its code.
      5. Drop rows whose Listing Price or Bathrooms is 0.

    The input frame is not modified; a new DataFrame is returned and the
    surviving rows keep their original index labels.

    Args:
        data: Raw DataFrame as returned by readCSV.

    Returns:
        pandas.DataFrame containing only the retained columns and rows.

    Raises:
        KeyError: if an expected column is missing from ``data``.
        ValueError: if a numeric column or 'Property Type' holds a value
            that cannot be converted to an integer.
    """
    unused_columns = [
        'Listing Number', 'Street Number', 'Street Number Modifier',
        'Street Direction', 'Street Name', 'Street Suffix',
        'Street Post Direction', 'City', 'State', 'Area', 'Selling Date',
        'Style Code', 'Sold to List Price Percentage',
    ]
    numeric_columns = ["Bathrooms", "Bedrooms", "Listing Price", "Selling Price"]
    property_type_codes = {'House': 1, 'Condo': 2, 'Townhouse': 3}

    # Every step returns a new frame, so the caller's DataFrame is untouched.
    # The trailing copy() makes the result independent before columns are
    # reassigned below.
    cleaned = (
        data.drop(columns=unused_columns)
        .replace([np.inf, -np.inf], np.nan)
        .dropna(how='any')
        .copy()
    )

    # Assigning a DataFrame to a list of columns is positional, so the same
    # column order must be used on both sides; a mismatched order would
    # silently swap Bathrooms and Bedrooms.
    cleaned[numeric_columns] = (
        cleaned[numeric_columns].apply(pd.to_numeric).astype(int)
    )

    # Cast to object first so integer codes can replace the text values
    # regardless of the column's string dtype.
    property_type = cleaned["Property Type"].astype(object)
    property_type = property_type.replace(property_type_codes, regex=True)
    cleaned["Property Type"] = property_type.astype(int)

    # A boolean mask replaces the DataFrame.append-based index bookkeeping
    # (DataFrame.append no longer exists in pandas 2.x).
    has_listing_price = cleaned['Listing Price'] != 0
    has_bathrooms = cleaned['Bathrooms'] != 0
    return cleaned.loc[has_listing_price & has_bathrooms]

#train a decision tree (DT) regression model on the housing data
def trainModel(data, max_depth=18, random_state=42):
    """Fit a decision-tree regressor that predicts 'Selling Price'.

    Every column of ``data`` other than 'Selling Price' is used as a feature.

    Args:
        data: Cleaned DataFrame containing a 'Selling Price' column plus the
            feature columns.
        max_depth: Maximum depth of the tree. Defaults to 18.
        random_state: Seed passed to the regressor so repeated runs on the
            same data build the same tree. Pass None for an unseeded fit.

    Returns:
        The fitted DecisionTreeRegressor.

    Raises:
        KeyError: if ``data`` has no 'Selling Price' column.
    """
    target = data['Selling Price']
    features = data.drop(columns=['Selling Price'])

    model = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    model.fit(features, target)

    return model

#saves the trained model to disk as a pickle file
def saveModel(dt, path='OfferAidmodeltest.pkl'):
    """Serialize a trained model to disk with pickle.

    Args:
        dt: The object to save (the fitted model); must be picklable.
        path: Destination file. Defaults to 'OfferAidmodeltest.pkl' in the
            current working directory, the file savetoS3 reads by default.
            An existing file at that path is overwritten.
    """
    with open(path, 'wb') as f:
        pickle.dump(dt, f)

#OPTIONAL: save model directly to an s3 bucket; pass the bucket you would like the file to be saved in as `bucket`
# (or replace the 'yourbuckethere' default below) and add aws access keys to the .env file.
# 'offeraidmodel' is the bucket name used in this project
def savetoS3(bucket='yourbuckethere', key='OfferAidmodel.pkl', path='OfferAidmodeltest.pkl'):
    """Upload the pickled model file to an S3 bucket.

    Loads variables from the .env file via ``load_dotenv()`` and reads the
    AWS credentials from the ACCESS_KEY and SECRET_KEY environment variables.

    Args:
        bucket: Name of the destination S3 bucket. The default
            'yourbuckethere' is a placeholder and must be replaced with a
            real bucket name.
        key: Object key the file is stored under in the bucket.
        path: Local pickle file to upload; defaults to the file written by
            saveModel.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    load_dotenv()

    s3client = boto3.client(
        's3',
        aws_access_key_id=os.getenv('ACCESS_KEY'),
        aws_secret_access_key=os.getenv('SECRET_KEY'),
    )

    with open(path, 'rb') as f:
        s3client.upload_fileobj(f, bucket, key)

#runs required functions for training model; pass upload_to_s3=True to also save the file directly to s3 (this requires adding access keys to .env variables)
def execute(upload_to_s3=False):
    """Run the training pipeline: read, clean, train, save, optionally upload.

    Args:
        upload_to_s3: When True, savetoS3() is called after the model has
            been saved locally. Defaults to False because the upload is
            optional and needs a real bucket name and AWS access keys.
    """
    raw = readCSV()
    cleaned = cleanData(raw)
    model = trainModel(cleaned)
    saveModel(model)
    if upload_to_s3:
        savetoS3()

In [15]:
#run the training pipeline defined by execute()
execute()