In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import KFold, train_test_split

%matplotlib inline

In [3]:
# Download the training split and the three test splits; in every file the
# first CSV column is used as the row index.
_split_urls = [
    'https://grantmlong.com/data/SE_rents2018_train.csv',
    'https://grantmlong.com/data/SE_rents2018_test1.csv',
    'https://grantmlong.com/data/SE_rents2018_test2.csv',
    'https://grantmlong.com/data/SE_rents2018_test3.csv',
]
train_df, test1_df, test2_df, test3_df = [
    pd.read_csv(url, index_col=0) for url in _split_urls
]

#### Training Data

In [7]:
# Columns pulled from the raw frames as model inputs. The order is kept
# fixed because the fitted models receive the columns positionally.
feature_cols = [
    'bedrooms',
    'year_built',
    'bathrooms',
    'min_to_subway',
    'size_sqft',
    'no_fee',
    'addr_zip',
    'floornumber',
    'floor_count',
    'has_doorman',
    'has_fireplace',
    'has_gym',
    'allows_pets',
    'has_washer_dryer',
    'has_garage',
    'has_roofdeck',
    'has_concierge',
    'has_pool',
    'has_garden',
]

# Response vector for the regressors.
train_target = train_df['rent']

# Predictor matrix: select the feature columns, then replace each missing
# entry with the median of its own column.
train_features = train_df[feature_cols].pipe(
    lambda frame: frame.fillna(frame.median(), axis=0)
)

#### Handle Test 1 Missing Values

In [8]:
def random_imputation(df, feature):
    """Impute missing values of ``feature`` by sampling its observed values.

    The imputed series is written to a companion column named
    ``feature + '_imp'``; the original ``feature`` column is not changed.
    If the companion column does not exist yet it is first created as a copy
    of ``feature``, so observed rows keep their real values instead of being
    left as NaN. A companion column that the caller already created is kept
    and only its missing-row entries are overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature``. It is modified in place.
    feature : str
        Name of the column whose missing entries should be imputed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``feature + '_imp'`` column filled.

    Raises
    ------
    ValueError
        If ``feature`` has missing entries but no observed values to draw from.
    """
    imputed_col = feature + '_imp'
    missing_mask = df[feature].isnull()
    number_missing = int(missing_mask.sum())

    # Without this, assigning through .loc on the missing rows alone would
    # create the column with NaN on every observed row.
    if imputed_col not in df.columns:
        df[imputed_col] = df[feature]

    if number_missing == 0:
        return df

    observed_values = df.loc[~missing_mask, feature]
    if observed_values.empty:
        raise ValueError(
            "cannot impute '{}': the column has no observed values".format(feature)
        )

    # Sample with replacement so the draw works even when there are more
    # missing rows than observed ones.
    df.loc[missing_mask, imputed_col] = np.random.choice(
        observed_values, number_missing, replace=True
    )
    return df

#missing_columns = ["year_built", "min_to_subway", "floornumber"]
#for feature in missing_columns:
#    test1_features[feature + '_imp'] = [feature]
#    test1_features = random_imputation(test1_features, feature)

# Test 1 predictor matrix. Gaps are filled with the medians of the training
# features, not with statistics computed from the test split itself.
test1_features = test1_df[feature_cols].fillna(train_features.median(), axis=0)

#  <font color=green> Test 1 </font>

#### Linear Regression Model

In [9]:
# Baseline model: fit a linear regression on the training features.
lreg = LinearRegression().fit(train_features, train_target)

# Store the Test 1 predictions next to the true rents, then report the
# mean squared error as the cell's output.
test1_df['predicted'] = lreg.predict(test1_features)
mean_squared_error(y_true=test1_df['rent'], y_pred=test1_df['predicted'])
#test_df['predicted'].to_csv('linear.csv', header=True)
#3301338.418329301

3297096.0901556876

#### Random Forest Regression Model

In [15]:
# Random forest with 50 trees, depth capped at 42 and a fixed seed so the
# fit is repeatable.
regressor = RandomForestRegressor(
    n_estimators=50,
    max_depth=42,
    random_state=1,
).fit(train_features, train_target)

# Overwrite the stored Test 1 predictions with the forest's output and
# report the mean squared error as the cell's output.
test1_df['predicted'] = regressor.predict(test1_features)
mean_squared_error(y_true=test1_df['rent'], y_pred=test1_df['predicted'])
#test_df['predicted'].to_csv('linear.csv', header=True)
#3301338.418329301

1746570.3028764988

#### Combine Data, Predict Values for Test 2

In [18]:
# Stack the training split and Test 1 into one frame so the forest can be
# refit on both. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0; sort=False keeps the column order.
master_df = pd.concat([train_df, test1_df], sort=False)

# Impute with the medians of the combined frame, then refit the existing
# regressor on the pooled data.
master_features = master_df[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
master_target = master_df['rent']

regressor.fit(master_features, master_target)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=42,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

#  <font color=green> Test 2 </font>


#### Handle Test 2 Missing Values

In [19]:
# Build the Test 2 predictor matrix from test2_df itself. Selecting from the
# Test 1 frame here would make the "Test 2" predictions describe Test 1 rows.
# Missing values are filled with the training-feature medians.
test2_features = test2_df[feature_cols]
test2_features = test2_features.fillna(train_features.median(), axis=0)

#### Random Forest Regression Model

In [20]:
# Score the Test 2 feature matrix with the fitted forest and store the
# result as a new column on the Test 2 frame.
test2_df['predicted'] = regressor.predict(X=test2_features)

In [21]:
# Write the prediction column, together with the frame's index, to disk.
test2_df[['predicted']].to_csv('test2_results.csv', header=True)