### 1. Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

### 2. Read Data

In [2]:
# Force a handful of columns to be parsed as strings in both files.
# NOTE(review): the integer keys presumably refer to column positions whose
# contents are of mixed type -- confirm against the CSV headers.
STR_COLUMN_DTYPES = {13: str, 39: str, 40: str, 41: str}

train_df = pd.read_csv('data/train.csv', dtype=STR_COLUMN_DTYPES)
test_df = pd.read_csv('data/test.csv', dtype=STR_COLUMN_DTYPES)

# The true sale prices for the test rows live in a separate solution file.
test_actual_prices = pd.read_csv('data/test_soln.csv')['SalePrice']

### 3. Get Average Sale Price By Model

For example, 32 pieces of equipment with a model ID of 28 were sold. Their sale prices sum to nearly \$500,000, or about \$15,600 each.

In [3]:
# Tag every sale with a unit count so that one grouped sum yields both the
# total sale price and the number of sales for each ModelID.
train_df['Counts'] = 1
per_model_totals = train_df.groupby('ModelID')[['SalePrice', 'Counts']].sum()

# Average price per model = total sale price / number of sales, to the cent.
avg_prices_df = per_model_totals.assign(
    AvgPrice=(per_model_totals['SalePrice'] / per_model_totals['Counts']).round(2)
)
avg_prices_df.head()

Unnamed: 0_level_0,SalePrice,Counts,AvgPrice
ModelID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
28,498250,32,15570.31
29,152500,9,16944.44
31,137500,8,17187.5
34,103500,6,17250.0
43,13830400,697,19842.75


### 4. Calculate The Average Sale Price

We need the overall average sale price as a fallback prediction for any model ID that does not appear in the training data.

In [4]:
# Overall average: total of all training sale prices divided by the number of
# training rows. Used as the fallback price for models not seen in training.
avg_sale_price = train_df['SalePrice'].sum() / train_df.shape[0]
print("Average sale price: ${:.2f}".format(avg_sale_price))

$31099.71


### 5. Create A Function That Predicts Sale Price By Simple Lookup

If the model ID appears in the `avg_prices_df` dataframe, return the average price for that model. Otherwise, return the overall average sale price.

In [5]:
def predict_price(id, prices=None, default=None):
    """Predict a sale price by looking up the average price for a model id.

    Parameters
    ----------
    id
        Model id to look up in the index of the price table. (The name
        shadows the builtin ``id`` but is kept so existing callers work.)
    prices : pandas.DataFrame, optional
        Table indexed by model id with an ``'AvgPrice'`` column. Defaults to
        the notebook-level ``avg_prices_df``.
    default : float, optional
        Price returned when ``id`` is not found. Defaults to the
        notebook-level ``avg_sale_price``.

    Returns
    -------
    The ``'AvgPrice'`` entry for ``id`` if present, otherwise the fallback.

    Raises
    ------
    KeyError
        If the price table has no ``'AvgPrice'`` column.
    """
    table = avg_prices_df if prices is None else prices

    # Select the column outside the try block: a malformed or missing table is
    # a setup error and must surface instead of being mistaken for an unseen
    # model (the previous bare ``except`` hid such errors).
    avg_by_model = table['AvgPrice']

    try:
        return avg_by_model.loc[id]
    except (KeyError, TypeError):
        # Only a failed lookup (unseen or unmatchable id) triggers the fallback.
        return avg_sale_price if default is None else default

### 6. Use The Function To Predict Prices in the Test Data

In [6]:
# A log-scale error is more appropriate here because of the wide
# variation in prices ($4750 to 142000).

def score(predictions, actual):
    """Return the root mean squared difference of log(x + 1) values.

    Computes ``sqrt(mean((log(predictions + 1) - log(actual + 1)) ** 2))``.
    Both arguments must support element-wise arithmetic with each other
    (the notebook passes pandas Series and numpy arrays).
    """
    log_predicted = np.log(predictions + 1)
    log_actual = np.log(actual + 1)
    squared_log_error = (log_predicted - log_actual) ** 2
    return np.sqrt(np.mean(squared_log_error))

In [7]:
# Predict a price for every test row by looking up its ModelID with
# predict_price, then score those predictions against the actual prices.
score_gtap = score(
    test_df['ModelID'].map(predict_price),
    test_actual_prices,
)
print("The guess-the-average-price model has a RMLSE of {:.4}.".format(score_gtap))

The guess-the-average-price model has a RMLSE of 0.3209.


### 7. Add The "Average Price By Model ID" As A Feature

In [8]:
# Attach the looked-up average price for each row's ModelID as a new feature
# column on both the training and the test frame.
# NOTE(review): for training rows the per-model average was computed from
# train_df itself, so each row's own SalePrice contributes to its feature.
train_df["avg_price_for_model"] = train_df["ModelID"].map(predict_price)
test_df["avg_price_for_model"] = test_df["ModelID"].map(predict_price)

### 8. Fit a Linear Model

This model uses only the average price per model and a constant term as features. Unsurprisingly, it relies entirely on the average price per model.

In [12]:
# Add a constant column to both frames so it can be used as a feature.
# NOTE(review): LinearRegression fits its own intercept by default, so this
# column looks redundant with it -- confirm whether it is still wanted.
train_df['const'] = 1
test_df['const'] = 1

# Design matrices: the per-model average price plus the constant term.
feature_columns = ["avg_price_for_model", "const"]
X_train = train_df[feature_columns]
X_test = test_df[feature_columns]
y_train = train_df.SalePrice

# Fit on the training data, then predict sale prices for the test rows.
linear_model = LinearRegression().fit(X_train, y_train)
y_predict = linear_model.predict(X_test)

score_famtapaf = score(y_predict, test_actual_prices)
print("The fit-a-model-to-average-price-as-feature model has a RMLSE of {:.4}.".format(score_famtapaf))

# Trailing semicolon suppresses the notebook's display of the coefficients.
linear_model.coef_;

The fit-a-model-to-average-price-as-feature model has a RMLSE of 0.3209.
