## This quick notebook shows some minor pitfalls you might encounter when generating Kaggle submissions for Project 2

In [1]:
# imports 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
# Load the labelled training data and the unlabelled Kaggle test data.
# `df` is what we model on; `kaggle_sub_data` is what we predict on for the submission.
train_path, test_path = 'datasets/train.csv', 'datasets/test.csv'
df = pd.read_csv(train_path)
kaggle_sub_data = pd.read_csv(test_path)

In [3]:
# Peek at the first five rows of the training data
df.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [4]:
# Normalise the training column names: spaces become underscores, everything lower case.
# regex=False keeps the replacement a plain literal substitution.
df.columns = df.columns.str.replace(' ', '_', regex=False).str.lower()

### Model 1

In [5]:
# Features picked purely because they need no EDA/cleaning; this model is not meant to be good.
features = [
    'lot_area',
    'fireplaces',
    '1st_flr_sf',
    'full_bath',
    'half_bath',
    'totrms_abvgrd',
]
X, y = df[features], df['saleprice']

# Fixed random_state so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
# Fit the baseline linear regression (fit returns the estimator itself)
lr_1 = LinearRegression().fit(X_train, y_train)

# R^2 on the training split and on the held-out split
r2_train_1 = lr_1.score(X_train, y_train)
r2_test_1 = lr_1.score(X_test, y_test)
r2_train_1, r2_test_1

(0.586908421365044, 0.6437361531661305)

In [7]:
# Generating predictions on the kaggle data
# The predictions need to be done on the same features that we modeled on.
# This attempt is expected to fail: the columns of kaggle_sub_data have not been
# renamed yet, so none of the names in `features` exist in it. The error is caught
# and printed so the pitfall is still shown without halting "Restart & Run All".
try:
    kaggle_preds = lr_1.predict(kaggle_sub_data[features])
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: "None of [Index(['lot_area', 'fireplaces', '1st_flr_sf', 'full_bath', 'half_bath',\n       'totrms_abvgrd'],\n      dtype='object')] are in the [columns]"

This fails because the column names in our training data are now different from the column names in test.csv. Any change we make to the features we model on must also be made to the data from test.csv.

Here we had simply changed the case of the names and replaced spaces with _ but even superficial changes like that matter.

In [8]:
# Apply the same renaming to the kaggle data that was applied to the training data:
# lower case, with spaces swapped for underscores.
kaggle_sub_data.columns = kaggle_sub_data.columns.map(
    lambda name: name.lower().replace(' ', '_')
)

In [9]:
# Select the modeled columns from the kaggle data, then predict on them
kaggle_X = kaggle_sub_data.loc[:, features]
kaggle_preds = lr_1.predict(kaggle_X)

In [10]:
# Attach the predictions from .predict as a `saleprice` column
kaggle_sub_data = kaggle_sub_data.assign(saleprice=kaggle_preds)

In [11]:
# Inspect just the two columns that go into the submission
kaggle_sub_data.loc[:, ['id', 'saleprice']]

Unnamed: 0,id,saleprice
0,2658,159449.713528
1,2718,251304.668573
2,2414,190992.300403
3,1989,110894.990037
4,625,229389.162867
...,...,...
873,1662,228869.864642
874,1234,234371.439112
875,1373,152430.717214
876,1672,102242.553569


In [12]:
# Write the submission file.
# index=False leaves out the index column, which kaggle does not want in a submission.
submission_1 = kaggle_sub_data.loc[:, ['id', 'saleprice']]
submission_1.to_csv('kaggle_sub_1.csv', index=False)

# Special Quiz 2 Note!

In [14]:
# Pair each feature with its fitted coefficient, largest coefficient first
pd.DataFrame(
    {'Feature': features, 'Coefficient': lr_1.coef_}
).sort_values('Coefficient', ascending=False)

Unnamed: 0,Feature,Coefficient
3,full_bath,39926.533364
4,half_bath,36934.545233
1,fireplaces,21263.791536
5,totrms_abvgrd,3399.40431
2,1st_flr_sf,83.49561
0,lot_area,0.064806


For every one-unit increase in the number of full baths, we could expect a $39,926 increase in sale price, holding all else constant.

----

### Model 2

In [13]:
# Start Model 2 from a fresh copy of test.csv, with the same column renaming
# (spaces -> underscores, lower case) that the training data received.
kaggle_sub_data = pd.read_csv('datasets/test.csv')
kaggle_sub_data.columns = (
    kaggle_sub_data.columns.str.replace(' ', '_', regex=False).str.lower()
)

In [14]:
# Model 2 reuses the Model 1 feature list and the same seeded split
features = [
    'lot_area', 'fireplaces', '1st_flr_sf',
    'full_bath', 'half_bath', 'totrms_abvgrd',
]
X = df.loc[:, features]
y = df.loc[:, 'saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
# Build the polynomial transformer and fit it on the training split only
# (fit returns the transformer itself, so the two steps can be chained)
poly = PolynomialFeatures(include_bias=False).fit(X_train)

# Apply the fitted transformer to both splits
X_train_pf, X_test_pf = poly.transform(X_train), poly.transform(X_test)

In [16]:
# Let's check our features
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of `get_feature_names_out`; use the new method when available and fall back to the
# old one so the cell also runs on older installs.
if hasattr(poly, 'get_feature_names_out'):
    pf_columns = poly.get_feature_names_out(X.columns)
else:
    pf_columns = poly.get_feature_names(X.columns)

pd.DataFrame(X_train_pf, columns=pf_columns)

Unnamed: 0,lot_area,fireplaces,1st_flr_sf,full_bath,half_bath,totrms_abvgrd,lot_area^2,lot_area fireplaces,lot_area 1st_flr_sf,lot_area full_bath,...,1st_flr_sf^2,1st_flr_sf full_bath,1st_flr_sf half_bath,1st_flr_sf totrms_abvgrd,full_bath^2,full_bath half_bath,full_bath totrms_abvgrd,half_bath^2,half_bath totrms_abvgrd,totrms_abvgrd^2
0,10667.0,1.0,1587.0,2.0,0.0,7.0,113784889.0,10667.0,16928529.0,21334.0,...,2518569.0,3174.0,0.0,11109.0,4.0,0.0,14.0,0.0,0.0,49.0
1,12888.0,2.0,1262.0,1.0,1.0,7.0,166100544.0,25776.0,16264656.0,12888.0,...,1592644.0,1262.0,1262.0,8834.0,1.0,1.0,7.0,1.0,7.0,49.0
2,7200.0,0.0,864.0,1.0,0.0,5.0,51840000.0,0.0,6220800.0,7200.0,...,746496.0,864.0,0.0,4320.0,1.0,0.0,5.0,0.0,0.0,25.0
3,14000.0,0.0,1306.0,2.0,1.0,7.0,196000000.0,0.0,18284000.0,28000.0,...,1705636.0,2612.0,1306.0,9142.0,4.0,2.0,14.0,1.0,7.0,49.0
4,11929.0,1.0,1251.0,2.0,1.0,9.0,142301041.0,11929.0,14923179.0,23858.0,...,1565001.0,2502.0,1251.0,11259.0,4.0,2.0,18.0,1.0,9.0,81.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1533,9709.0,2.0,958.0,2.0,1.0,8.0,94264681.0,19418.0,9301222.0,19418.0,...,917764.0,1916.0,958.0,7664.0,4.0,2.0,16.0,1.0,8.0,64.0
1534,9000.0,1.0,1196.0,1.0,0.0,6.0,81000000.0,9000.0,10764000.0,9000.0,...,1430416.0,1196.0,0.0,7176.0,1.0,0.0,6.0,0.0,0.0,36.0
1535,10140.0,1.0,1309.0,1.0,1.0,5.0,102819600.0,10140.0,13273260.0,10140.0,...,1713481.0,1309.0,1309.0,6545.0,1.0,1.0,5.0,1.0,5.0,25.0
1536,1869.0,0.0,483.0,1.0,1.0,5.0,3493161.0,0.0,902727.0,1869.0,...,233289.0,483.0,483.0,2415.0,1.0,1.0,5.0,1.0,5.0,25.0


In [17]:
# Fit a linear regression on the polynomial-expanded training features
lr_2 = LinearRegression().fit(X_train_pf, y_train)

# R^2 on the training split and on the held-out split
r2_train_2 = lr_2.score(X_train_pf, y_train)
r2_test_2 = lr_2.score(X_test_pf, y_test)
r2_train_2, r2_test_2

(0.7005281270500114, 0.47574345475712587)

In [18]:
# Generate Kaggle Submissions
# This attempt is expected to fail: lr_2 was fit on the polynomial-transformed
# features, but here it is handed only the untransformed columns. The error is caught
# and printed so the pitfall is still shown without halting "Restart & Run All".
try:
    kaggle_preds = lr_2.predict(kaggle_sub_data[features])
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 27 is different from 6)

This doesn't work because the model was fit on 27 polynomial features, but we passed it only the 6 original columns. The features we predict on from test.csv need to match the features we fit on from train.csv.

We can do that by transforming our features in the test.csv to match what we fit our model on.

In [19]:
# Reuse the already-fitted transformer (transform only, no refit) on the kaggle features
kaggle_X = kaggle_sub_data.loc[:, features]
kaggle_poly = poly.transform(kaggle_X)

In [20]:
# Predict sale prices from the transformed kaggle features
kaggle_preds = lr_2.predict(X=kaggle_poly)

In [21]:
# Attach the Model 2 predictions as the `saleprice` column
kaggle_sub_data = kaggle_sub_data.assign(saleprice=kaggle_preds)

In [22]:
# Write the second submission file, again without the index column
submission_2 = kaggle_sub_data.loc[:, ['id', 'saleprice']]
submission_2.to_csv('kaggle_sub_2.csv', index=False)