In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pickle
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
np.random.seed(42)

## Load in trained model and scaler

In [2]:
# use pickle.load to load in these assets
with open('../assets/lin_reg.pkl', 'rb') as f:
    lr = pickle.load(f)


In [3]:
# use pickle.load to load in these assets
with open('../assets/lin_reg_im.pkl', 'rb') as f:
    lr_im = pickle.load(f)


In [4]:
# use pickle.load to load in these assets
with open('../assets/lin_reg_im2.pkl', 'rb') as f:
    lr_ridge_im = pickle.load(f)

In [5]:
with open ('../assets/scaler.pkl', 'rb') as f:
      ss = pickle.load(f)

In [6]:
with open ('../assets/transformer_y.pkl', 'rb') as f:
      pt_y = pickle.load(f)

# Load train data 

In [7]:
#importing data
X=pd.read_csv('../datasets/X_train.csv',index_col="Id")

## Load kaggle data

In [8]:
kaggle_df = pd.read_csv('../datasets/test.csv', index_col='Id')
kaggle_df.head()

Unnamed: 0_level_0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,0,,,,0,4,2006,WD
2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,...,0,0,0,,,,0,8,2006,WD
2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,...,0,0,0,,,,0,9,2006,New
1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,...,0,0,0,,,,0,7,2007,WD
625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,...,0,185,0,,,,0,7,2009,WD


## Replace with numpy nan values

In [9]:
# replacing "nan" with np.nan
kaggle_df=kaggle_df.replace("nan", np.nan)

## Dropping duplicate rows

In [10]:
#kaggle_df.drop_duplicates(keep="first", inplace=True)

## Dropping unwanted columns

In [11]:
kaggle_df.drop(columns=["PID"],inplace=True)

In [12]:
#drop unwanted columns
kaggle_df.drop(columns=["Street","Alley","Fireplace Qu","Misc Feature","Fence","Pool QC","Lot Frontage"],inplace=True)

## Filling missing values for columns with large number of missing values for numerical data

In [13]:
#imputing with mean
kaggle_df= kaggle_df.fillna(kaggle_df.median().iloc[0])

## Filling missing values for columns with large number of missing values for categorical data

In [14]:
kaggle_df= kaggle_df.fillna(kaggle_df.mode().iloc[0])

In [15]:
kaggle_df.isnull().sum().sum()

0

## Encode a couple of discrete numeric columns as object before dummying

In [16]:
kaggle_df[['Overall Cond', 'Overall Qual']] =kaggle_df[['Overall Cond', 'Overall Qual']].astype('object')

In [17]:
kaggle_df.drop(columns=["Misc Val","Bsmt Half Bath","Yr Sold"],inplace=True)

In [18]:
X1=kaggle_df
X1=X1.select_dtypes(exclude=["object"])

In [19]:
# use pickle.load to load in these assets
with open('../assets/thresh.pkl', 'rb') as f:
    threshold = pickle.load(f)

In [20]:
X_t= threshold.fit_transform(X1)

In [21]:
columns = X1.columns[threshold.get_support()]

In [22]:
X1=pd.DataFrame(X_t, columns=columns)

In [23]:
features=X1.columns.tolist()

In [24]:
# use pickle.load to load in these assets
with open('../assets/poly.pkl', 'rb') as f:
    poly = pickle.load(f)

In [25]:
X_poly = poly.fit_transform(X1)

In [26]:
df1=pd.DataFrame(X_poly, columns=poly.get_feature_names(features))

In [27]:
df2=kaggle_df.select_dtypes(include=["object"])
df2 = df2.reset_index(drop = True)

In [28]:
df3=pd.concat([df2,df1],axis=1)

In [29]:
kaggle_df=kaggle_df.reset_index(drop=False)

In [30]:
df3["Id"] = kaggle_df["Id"] 

In [31]:
df3=df3.set_index("Id")

In [32]:
kaggle_df=df3

In [33]:
kaggle_df = pd.get_dummies(kaggle_df, drop_first = True)

In [34]:
kaggle_df.shape

(878, 483)

# Compare kaggle and train

In [35]:
# Let's compare the number of features in our kaggle dataset vs. our training dataset

print(f'We have {len(X.columns)} features in our training dataset')

print(f'We have {len(kaggle_df.columns)} features in our kaggle dataset')

We have 547 features in our training dataset
We have 483 features in our kaggle dataset


In [36]:
## STEP 1: If there are features in our training dataset that are NOT in the kaggle dataset, we will have to ADD those features to our kaggle dataset and set them equal to 0

# I am grabbing the set of our train features
# I am grabbing the set of our kaggle features
# I am differencing the sets, and seeing what's leftover from my training features

features_to_add = set(X.columns).difference(set(kaggle_df.columns))
print(features_to_add)


{'Gr Liv Area Garage Cars', 'Bsmt Full Bath TotRms AbvGrd', 'Garage Cars 3Ssn Porch', 'Condition 2_RRNn', 'Year Remod/Add Garage Cars', '2nd Flr SF Garage Cars', 'Roof Matl_CompShg', 'Functional_Sal', 'BsmtFin SF 1 Garage Cars', 'Bsmt Full Bath Enclosed Porch', 'Bsmt Unf SF Garage Cars', 'Mas Vnr Area Bsmt Full Bath', 'Electrical_Mix', 'Bsmt Full Bath Mo Sold', 'MS Zoning_C (all)', 'MS SubClass Bsmt Full Bath', 'Garage Cars Open Porch SF', 'Year Remod/Add Bsmt Full Bath', 'Lot Area Garage Cars', '1st Flr SF Garage Cars', 'Gr Liv Area Bsmt Full Bath', 'Garage Qual_Ex', 'Condition 2_RRAn', 'Garage Cars Wood Deck SF', 'Garage Cars', 'Bsmt Unf SF Bsmt Full Bath', 'Exterior 1st_Stone', 'Bsmt Full Bath Wood Deck SF', 'Garage Yr Blt Garage Cars', 'Bsmt Cond_Po', 'Lot Area Bsmt Full Bath', 'Bsmt Full Bath 3Ssn Porch', 'Roof Matl_Membran', '1st Flr SF Bsmt Full Bath', 'MS SubClass Garage Cars', 'Heating_Wall', 'Condition 2_PosN', 'Garage Cars Garage Area', 'Heating QC_Po', 'Garage Cars Enclosed

In [37]:

# let's add these features to our kaggle dataframe and set them all equal to 0!

for feature in features_to_add:
    kaggle_df[feature] = 0


In [38]:
# We should now have an empty set
set(X.columns).difference(set(kaggle_df.columns))

print(f'We have {len(X.columns)} features in our training dataset')

print(f'We have {len(kaggle_df.columns)} features in our kaggle dataset')

We have 547 features in our training dataset
We have 557 features in our kaggle dataset


In [39]:
## STEP 2: If there are features in our kaggle dataset that are NOT in the training dataset, we will have to DROP those features in our kaggle dataset.

# We are doing the same thing above, but switching it up!

features_to_delete = list(set(kaggle_df.columns).difference(set(X.columns)))
print(features_to_delete)

['Sale Type_VWD', 'Electrical_FuseA', 'Exterior 1st_PreCast', 'Kitchen Qual_Po', 'Roof Matl_Metal', 'Roof Matl_Roll', 'Exterior 2nd_Other', 'Exterior 2nd_PreCast', 'Mas Vnr Type_CBlock', 'Heating_GasA']


In [40]:
# We are going to drop these columns!

kaggle_df.drop(columns = features_to_delete, inplace=True)

print(f'We have {len(X.columns)} features in our training dataset')
print(f'We have {len(kaggle_df.columns)} features in our kaggle dataset')
print('yay')


We have 547 features in our training dataset
We have 547 features in our kaggle dataset
yay


In [41]:
## STEP 3: Re-order your kaggle dataset features!

# sanity check:
all(X.columns) == all(kaggle_df.columns)

True

In [42]:
# making sure the columns are in order!
kaggle_df = kaggle_df.loc[:, X.columns]

## Scale the kaggle data

In [43]:
kaggle_sc = ss.transform(kaggle_df)

In [44]:
kaggle_sc.shape

(878, 547)

## Model Prep: `logTransformer`

In [45]:
# kaggle_tr=pt.transform(kaggle_sc)

In [46]:
#kaggle_tr=kaggle_sc

# Make prediction

In [47]:

# Get sales predictions!
kaggle_preds = lr.predict(kaggle_sc)


In [48]:
#transform the prediction
# The .reshape(-1,1) method changes a numpy array into a numpy matrix with 1 column

kaggle_preds= pt_y.inverse_transform(kaggle_preds.reshape(-1,1))

In [49]:
# create a submission dataframe:
kaggle_df['SalePrice'] = kaggle_preds

submission = pd.DataFrame(kaggle_df['SalePrice'], index = kaggle_df.index)
submission.sort_index(inplace=True)
submission

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2,113048.444712
4,226232.093308
6,186540.402784
7,204590.133736
17,218331.750651
...,...
2919,84890.303974
2921,88902.318526
2922,160890.873560
2925,152243.548339


In [50]:
# Save your csv!
submission.to_csv('../datasets/submission.csv')

Then click this link to submit:¶
https://www.kaggle.com/c/dsi-us-11-project-2-regression-challenge/submit