In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pickle
import numpy as np
# Seed NumPy's global random number generator so NumPy-based randomness in this notebook is repeatable.
np.random.seed(42)

## Load in trained model and scaler

In [None]:
# Deserialize the two saved assets: the regression model (`lr`) and the scaler (`ss`).
# NOTE: pickle.load can execute arbitrary code, so only load asset files you trust.
with open('../assets/lin_reg.pkl', 'rb') as f:
    lr = pickle.load(f)

with open('../assets/scalar.pkl', 'rb') as f:
    ss = pickle.load(f)

# Load train data 

In [32]:
# Read the training table from the datasets folder and preview its first rows.
df = pd.read_csv('../datasets/train_df.csv')
df.head()

Unnamed: 0,MS SubClass,Lot Area,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,...,Paved Drive_P,Paved Drive_Y,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,60,13517,1976,2005,289.0,533.0,0.0,192.0,725.0,725,...,0,1,0,0,0,0,0,0,0,1
1,60,11492,1996,1997,132.0,637.0,0.0,276.0,913.0,913,...,0,1,0,0,0,0,0,0,0,1
2,20,7922,1953,2007,0.0,731.0,0.0,326.0,1057.0,1057,...,0,1,0,0,0,0,0,0,0,1
3,60,9802,2006,2007,0.0,0.0,0.0,384.0,384.0,744,...,0,1,0,0,0,0,0,0,0,1
4,50,14235,1900,1993,0.0,0.0,0.0,676.0,676.0,831,...,0,0,0,0,0,0,0,0,0,1


In [35]:
# Display the target column of the training table.
df.loc[:, "SalePrice"]

0       130500
1       220000
2       109000
3       174000
4       138500
         ...  
1950    298751
1951     82500
1952    177000
1953    144000
1954    189000
Name: SalePrice, Length: 1955, dtype: int64

In [37]:
# Training feature matrix: every column of the training table except the target.
X = df.drop(columns="SalePrice")

## Load kaggle data

In [38]:
# Read the kaggle test file, using its 'Id' column as the row index, and preview the first rows.
kaggle_df = pd.read_csv('../datasets/test.csv', index_col='Id')
kaggle_df.head()

Unnamed: 0_level_0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,0,,,,0,4,2006,WD
2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,...,0,0,0,,,,0,8,2006,WD
2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,...,0,0,0,,,,0,9,2006,New
1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,...,0,0,0,,,,0,7,2007,WD
625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,...,0,185,0,,,,0,7,2009,WD


## Replace with numpy nan values

In [39]:
# Turn any literal "nan" strings into real missing values (np.nan).
kaggle_df = kaggle_df.replace(to_replace="nan", value=np.nan)

## Dropping duplicate rows

In [40]:
# Keep only the first occurrence of each fully duplicated row.
kaggle_df = kaggle_df.drop_duplicates(keep="first")

## Dropping unwanted columns

In [41]:
# Remove the PID column from the kaggle frame.
kaggle_df = kaggle_df.drop(columns="PID")

In [42]:
# Remove the columns that are not wanted as model features.
kaggle_df = kaggle_df.drop(
    columns=[
        "Street",
        "Alley",
        "Fireplace Qu",
        "Misc Feature",
        "Fence",
        "Pool QC",
        "Lot Frontage",
    ]
)

## Filling missing values in numerical columns that have a large number of missing values

In [43]:
# Impute each numeric column with its own median.
# `kaggle_df.median()` is a Series indexed by column name, so passing it to fillna
# fills every numeric column with that column's median. Taking `.iloc[0]` of it would
# instead yield one scalar (the first column's median) and write it into every NaN of
# every column, categorical ones included.
# numeric_only=True skips the object columns; they are left for the mode imputation.
# NOTE(review): confirm the training data was imputed the same way before trusting predictions.
kaggle_df = kaggle_df.fillna(kaggle_df.median(numeric_only=True))

## Filling missing values in categorical columns that have a large number of missing values

In [44]:
# Fill any remaining missing values with each column's mode.
# `mode()` returns a DataFrame (one row per tied mode); `.iloc[0]` takes its first row,
# giving one fill value per column.
kaggle_df= kaggle_df.fillna(kaggle_df.mode().iloc[0])

In [45]:
# Total number of missing cells left in the whole frame (expected to be 0 after imputation).
kaggle_df.isna().sum().sum()

0

## Handling outliers

In [46]:
def remove_outlier(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` value lies within the Tukey fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1/Q3 are the
    25th/75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to filter; it is not modified.
    col_name : str
        Name of the numeric column used to detect outliers.

    Returns
    -------
    pandas.DataFrame
        The subset of rows with ``fence_low <= value <= fence_high``. Rows whose value
        is missing are dropped because the comparison is False for NaN.
    """
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    # Inclusive bounds: a value sitting exactly on a fence is not an outlier. This also
    # keeps the data when the IQR is 0 (e.g. a constant column), where strict
    # inequalities would discard every row.
    within_fences = df_in[col_name].between(fence_low, fence_high)
    return df_in.loc[within_fences]

## Encode a couple of discrete numeric columns as object before dummying

In [47]:
# Cast the two ordinal rating columns to object so get_dummies treats them as categorical.
kaggle_df = kaggle_df.astype({'Overall Cond': 'object', 'Overall Qual': 'object'})

In [48]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each
# one, so k levels produce k-1 indicator columns.
kaggle_df = pd.get_dummies(kaggle_df, drop_first = True)

In [49]:
# (rows, columns) of the kaggle frame after dummy encoding.
kaggle_df.shape

(878, 240)

## Manually drop the collinear features

## Use sklearn polynomial features

In [None]:
# X = kaggle_df

In [None]:
# features=X.columns.tolist()

In [None]:
# poly = PolynomialFeatures(include_bias=False)

In [None]:
# X_poly = poly.fit_transform(X)

In [None]:
# pd.DataFrame(X_poly, columns=poly.get_feature_names(features))

# Compare kaggle and train

In [50]:
# Let's compare the number of features in our kaggle dataset vs. our training dataset.
# The two counts must match (same names, same order) before kaggle_df can be passed to the model.

print(f'We have {len(X.columns)} features in our training dataset')

print(f'We have {len(kaggle_df.columns)} features in our kaggle dataset')

We have 255 features in our training dataset
We have 240 features in our kaggle dataset


In [51]:
## STEP 1: Features present in the training dataset but absent from the kaggle dataset
## must be ADDED to the kaggle dataset and set equal to 0.

# Set difference: training feature names minus kaggle feature names.
features_to_add = set(X.columns) - set(kaggle_df.columns)
print(features_to_add)


{'Heating_Wall', 'Garage Qual_Ex', 'Condition 2_RRAe', 'Heating QC_Po', 'Heating_OthW', 'Neighborhood_Landmrk', 'Electrical_Mix', 'Utilities_NoSeWa', 'Roof Matl_CompShg', 'Condition 2_RRNn', 'Bsmt Cond_Po', 'Neighborhood_GrnHill', 'Functional_Sal', 'Functional_Sev', 'Condition 2_PosN', 'Roof Matl_Membran', 'MS Zoning_C (all)', 'Exterior 1st_CBlock', 'Overall Qual_2', 'Exterior 1st_Stone', 'Condition 2_Feedr', 'Exterior 1st_ImStucc', 'Bsmt Cond_Ex', 'Exterior 2nd_Stone', 'Condition 2_RRAn'}


In [52]:

# Add every missing training feature to the kaggle frame as an all-zero column.
kaggle_df = kaggle_df.assign(**{feature: 0 for feature in features_to_add})


In [53]:
# We should now have an empty set: every training feature must exist in the kaggle frame.
# A bare expression in the middle of a cell is neither displayed nor checked, so assert it.
assert not set(X.columns).difference(kaggle_df.columns), \
    f'Training features still missing from kaggle data: {sorted(set(X.columns).difference(kaggle_df.columns))}'

print(f'We have {len(X.columns)} features in our training dataset')

print(f'We have {len(kaggle_df.columns)} features in our kaggle dataset')

We have 255 features in our training dataset
We have 265 features in our kaggle dataset


In [54]:
## STEP 2: Features present in the kaggle dataset but absent from the training dataset
## must be DROPPED from the kaggle dataset.

# Same set difference as in step 1, with the operands swapped.
features_to_delete = list(set(kaggle_df.columns) - set(X.columns))
print(features_to_delete)

['Exterior 2nd_Other', 'Electrical_FuseA', 'Mas Vnr Type_CBlock', 'Kitchen Qual_Po', 'Sale Type_VWD', 'Exterior 2nd_PreCast', 'Roof Matl_Roll', 'Roof Matl_Metal', 'Heating_GasA', 'Exterior 1st_PreCast']


In [55]:
# Remove the kaggle-only columns, then report both feature counts again.
kaggle_df = kaggle_df.drop(columns=features_to_delete)

print(f'We have {len(X.columns)} features in our training dataset')
print(f'We have {len(kaggle_df.columns)} features in our kaggle dataset')
print('yay')


We have 255 features in our training dataset
We have 255 features in our kaggle dataset
yay


In [56]:
## STEP 3: Re-order your kaggle dataset features!

# sanity check: both frames must contain exactly the same feature names
# (their order is aligned in the next cell).
# Compare the name sets directly. `all(a) == all(b)` would only compare two truthiness
# flags and is True for practically any two column lists.
set(X.columns) == set(kaggle_df.columns)

True

In [57]:
# Put the kaggle columns in exactly the same order as the training features.
kaggle_df = kaggle_df[X.columns.tolist()]

In [58]:
# Fill NA's with 0
# Safety net: replace any missing value still present in the aligned frame with 0.

kaggle_df = kaggle_df.fillna(0)

In [59]:
# Make prediction

# Get sales predictions!
# `lr` is bound by the pickle-loading cell near the top of the notebook; that cell must
# be executed first, otherwise this line raises NameError.
# NOTE(review): the scaler `ss` is loaded alongside `lr` but is not applied to kaggle_df
# in the visible cells. Confirm whether `lr` was fitted on scaled features; if so,
# kaggle_df needs the same transform before predicting.
kaggle_preds = lr.predict(kaggle_df)


NameError: name 'lr' is not defined

In [None]:
# The .reshape(-1,1) method changes a numpy array into a numpy matrix with 1 column
# pred_reversed = pt_y.inverse_transform(pred.reshape(-1,1))

def mlr_residual_visualizer(y_true, y_hat):
    """Scatter actual values against predicted values with a red identity line.

    Points on the identity line are predictions that equal the observed value.

    Parameters
    ----------
    y_true : array-like
        Observed target values (plotted on the y-axis).
    y_hat : array-like
        Predicted target values (plotted on the x-axis).

    Returns
    -------
    None
        The figure is drawn through the pyplot interface ``plt``.

    Notes
    -----
    NOTE(review): ``plt`` (presumably ``matplotlib.pyplot``) is not imported in the
    visible import cell. Confirm it is in scope before calling this function.
    """
    plt.figure(figsize=(20, 10))
    plt.scatter(y_hat, y_true, c='#7D1B7E');
    upper = np.max(y_true)
    plt.plot((0, upper), (0, upper), c='red');  # identity line
    # Raw strings: "\h" is not a valid Python escape sequence, so the LaTeX labels are
    # written as raw literals. The resulting text is unchanged.
    plt.xlabel(r"Predicted Values: $\hat{y}$", fontsize=20);
    plt.ylabel("Actual Values: $y$", fontsize=20);
    plt.title(r"$\hat{y}$ vs. $y$", fontsize=40);
    plt.legend(['Actual']);
    # Tick label size is set once per axis.
    plt.yticks(fontsize=15);
    plt.xticks(fontsize=15);

In [None]:
# create a submission dataframe:
# Build it directly from the predictions, indexed like kaggle_df, rather than first
# writing a 'SalePrice' column into kaggle_df. Adding the target to the feature frame
# would change what gets passed to lr.predict if the prediction cell is run again.
submission = pd.DataFrame({'SalePrice': np.ravel(kaggle_preds)}, index=kaggle_df.index)
submission = submission.sort_index()
submission.head()

In [None]:
# Save your csv!
# Written to the same '../datasets' folder the input files are read from. The index is
# included in the file as the first column.
# NOTE(review): the path previously pointed at './datasets/'; confirm the intended
# output folder.
submission.to_csv('../datasets/trash_submission.csv')

Then click this link to submit:
https://www.kaggle.com/c/dsi-us-11-project-2-regression-challenge/submit