In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from IPython.display import display   #From stack overflow link below

#:https://stackoverflow.com/questions/51288869/print-visually-pleasing-dataframes-in-for-loop-in-jupyter-notebook-pandas

In [2]:
# Load the test-set CSV; the path is relative to the notebook's working directory.
test = pd.read_csv('./datasets/test.csv')

# <span style = 'color: blue' > Cleaning </span>

**Viewing the DataFrame**

# Reformatting Columns and Column Names

**Creating `test_cols` Variable with all the Dataframe Columns:**

In [3]:
# Snapshot of the column labels exactly as loaded, taken before any of the renaming below.
test_cols = test.columns

**Making the headers all lowercase**


In [4]:
# Lower-case every column label through the vectorised string accessor.
test.columns = test.columns.str.lower()

**Replace spaces with underscores**


In [5]:
# Normalise labels to snake_case: lower-case first, then swap each space for an underscore.
test.columns = test.columns.str.lower().str.replace(" ", "_", regex=False)

**Changing the name of `totrms_abvgrd` to `tot_rooms_abvgrd`**


In [6]:
# Spell out "rooms" in the total-rooms-above-grade column label.
test = test.rename(columns={'totrms_abvgrd': 'tot_rooms_abvgrd'})

### Creating Consistency in Column Names

**Changes `area` to `sf`:**

In [7]:
# Literal (non-regex) substring swap on every label: 'area' becomes 'sf'.
test.columns = test.columns.str.replace('area', 'sf', regex=False)

**Changes `qc` to `qu`:**

In [8]:
# Literal (non-regex) substring swap on every label: 'qc' becomes 'qu'.
test.columns = test.columns.str.replace('qc', 'qu', regex=False)

**Changes `qual` to `qu`:**

In [9]:
# Literal (non-regex) substring swap on every label: 'qual' becomes 'qu'.
test.columns = test.columns.str.replace('qual', 'qu', regex=False)

**Changes `year` to `yr`:**

In [10]:
# Literal (non-regex) substring swap on every label: 'year' becomes 'yr'.
test.columns = test.columns.str.replace('year', 'yr', regex=False)

**Adding an underscore after `bsmt` when there wasn't one:**

In [11]:
# Append an underscore to every 'bsmt' occurrence; labels that already had one
# end up with a doubled underscore after this step.
test.columns = test.columns.str.replace("bsmt", "bsmt_", regex=False)

**Collapsing the double underscores created where `bsmt` was already followed by an underscore before the fix in the above cell:**

In [12]:
# Collapse each doubled underscore in the labels back to a single one.
test.columns = test.columns.str.replace("__", "_", regex=False)

**Updating the `test_cols` variable:**

In [13]:
# Re-capture the column labels now that the renaming steps above have been applied.
test_cols = test.columns

**Removing Houses With 4,000 or More Square Feet of Above-Grade Living Area, as Recommended in the Data Dictionary:**

In [14]:
# Keep only houses with under 4,000 sq ft of above-grade living area.
# .copy() makes `test` an independent frame instead of a slice of the loaded data, so the
# in-place modifications made to it later cannot trigger SettingWithCopyWarning.
# NOTE(review): rows dropped here receive no prediction further down -- confirm that the
# submission is allowed to omit them.
test = test[test['gr_liv_sf'] < 4000].copy()
test.shape

(876, 80)

**Assigning Numerical Values to Ordinal Values in Categorical Columns:**

Excellent(Ex) : 5 

Typical/Average (TA) : 4 

Good (Gd) : 3 

Fair (Fa) : 2

Poor (Po) : 1

In [15]:
# Numeric codes for the quality/condition grades.
# NOTE(review): 'TA' (4) is coded above 'Gd' (3). The usual Ames ordering is
# Ex > Gd > TA > Fa > Po, so this looks inverted -- confirm against the training
# notebook before changing it, because train and test must share one encoding.
quality_numbers = {'Ex': 5, 'TA': 4, 'Gd': 3, 'Fa': 2, 'Po': 1}

# Columns graded on that scale. After the header clean-up the fireplace column is
# 'fireplace_qu'; the previous key 'fireplacequ' matched no column, so replace()
# silently skipped it and the fireplace grades stayed as text.
quality_cols = [
    'heating_qu',
    'pool_qu',
    'garage_cond',
    'garage_qu',
    'fireplace_qu',
    'kitchen_qu',
    'bsmt_cond',
    'bsmt_qu',
    'exter_cond',
    'exter_qu',
]

# Reassign rather than mutate in place: `test` is a filtered frame, and in-place edits
# on a slice can raise SettingWithCopyWarning.
test = test.replace({col: quality_numbers for col in quality_cols})

**Creating a Numerical Columns Variable Called `num_cols`:**

In [16]:
# Numeric (and boolean) columns of `test`, selected through the public dtype API rather
# than the private `_get_numeric_data` helper, which may change between pandas versions.
# Note: this is a DataFrame, not a list of names; iterating over it yields column labels.
num_cols = test.select_dtypes(include=['number', 'bool'])

**Creating a Categorical Columns List Called `cat_cols`:**

In [17]:
#Code adapted from GeeksforGeeks.org          https://www.geeksforgeeks.org/python-difference-two-lists/

def get_difference(lst_1, lst_2):
    """Return the unique items of ``lst_1`` that do not appear in ``lst_2``.

    Items come back in the order they first occur in ``lst_1``. A plain set
    difference would return them in arbitrary order, which can change between
    runs and make the result non-reproducible.

    Parameters
    ----------
    lst_1 : iterable of hashable
        Items to keep (e.g. all column labels).
    lst_2 : iterable of hashable
        Items to exclude (e.g. the numeric column labels). Iterating over a
        DataFrame yields its column labels, so a frame can be passed directly.

    Returns
    -------
    list
        De-duplicated items of ``lst_1`` not found in ``lst_2``.
    """
    excluded = set(lst_2)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return [item for item in dict.fromkeys(lst_1) if item not in excluded]
                 # cat_cols

In [18]:
# Categorical columns: every column label that is not one of the numeric columns.
cat_cols = get_difference(lst_1=test_cols, lst_2=num_cols)

**Creating a Variable with Columns that Seem to Have a High Correlation to Sale Price called `sig_cols`:**

In [20]:
# sig_cols = test[['overall_qual', 'gr_liv_area', 'total_bsmt_sf', 'garage_area', 'garage_cars', '1st_flr_sf', 'year_built', 'full_bath', 'tot_rooms_abvgrd']]

In [21]:
# sig_cols_w_nulls = ['garage_cars', 'garage_sf', 'total_bsmt_sf']

In [32]:
# Feature columns passed to the model, listed in the order they are selected from `test`.
sig_cols = [
    'overall_qu',
    'gr_liv_sf',
    '1st_flr_sf',
    'yr_built',
    'full_bath',
    'tot_rooms_abvgrd',
]

In [33]:
# Load the pickled model stored in the file 'kag_sub'.
# The context manager closes the file handle as soon as unpickling finishes; the bare
# open() call used before left the handle open.
# NOTE(security): pickle.load can execute arbitrary code -- only load files you trust.
with open('kag_sub', 'rb') as model_file:
    model_from_pickling = pickle.load(model_file)

In [34]:
# Feature matrix: all rows of `test`, restricted to the `sig_cols` columns in that order.
X = test.loc[:, sig_cols]

In [36]:
# Run the unpickled model on the feature matrix.
# Presumably returns one predicted sale price per row of X -- confirm against the training notebook.
preds = model_from_pickling.predict(X)

In [38]:
# House ids for the rows still in `test`. This is a Series, so it carries `test`'s row
# labels and will align on those labels when assigned into another frame.
index = test['id']

In [40]:
# Pair ids and predictions by position.
# `index` is a Series that still carries the row labels of the filtered `test` frame,
# while the predictions are positional (0..n-1). Assigning the Series as a column aligns
# on labels, which yields NaN or mismatched ids wherever labels and positions differ.
# np.asarray strips the labels so the i-th id is matched with the i-th prediction.
submission = pd.DataFrame(
    {'SalePrice': np.round(np.ravel(preds))},
    index=pd.Index(np.asarray(index), name='Id'),
)
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658.0,152632.0
2718.0,212583.0
2414.0,190268.0
1989.0,97345.0
625.0,185373.0


In [42]:
# Sanity check: (rows, columns) of the submission frame.
submission.shape

(876, 1)