## Data Exploration and Understanding correlations
This script explores the RECS data in order to understand:
1. The distinction between data features (categorical, numerical, possible predictors and outputs)
2. The correlations between features
3. Any missing / duplicate values
4. Which data requires one-hot encoding


In [14]:
# Import dependencies
import pandas as pd
from pandas import set_option
import numpy
import os
import csv

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Input locations; the file names are joined onto dataFilePath with
# os.path.join when the CSVs are read.
cols_file = "Final_Columns_withCat.csv"  # CSV loaded into df_cols
filename = "RECS_COMBINED_DATA.csv"  # CSV loaded into df_recs
dataFilePath = "dataforfinalproject"  # directory containing both CSV files

In [5]:
# Load the column table (df_cols) and the main dataset (df_recs) from the
# data directory.
df_cols = pd.read_csv(os.path.join(dataFilePath, cols_file))
df_recs = pd.read_csv(
    os.path.join(dataFilePath, filename),
    # Parse the file in one pass rather than in chunks, so column dtypes are
    # inferred consistently across the whole file.
    low_memory=False,
)

In [16]:
# Start data exploration: report the dataset dimensions, then preview it.
print(f"shape of the dataset df_recs is {df_recs.shape}")

# Render floats with one decimal place in displayed tables. The fully
# qualified option key is required: the bare "precision" pattern is ambiguous
# on newer pandas (it also matches "styler.format.precision") and raises
# OptionError.
set_option("display.precision", 1)
df_recs.head()

shape of the dataset df_recs is (26973, 199)


Unnamed: 0,RECSYEAR,DOEID,REGIONC,DIVISION,TYPEHUQ,CELLAR,STORIES,PRKGPLC1,YEARMADE,OCCUPYY,...,WDPELLET,WDOTHER,WDWARM,WDWATER,USESOLAR,SOLWATER,SOLOTHER,TVAUDIOEQUIP,PCOFFEQUIP,PHONE
0,2001,1001,1,1,2,1,31,1,9,9,...,9,0,0,0,0,0,0,8,2,3
1,2001,1002,3,7,2,0,10,1,7,9,...,9,0,0,0,0,0,0,7,2,3
2,2001,1003,4,9,2,0,20,1,4,1,...,9,0,0,0,0,0,0,9,0,3
3,2001,1004,1,1,2,1,31,1,1,2,...,0,0,1,0,0,0,0,12,6,3
4,2001,1005,4,9,2,0,10,1,3,9,...,9,0,0,0,0,0,0,6,2,3


In [17]:
# Summary statistics for df_recs, displayed with one decimal place.
# Use the fully qualified "display.precision" key: the bare "precision"
# pattern is ambiguous on newer pandas (it also matches
# "styler.format.precision") and raises OptionError.
set_option("display.precision", 1)
df_recs.describe()

Unnamed: 0,RECSYEAR,DOEID,REGIONC,DIVISION,TYPEHUQ,CELLAR,STORIES,PRKGPLC1,YEARMADE,OCCUPYY,...,WDPELLET,WDOTHER,WDWARM,WDWATER,USESOLAR,SOLWATER,SOLOTHER,TVAUDIOEQUIP,PCOFFEQUIP,PHONE
count,26973.0,26973.0,26973.0,26973.0,26973.0,26973.0,26973.0,26973.0,26973.0,26973.0,...,26973.0,26973.0,26973.0,27000.0,27000.0,27000.0,27000.0,26973.0,26973.0,26973.0
mean,2008.2,6382.2,2.6,5.3,2.6,1.1,19.1,0.8,885.5,6.8,...,1.5,-1.1,0.1,0.0012,0.0093,0.0024,0.0059,6.9,2.9,1.6
std,4.6,4460.8,1.1,2.8,1.2,3.4,25.5,2.6,978.1,2.2,...,4.9,1.0,0.3,0.034,0.096,0.049,0.076,3.9,2.5,1.1
min,2001.0,1.0,1.0,1.0,1.0,-2.0,-2.0,-2.0,1.0,1.0,...,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2005.0,2585.0,2.0,3.0,2.0,0.0,10.0,0.0,4.0,6.0,...,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0
50%,2009.0,5064.0,3.0,5.0,2.0,0.0,10.0,0.0,8.0,8.0,...,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,2.0
75%,2009.0,10513.0,3.0,7.0,3.0,1.0,20.0,1.0,1970.0,8.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,4.0,2.0
max,2015.0,15686.0,4.0,10.0,5.0,9.0,99.0,9.0,2009.0,9.0,...,9.0,1.0,1.0,1.0,1.0,1.0,1.0,43.0,23.0,10.0


In [20]:
# Rows whose YEARMADE exceeds 20. The describe() output shows YEARMADE ranging
# from 1 to 2009, so the column appears to mix small codes with four-digit
# years; presumably this isolates the rows holding actual years - TODO confirm.
df_recs.loc[df_recs["YEARMADE"] > 20]

Unnamed: 0,RECSYEAR,DOEID,REGIONC,DIVISION,TYPEHUQ,CELLAR,STORIES,PRKGPLC1,YEARMADE,OCCUPYY,...,WDPELLET,WDOTHER,WDWARM,WDWATER,USESOLAR,SOLWATER,SOLOTHER,TVAUDIOEQUIP,PCOFFEQUIP,PHONE
9204,2009,1,2,4,2,1,20,1,2004,8,...,-2,-2,0,0,0,0,0,11,5,2
9205,2009,2,4,10,2,0,31,1,1998,8,...,0,1,1,0,0,0,0,6,4,0
9206,2009,3,1,1,5,-2,-2,-2,1965,7,...,-2,-2,0,0,0,0,0,2,3,2
9207,2009,4,2,3,2,0,10,1,1985,5,...,-2,-2,0,0,0,0,0,9,4,2
9208,2009,5,1,1,3,1,20,0,1983,5,...,-2,-2,0,0,0,0,0,7,4,1
9209,2009,6,2,4,2,1,20,0,1920,5,...,-2,-2,0,0,0,0,0,3,5,1
9210,2009,7,1,2,2,1,20,0,1922,5,...,0,0,1,0,0,0,0,15,7,2
9211,2009,8,3,5,2,0,10,1,1975,8,...,-2,-2,0,0,0,0,0,3,3,0
9212,2009,9,1,2,3,1,20,0,1920,6,...,-2,-2,0,0,0,0,0,5,0,2
9213,2009,10,2,4,2,1,10,0,1955,7,...,-2,-2,0,0,0,0,0,6,3,2


In [17]:
# The NGXBTU column is not needed, so remove it from df in place.
# NOTE(review): `df` is not assigned in any of the visible cells (the data is
# loaded as `df_recs`); confirm which frame this cell is meant to modify.
df.drop(columns=["NGXBTU"], inplace=True)

In [20]:
# Total number of missing cells across the whole frame.
df.isna().to_numpy().sum()

0

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of df.
df.describe()

Unnamed: 0,DOEID,REGIONC,DIVISION,TYPEHUQ,ZTYPEHUQ,CELLAR,ZCELLAR,BASEFIN,ZBASEFIN,ATTIC,...,PERIODEL,ZELAMOUNT,PERIODNG,ZNGAMOUNT,FOXBTU,PERIODFO,ZFOAMOUNT,LPXBTU,PERIODLP,ZLPAMOUNT
count,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,...,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0,5686.0
mean,12843.5,2.760816,5.670243,2.596025,0.0,-0.205593,-0.505276,-1.199261,-1.381815,-0.111854,...,1.454801,0.10904,0.181674,-0.731094,137.441423,-1.738305,-1.872318,91.33,-1.479071,-1.748505
std,1641.551147,1.004187,2.842655,1.164641,0.0,1.134775,0.880288,1.235166,0.933693,1.187953,...,1.250216,0.311716,2.197037,1.117215,0.142739,1.197667,0.558504,5.230054e-12,1.66304,0.76791
min,10001.0,1.0,1.0,1.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,1.0,0.0,-2.0,-2.0,135.0,-2.0,-2.0,91.33,-2.0,-2.0
25%,11422.25,2.0,3.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,1.0,0.0,-2.0,-2.0,137.45,-2.0,-2.0,91.33,-2.0,-2.0
50%,12843.5,3.0,5.0,2.0,0.0,0.0,0.0,-2.0,-2.0,0.0,...,1.0,0.0,1.0,0.0,137.45,-2.0,-2.0,91.33,-2.0,-2.0
75%,14264.75,4.0,8.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,137.45,-2.0,-2.0,91.33,-2.0,-2.0
max,15686.0,4.0,10.0,5.0,0.0,1.0,1.0,1.0,1.0,1.0,...,5.0,1.0,5.0,1.0,137.45,5.0,1.0,91.33,5.0,1.0


In [None]:
# TODO: separate out predictors (not implemented yet)
