In [1]:
# Import dependencies
import pandas as pd
from pandas import set_option
import numpy
import os
import csv

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# import preprocessing from sklearn
from sklearn import preprocessing

# DictVectorizer
from sklearn.feature_extraction import DictVectorizer


def generateX(ohe = True, target = "BTU", data_dir = "dataforfinalproject"):
    """Load the combined RECS data and build the feature matrix X for modeling.

    Reads ``RECS_COMBINED_DATA.csv`` and ``Final_Columns_withCat.csv`` from
    ``data_dir``, keeps only the columns whose ``FEATURES_MODEL`` flag is "Y",
    cleans sentinel codes, drops cost/consumption columns according to
    ``target`` and optionally encodes the result with ``DictVectorizer``.

    Cleaning steps, in order:
      1. Numeric feature columns whose max is 99 or whose min is negative are
         selected; in those columns the codes 99 and -2 are replaced by the
         column mode (the original author's note: 99 means missing/unavailable).
      2. Any remaining negative ``ESDISHW`` value is set to 0 (skipped when the
         column is not part of the feature set).

    Parameters
    ----------
    ohe : bool, default True
        When True, rows are converted to dicts and encoded with
        ``DictVectorizer(sparse=False)``; the dense array and the fitted
        ``vocabulary_`` are returned. When False, the cleaned DataFrame and
        its column index are returned.
    target : str, default "BTU"
        ``"BTU"`` drops every feature whose name contains "BTU". Any other
        value drops only features whose name contains "TOTALBTU". Features
        whose name contains "DOL" are dropped in both cases.
    data_dir : str, default "dataforfinalproject"
        Directory that holds the two CSV files.

    Returns
    -------
    tuple
        ``(X_encoded, vocabulary)`` if ``ohe`` is True, else ``(X, X.columns)``.
    """
    filename = "RECS_COMBINED_DATA.csv"
    cols_file = "Final_Columns_withCat.csv"

    # Dataset with all survey years combined, plus the column catalogue.
    df_recs = pd.read_csv(os.path.join(data_dir, filename), low_memory=False)
    df_cols = pd.read_csv(os.path.join(data_dir, cols_file))

    # Names of the columns flagged as model features.
    feature_names = df_cols.loc[df_cols.FEATURES_MODEL == "Y", "COLUMN_NAME"].tolist()

    # Explicit copy: the frame is modified below and must not alias df_recs.
    modelDF = df_recs[feature_names].copy()
    print(f" X Features shape : {modelDF.shape}")

    y_label = df_recs['TOTALBTU']
    print(f"y label shape : {y_label.shape}")

    # Numeric columns that carry sentinel codes: max of 99 or any negative value.
    numeric = modelDF.select_dtypes(include="number")
    has_sentinel = (numeric.max() == 99.0) | (numeric.min() < 0)
    cols99_2 = has_sentinel[has_sentinel].index.tolist()
    print(f"cols with values as 99 and -2: {cols99_2} \n")

    if cols99_2:
        # Step 1: blank out the 99 / -2 codes.
        flagged = modelDF[cols99_2]
        cleaned = flagged.mask(flagged.isin([99, -2]))

        # Step 2: fill the blanks with each column's (first) mode. The mode
        # frame is empty when every flagged column is entirely sentinel codes.
        modes = cleaned.mode()
        if not modes.empty:
            cleaned = cleaned.fillna(modes.iloc[0])
        modelDF[cols99_2] = cleaned

    # ESDISHW also uses other negative codes (e.g. -9, -8); map them to 0.
    if 'ESDISHW' in modelDF.columns:
        esdishw = modelDF['ESDISHW']
        modelDF['ESDISHW'] = esdishw.mask(esdishw < 0, 0)

    # Report how many missing values are left after imputation.
    print(f"NaN Count : {modelDF.isnull().values.sum()}")

    # Cost columns ("DOL") are always dropped. For the BTU target every BTU
    # column goes; otherwise only the TOTALBTU column(s).
    btu_token = "BTU" if target == "BTU" else "TOTALBTU"
    drop_cols = [
        name for name in feature_names
        if "DOL" in name or btu_token in name
    ]
    X = modelDF.drop(columns=drop_cols)

    print(f"shape of X is {X.shape}")

    if ohe:
        # sparse=False returns a dense ndarray instead of a sparse matrix.
        dv_X = DictVectorizer(sparse=False)
        X_encoded = dv_X.fit_transform(X.to_dict(orient="records"))
        return (X_encoded, dv_X.vocabulary_)

    return (X, X.columns)
    

In [2]:
# Build the feature set for the non-BTU ("DOLLAR") target. Note that generateX
# returns a 2-tuple, so X_enc holds both elements rather than just the matrix.
X_enc = generateX(ohe = True, target = "DOLLAR")

 X Features shape : (22591, 87)
y label shape : (22591,)
cols with values as 99 and -2: ['CELLAR', 'STORIES', 'PRKGPLC1', 'BEDROOMS', 'FUELPOOL', 'SIZRFRI1', 'TYPERFR1', 'AGERFRI1', 'SIZRFRI2', 'TYPERFR2', 'AGERFRI2', 'NUMFREEZ', 'SIZFREEZ', 'AGEFRZR', 'STOVENFUEL', 'OVENUSE', 'AMTMICRO', 'DWASHUSE', 'WASHLOAD', 'WASHTEMP', 'EQUIPM', 'FUELHEAT', 'EQUIPAGE', 'PROTHERM', 'TEMPHOME', 'TEMPGONE', 'TEMPNITE', 'AGECENAC', 'USECENAC', 'NUMBERAC', 'WWACAGE', 'USEWWAC', 'FUELH2O', 'WHEATAGE', 'WHEATSIZ', 'LGTOUTNUM', 'ESDISHW', 'PGASHEAT', 'WOODAMT'] 

Duplicate Count : 0
shape of X is (22591, 86)


In [9]:
# Display the tuple returned by generateX.
# NOTE(review): the saved output shows a DataFrame as the second element, while
# generateX with ohe=True returns the DictVectorizer vocabulary — re-run to refresh.
X_enc

(array([[  2.,   3.,   3., ...,   0.,   3.,   9.],
        [  2.,   1.,   3., ...,   0.,   3.,   7.],
        [  2.,   3.,   3., ...,   0.,   3.,   4.],
        ..., 
        [  3.,   3.,   1., ...,   0.,   1.,   7.],
        [  1.,  41.,   3., ...,   0.,   3.,   7.],
        [  2.,   3.,   3., ...,   0.,   3.,   4.]]),
        ADQINSUL  AGECENAC  AGEFRZR  AGERFRI1  AGERFRI2  AMTMICRO  BEDROOMS  \
 0           2.0       3.0      3.0       2.0       3.0       2.0       6.0   
 1           2.0       1.0      3.0       1.0       3.0       4.0       3.0   
 2           2.0       3.0      3.0       1.0       3.0       9.0       3.0   
 3           1.0       3.0      4.0       3.0       4.0       4.0       6.0   
 4           3.0       3.0      3.0       4.0       3.0       1.0       3.0   
 5           3.0       3.0      3.0       3.0       3.0       2.0       4.0   
 6           3.0       3.0      3.0       4.0       4.0       4.0       1.0   
 7           4.0       3.0      3.0       1.0 

In [4]:
# NOTE(review): X_inv is not defined in any visible cell and the execution
# counts are out of order — confirm this still works after Restart & Run All.
X_inv.shape

(22591, 79)