<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Manipulation-Classes" data-toc-modified-id="Data-Manipulation-Classes-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Manipulation Classes</a></span></li></ul></div>

# Data Manipulation Classes

This notebook collects the helper classes used throughout the different lessons.

In [6]:
import pandas as pd
import numpy as np

In [34]:
class Dataset:
    """
    This class allows a simpler representation of the dataset used
    to build a model in class. It allows loading a remote CSV by
    providing an URL to the initialization method of the object.

        my_data = Dataset(URL)
        my_data.set_target('column_name')
        my_data.describe()

    Attributes:
        data: DataFrame returned by `pd.read_csv`.
        features: list with the column names of `data`, without the
            target column once `set_target` has been called.
        target: name of the target column, or None when not set.
        meta: dictionary built by `metainfo`, or None before that call.
    """

    meta = None
    target = None
    features = None

    def __init__(self, data_location):
        """
        Read the CSV at `data_location` (handed over to `pd.read_csv`
        as is) and list all of its columns as features.
        """
        self.data = pd.read_csv(data_location)
        self.features = list(self.data)

    def set_target(self, target):
        """
        Mark the column `target` as the target variable, and take it
        out of the list of features.

        The list of features is rebuilt from the columns of the data on
        every call, so the method can be re-run (same cell executed
        twice) and the target can be changed without losing the column
        that was the target before.

        Raises:
            ValueError: if `target` is not a column of the dataset.
        """
        columns = list(self.data)
        if target not in columns:
            raise ValueError(
                "'{}' is not a column of the dataset".format(target))
        self.features = [name for name in columns if name != target]
        self.target = target

    def metainfo(self):
        """
        Build the dictionary `self.meta` with the following keys:

            'description': DataFrame indexed by column name, with the
                dtype and the number of NAs of every column.
            'categorical': columns with object or string dtype.
            'categorical_na': categorical columns with at least one NA.
            'numerical': every column that is not categorical.
            'numerical_na': numerical columns with at least one NA.
            'complete': columns without NAs.

        Returns the object itself, so calls can be chained.
        """
        descr = pd.DataFrame({'dtype': self.data.dtypes,
                              'NAs': self.data.isna().sum()})
        # is_string_dtype is True for the 'object' dtype and also for the
        # dedicated pandas string dtypes, which a plain comparison against
        # 'object' would send to the numerical group.
        is_categorical = descr['dtype'].map(
            pd.api.types.is_string_dtype).astype(bool)
        has_na = descr['NAs'] > 0

        def names(mask):
            return descr.loc[mask].index.tolist()

        self.meta = {
            'description': descr,
            'categorical': names(is_categorical),
            'categorical_na': names(is_categorical & has_na),
            'numerical': names(~is_categorical),
            'numerical_na': names(~is_categorical & has_na),
            'complete': names(~has_na),
        }
        return self

    def describe(self):
        """
        Print a summary of the dataset: dtypes found, number of features
        of each kind, how many of them contain NAs, and the target.
        The meta information is computed first if it is not available.
        """
        if not self.meta:
            self.metainfo()
        description = self.meta['description']
        print('Available types:', description['dtype'].unique())
        print('{} Features'.format(description.shape[0]))
        print('{} categorical features'.format(
            len(self.meta['categorical'])))
        print('{} numerical features'.format(
            len(self.meta['numerical'])))
        print('{} categorical features with NAs'.format(
            len(self.meta['categorical_na'])))
        print('{} numerical features with NAs'.format(
            len(self.meta['numerical_na'])))
        print('{} Complete features'.format(
            len(self.meta['complete'])))
        print('--')
        print('Target: {}'.format(
            self.target if self.target is not None else 'Not set'))

    def show(self, f_list=None):
        """
        Print the names in `f_list` (defaults to `self.features`) as a
        table of left-aligned columns, at most 80 characters wide,
        between two lines of dashes.

        An empty list prints only the two lines of dashes, and names too
        long to fit in 80 characters are printed one per line.
        """
        if f_list is None:
            f_list = self.features
        # Column labels are not necessarily strings (e.g. integers).
        names = [str(name) for name in f_list]
        max_width = 80
        # default=0 keeps max() from failing when there is nothing to show.
        max_length = max((len(name) for name in names), default=0)
        col_width = max_length + 1
        # At least one column per row, even if a single name exceeds
        # max_width; otherwise the number of rows is a division by zero.
        max_fields = max(1, max_width // col_width)

        # One character shorter than a full row: the last column's
        # trailing padding space is not underlined.
        rule = '-' * ((max_fields * max_length) + (max_fields - 1))
        print(rule)
        for start in range(0, len(names), max_fields):
            row = names[start:start + max_fields]
            print(''.join(name.ljust(col_width) for name in row))
        print(rule)

In [35]:
class Split:
    """
    This class represents a split from a dataset, it will assign
    each dataframe partition passed as argument to a different
    attribute of the class: 'train', 'test' (and 'validation').
    The class method 'split' performs the splitting of the dataframe
    passed, according to the parameters passed.

    Example:

        X, Y = Split.split(my_data, my_target)
        X.train, X.test    # features of each partition
        Y.train, Y.test    # target of each partition

    NOTE(review): `split` calls `train_test_split`, which is not imported
    by the import cell of this notebook. Presumably it is scikit-learn's
    `sklearn.model_selection.train_test_split`, to be confirmed; it has
    to be in scope before `split` is called.
    """
    # Attribute names, in the order the partitions must be given.
    split_name = ['train', 'test', 'validation']

    def __init__(self, splits):
        """
        Store every element of `splits` in the attribute whose name is
        in the same position of `split_name`. Passing more elements than
        names available raises an IndexError.
        """
        for index, partition in enumerate(splits):
            setattr(self, self.split_name[index], partition)

    @classmethod
    def split(cls,
              data,
              target,
              seed=1024,
              test_size=0.2,
              validation_split=False):
        """
        From an input dataframe, separate features from target, and
        produce splits (with or without validation).

        Arguments:
            data: dataframe with the features and the target.
            target: name of the target column in `data`.
            seed: passed as `random_state` to `train_test_split`.
            test_size: passed as `test_size` to `train_test_split`.
            validation_split: when truthy, the training partition is
                split once more, with the same `test_size` and `seed`,
                to obtain the validation partition.

        Returns:
            A tuple (X, Y) of instances of this class: X with the
            partitions of the features, and Y with the partitions of
            the target.

        Raises:
            ValueError: if `target` is not a column of `data`.
        """
        features = list(data)
        features.remove(target)
        X = pd.DataFrame(data, columns=features)
        Y = pd.DataFrame(data.loc[:, target])

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y,
            test_size=test_size, random_state=seed)
        X_splits = [X_train, X_test]
        Y_splits = [Y_train, Y_test]

        # Truthiness instead of `is True`, so flags like numpy booleans
        # are not silently ignored.
        if validation_split:
            X_train, X_val, Y_train, Y_val = train_test_split(
                X_train, Y_train,
                test_size=test_size, random_state=seed)
            X_splits = [X_train, X_test, X_val]
            Y_splits = [Y_train, Y_test, Y_val]

        # cls instead of the class name, so subclasses get instances of
        # their own type.
        return cls(X_splits), cls(Y_splits)

In [38]:
# Load the CSV from the local ./data folder, mark 'SalePrice' as the
# target column, and print the summary of feature types and NAs.
houses = Dataset('./data/houseprices_prepared.csv.gz')
houses.set_target('SalePrice')
houses.describe()

Available types: [dtype('int64') dtype('O') dtype('float64')]
80 Features
43 categorical features
37 numerical features
16 categorical features with NAs
0 numerical features with NAs
64 Complete features
--
Target: SalePrice


In [39]:
# List the categorical features that contain NAs. `houses.meta` is only
# available because `describe()` was called before (it runs `metainfo()`).
houses.show(houses.meta['categorical_na'])

-----------------------------------------------------------------------------
Alley        MasVnrType   BsmtQual     BsmtCond     BsmtExposure BsmtFinType1 
BsmtFinType2 Electrical   FireplaceQu  GarageType   GarageFinish GarageQual   
GarageCond   PoolQC       Fence        MiscFeature  
-----------------------------------------------------------------------------
