# Data Processing

In this notebook we follow `1.0-theberling_data_explore_2021` and refactor the data processing steps it contains into reusable functions and classes.

## Imports

In [1]:
import pandas as pd
from pathlib import Path

## Data Loader Class

In [2]:
class DataIO(object):
    """Encapsulates data loading functionality.

    Attributes:
        input_folder: Folder that csv file names are resolved against,
            stored exactly as passed to the constructor.
    """

    def __init__(self, input_folder):
        """Set the input folder on instantiation.

        Args:
            input_folder: Path (string or path-like) of the folder that
                holds the csv files.
        """
        self.input_folder = input_folder

    def load_dataframe(self, filename, **read_csv_kwargs):
        """Load the csv with the given file name from the input folder
        as a DataFrame.

        Args:
            filename: Name of the csv file inside ``input_folder``.
            **read_csv_kwargs: Optional keyword arguments forwarded unchanged
                to ``pandas.read_csv`` (for example ``usecols``, ``dtype`` or
                ``nrows``). With none given, the file is read with the
                pandas defaults.

        Returns:
            Whatever ``pandas.read_csv`` returns for the resolved path; a
            DataFrame unless the forwarded keywords request otherwise.
        """
        # Path handles a trailing separator on the folder and joins portably.
        file_path = Path(self.input_folder) / filename
        return pd.read_csv(file_path, **read_csv_kwargs)

### Instantiate Loader Object

In [3]:
# Relative path of the folder holding the raw csv files handed to the loader.
input_folder = '../../data/raw/MDataFiles_Stage1/'
loader = DataIO(input_folder=input_folder)

## Load in the data

In [4]:
# Read the three raw csv files, in this order, through the shared loader.
df_regseasonDets, df_rankings, df_seeds = (
    loader.load_dataframe(csv_name)
    for csv_name in (
        'MRegularSeasonDetailedResults.csv',
        'MMasseyOrdinals.csv',
        'MNCAATourneySeeds.csv',
    )
)

In [5]:
# Split the regular-season columns into three groups by the first letter of
# the column name: 'W...' columns, 'L...' columns, and everything else.
columns = df_regseasonDets.columns
winner_columns = [name for name in columns if name.startswith('W')]
loser_columns = [name for name in columns if name.startswith('L')]
game_columns = [name for name in columns if name[0] not in ('L', 'W')]

# Sanity checks: the three groups must cover every column exactly once.
assert len(columns) == sum(
    len(group) for group in (winner_columns, loser_columns, game_columns)
), "You dropped columns."
assert set(winner_columns).isdisjoint(loser_columns), "Winner and loser columns overlap."
assert set(winner_columns).isdisjoint(game_columns), "Winner and game columns overlap."
assert set(game_columns).isdisjoint(loser_columns), "Game and loser columns overlap."