Given the structure of the 'Category Name' column, we can easily classify liquors into types. This lends itself well to one-hot encoding the categories for quick data retrieval and analysis.

In [1]:
import pandas as pd
import glob


#collect the paths of every version 3 csv in the working directory
files_list = glob.glob('V3*.csv')

#load each csv, keyed by its file path
data_frames = dict(zip(files_list, map(pd.read_csv, files_list)))


Here is the function we will use to generate indicator columns from the 'Category Name' column. For instance, the 'Imports' column will show a 1 if the 'Category Name' contains 'IMPORTED', 'SCOTCH', 'JAMAICA', 'BARBADOS', or 'JAPANESE' (matched case-insensitively) anywhere in the string, and a 0 otherwise. All of these indicator columns were created by carefully examining the value counts of the various categories appearing in the 'Category Name' column.

In [2]:
# Maps each indicator column to the regex that must match 'Category Name'
# (case-insensitively) for that column to be 1. Insertion order is the order
# in which the columns are added to the DataFrame.
_CATEGORY_PATTERNS = {
    'Imports': 'IMPORTED|SCOTCH|JAMAICA|BARBADOS|JAPANESE',
    'Vodka': 'VODKA',
    'Whisky': 'WHISKIES|WHISKY|BOURBON',
    'Rum': 'RUM',
    'Liqueur': 'LIQUEUR',
    'Tequila': 'TEQUILA',
    'Gin': 'GIN',
    'Brandy': 'BRANDIES',
    'Schnapps': 'SCHNAPPS',
    'Scotch': 'SCOTCH',
    'Specialty': 'SPECIALTY',
    'Special Order': 'SPECIAL ORDER',
}


def create_columns(data):
    """Add 0/1 liquor-type indicator columns derived from 'Category Name'.

    One integer column is added per entry of ``_CATEGORY_PATTERNS``; a row
    gets 1 when its 'Category Name' contains the entry's pattern (substring
    match, case-insensitive) and 0 otherwise. A row may be flagged in several
    columns at once (e.g. 'SCOTCH' sets 'Imports' and 'Scotch').

    Rows whose 'Category Name' is missing get 0 in every column.

    Args:
        data: DataFrame with a 'Category Name' column of strings.

    Returns:
        The same DataFrame object; the new columns are added in place.

    Raises:
        KeyError: if ``data`` has no 'Category Name' column.
    """
    category = data['Category Name']
    for column, pattern in _CATEGORY_PATTERNS.items():
        # na=False: without it a missing category yields NaN from
        # str.contains, which cannot be cast to int.
        matches = category.str.contains(pattern, case=False, na=False)
        data[column] = matches.astype(int)
    return data


After creating the function, we can apply it to every file in our dataset. Then we write the resulting DataFrames back out as CSV files in our directory.

In [3]:
#add the category indicator columns to every loaded dataframe
for file in list(data_frames):
    dataframe = data_frames[file]
    data_frames[file] = create_columns(dataframe)

In [None]:
#write each augmented dataframe out as a version 4 file
for file, dataframe in data_frames.items():
    # Swap only the leading 'V3' (every path came from the 'V3*.csv' glob), so
    # a second 'V3' later in the file name is left untouched.
    new_filename = file.replace('V3', 'V4', 1)
    dataframe.to_csv(new_filename, index=False)