In [31]:
import pandas as pd

In [32]:
def read_file(filepath, column_names, delimiter='\t', index_col=0):
    """
    Read a delimited text file into a DataFrame with the given column names.

    The file is read as header-less: because explicit names are supplied,
    every line is treated as data and the columns are labelled with
    ``column_names`` in order.

    Parameters:
    filepath (str): The file path to the delimited text file.
    column_names (list): A list of column names to assign to the DataFrame.
    delimiter (str): Field separator used in the file. Defaults to a tab,
        which is the format this function originally hard-coded.
    index_col (int, str or None): Column to use as the DataFrame index.
        Defaults to 0, meaning the first name in ``column_names`` becomes the
        index instead of a regular column. Pass None to keep every column as
        a regular column.

    Returns:
    pd.DataFrame: A DataFrame containing the data from the file, labelled
        with the specified column names and indexed according to ``index_col``.
    """
    # The defaults reproduce the previous fixed behaviour (tab-separated,
    # first column as index), so existing two-argument callers are unaffected.
    return pd.read_csv(
        filepath,
        delimiter=delimiter,
        names=column_names,
        index_col=index_col,
    )

In [33]:
def map(filepath, output_path='./purchases_mapper.txt'):
    """
    Map function for a MapReduce job that processes purchase data.

    This function reads purchase data from a file, sorts it by store, writes
    the (store, cost) pairs to a tab-separated text file, and reads that file
    back as the mapped result.

    NOTE: the function name shadows Python's built-in ``map``; it is kept
    unchanged so existing callers continue to work.

    Args:
        filepath (str): The path to the input purchase data file. It is read
            with the columns 'date', 'time', 'store', 'product', 'cost',
            'payment', in that order.
        output_path (str): Where the mapped (store, cost) pairs are written.
            Defaults to './purchases_mapper.txt'.

    Returns:
        pandas.DataFrame: The mapped purchase data as read back from
            ``output_path``. Because ``read_file`` uses the first column as
            the index, 'store' is the index and 'cost' is the only column.
    """
    purchases = read_file(
        filepath=filepath,
        column_names=['date', 'time', 'store', 'product', 'cost', 'payment'],
    )

    # Sorting by store makes all records of one store contiguous in the output;
    # only the key ('store') and the value ('cost') are kept.
    store_costs = purchases.sort_values(by='store', ascending=True)[['store', 'cost']]

    # Written without header or index so the file holds bare "store<TAB>cost" lines.
    store_costs.to_csv(output_path, sep='\t', index=False, header=False)

    # Re-read the file that was just written so the returned frame reflects
    # exactly what is persisted on disk.
    return read_file(filepath=output_path, column_names=['store', 'cost'])

In [34]:
# Run the map step on the raw purchase records; as a side effect map() also
# writes the (store, cost) pairs to ./purchases_mapper.txt.
mapped_purchases = map('./purchases.txt')
# Bare expression left as the last line of the cell so the result is displayed.
mapped_purchases

Unnamed: 0_level_0,cost
store,Unnamed: 1_level_1
Albuquerque,159.06
Albuquerque,123.65
Albuquerque,214.49
Albuquerque,374.30
Albuquerque,178.57
...,...
Winston–Salem,228.68
Winston–Salem,285.34
Winston–Salem,219.16
Winston–Salem,41.43


In [35]:
def reduce(mapped_data, output_path='./purchases_reducer.txt'):
    """
    Reduce function for a MapReduce job that processes mapped purchase data.

    This function groups the mapped purchase data by 'store', sums 'cost' for
    each store, writes the totals to a tab-separated text file, and reads that
    file back as the reduced result.

    Args:
        mapped_data (pandas.DataFrame): Mapped purchase data with a 'cost'
            column and a 'store' key. 'store' is passed to ``groupby`` by
            name, so it may be either a column or the name of the index.
        output_path (str): Where the per-store totals are written.
            Defaults to './purchases_reducer.txt'.

    Returns:
        pandas.DataFrame: The reduced purchase data as read back from
            ``output_path``. Because ``read_file`` uses the first column as
            the index, 'store' is the index and 'cost' (the per-store total)
            is the only column.
    """
    # reset_index() turns the group key back into a regular 'store' column so
    # it is written as the first field of each output line.
    store_totals = mapped_data.groupby('store').agg({'cost': 'sum'}).reset_index()

    # Written without header or index so the file holds bare "store<TAB>total" lines.
    store_totals.to_csv(output_path, sep='\t', index=False, header=False)

    # Re-read the file that was just written so the returned frame reflects
    # exactly what is persisted on disk.
    return read_file(filepath=output_path, column_names=['store', 'cost'])

In [36]:
# Run the reduce step on the mapped data; as a side effect reduce() also
# writes the per-store totals to ./purchases_reducer.txt.
reduced_purchases = reduce(mapped_data=mapped_purchases)
# Bare expression left as the last line of the cell so the result is displayed.
reduced_purchases

Unnamed: 0_level_0,cost
store,Unnamed: 1_level_1
Albuquerque,10052311.42
Anaheim,10076416.36
Anchorage,9933500.40
Arlington,10072207.97
Atlanta,9997146.70
...,...
Tulsa,10064955.90
Virginia Beach,10086553.50
Washington,10139363.39
Wichita,10083643.21
