## Data Imports 

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from multiprocessing import Pool

## Setup the Global Variables

In [2]:
num_processes = 3
alpha = 0.95
sample_size = 1000

## Load the Dataset

In [3]:
# Load the dataset
def load_data(file_path):
    return pd.read_csv(file_path)

## Function to Calculate the Confidence Interval

In [4]:
# Function to calculate confidence intervals

def calculate_ci(group, data):
    mean = data['Purchase'].mean()
    std_error = stats.sem(data['Purchase'])
    z_score = stats.norm.ppf(1 - alpha / 2)
    margin_of_error = z_score * std_error
    lower_bound = mean - margin_of_error
    upper_bound = mean + margin_of_error
    return group, mean, (lower_bound, upper_bound)

## Function to Map Age Groups to Numeric End-Points

In [5]:
# Custom function to map age group labels to numeric midpoints
def map_age_to_midpoint(age_group):
    age_midpoints = {'0-17': 8.5, '18-25': 21.5, '26-35': 30.5, '36-50': 43, '51+': 60}
    return age_midpoints.get(age_group)

## Data Cleaning and Summary Statistics

In [6]:
# Load the dataset using multiprocessing
with Pool(num_processes) as pool:
    df = pd.concat(pool.map(load_data, ['walmart_data.csv'] * num_processes))


In [None]:
# Calculate average spending per transaction for females and males

avg_female_expense = np.round(df[df['Gender'] == 'F']['Purchase'].mean(),4)
avg_male_expense = np.round(df[df['Gender'] == 'M']['Purchase'].mean(),4)

print("\nAverage Expenses by Gender:")
print(f"Average Female Expenses: {avg_female_expense}")
print(f"Average Male Expenses: {avg_male_expense}")



Average Expenses by Gender:
Average Female Expenses: 8734.5658
Average Male Expenses: 9437.526


In [None]:
# Calculate average spending per transaction for Married and Unmarried

avg_married_expense = np.round(df[df['Marital_Status'] == 1]['Purchase'].mean(),4)
avg_unmarried_expense = np.round(df[df['Marital_Status'] == 0]['Purchase'].mean(),4)

print("\nAverage Expenses by Gender:")
print(f"Average Married Expenses: {avg_married_expense}")
print(f"Average Unmarried Expenses: {avg_unmarried_expense}")


Average Expenses by Gender:
Average Married Expenses: 9261.1746
Average Unmarried Expenses: 9265.9076


Note:
--------


To determine which group (men or women / Married or Unmarried etc. ) is spending more, we can perform a post-hoc analysis or calculate the confidence intervals for the mean purchase amounts for these groups. 

The group with the higher mean in its confidence interval is spending more.

## Which Feature/ Category is most responsible for the maximum expenditure. 

In [None]:
# Create a new column 'Age_Group' based on the age midpoints
df['Age_Group'] = df['Age'].apply(map_age_to_midpoint)

# Define groups for parallel processing
groups = {
    'Gender': df.groupby('Gender'),
    'Marital_Status': df.groupby('Marital_Status'),
    'Age_Group': df.groupby('Age_Group')
}

# Use multiprocessing to calculate confidence intervals concurrently
with Pool(num_processes) as pool:
        results = pool.starmap(calculate_ci, [(group, data.sample(n=sample_size)) for group, data in groups.items()])


# Determine which group has the highest spending based on the upper bound of confidence intervals
max_spending_group = max(results, key=lambda x: x[2][1])

# Print the results
print("Spending Behavior by Groups:")
for group, mean, ci in results:
    print(f"{group}:")
    print(f"  Average: {mean:.2f}")
    print(f"  {100*(1-alpha):.2f}% CI: ({ci[0]:.2f}, {ci[1]:.2f})")

print(f"\nThe group with the highest spending is {max_spending_group[0]}.")


Spending Behavior by Groups:
Gender:
  Average: 9068.06
  5.00% CI: (9061.01, 9075.11)
Marital_Status:
  Average: 9056.57
  5.00% CI: (9049.51, 9063.63)
Age_Group:
  Average: 9154.63
  5.00% CI: (9148.78, 9160.49)

The group with the highest spending is Age_Group.


## Which Sub-Group is responsible for the maximum expenditure?

In [None]:
# Number of processes for multiprocessing
num_processes = 3

# Load the dataset using multiprocessing
with Pool(num_processes) as pool:
    df = pd.concat(pool.map(load_data, ['walmart_data.csv'] * num_processes))

# Create a new column 'Age_Group' based on the age midpoints
df['Age_Group'] = df['Age'].apply(map_age_to_midpoint)

# Define groups for parallel processing
groups = {
    'Gender': df,
    'Marital_Status': df,
    'Age_Group': df
}

# Initialize dictionaries to store sub-group results
sub_group_results = {}

# Use multiprocessing to calculate confidence intervals concurrently for each group and sub-group
with Pool(num_processes) as pool:
    for group, data in groups.items():
        sub_groups = data.groupby(group)
        sub_group_results[group] = pool.starmap(calculate_ci, [(sub_group, sub_data.sample(n=sample_size)) for sub_group, sub_data in sub_groups])

# Determine which sub-group is spending more within each category
max_spending_sub_groups = {}
for group, sub_group_results_list in sub_group_results.items():
    max_spending_sub_group = max(sub_group_results_list, key=lambda x: x[2][1])
    max_spending_sub_groups[group] = max_spending_sub_group

# Print the results
print("Spending Behavior by Sub-Groups:")
for group, max_spending_sub_group in max_spending_sub_groups.items():
    print(f"{group}:")
    print(f"  Sub-Group with Highest Spending: {max_spending_sub_group[0]}")
    print(f"  Average: {max_spending_sub_group[1]:.2f}")
    print(f"  {100*(1-alpha):.2f}% CI: ({max_spending_sub_group[2][0]:.2f}, {max_spending_sub_group[2][1]:.2f})")

Spending Behavior by Sub-Groups:
Gender:
  Sub-Group with Highest Spending: M
  Average: 9484.95
  5.00% CI: (9475.14, 9494.76)
Marital_Status:
  Sub-Group with Highest Spending: 1
  Average: 9378.46
  5.00% CI: (9368.36, 9388.57)
Age_Group:
  Sub-Group with Highest Spending: 30.5
  Average: 9021.11
  5.00% CI: (9011.28, 9030.95)
