In [None]:
import pandas as pd
import ast
from collections import defaultdict
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from scipy.stats import norm
import gender_guesser.detector as gender
# Notebook-level gender_guesser Detector instance; no cell visible in this
# part of the notebook uses it.
predictor = gender.Detector()

In [None]:
# Load the dataset that the plotting and analysis cells in this notebook read.
# The path is relative to the notebook's working directory.
walmart = pd.read_csv(filepath_or_buffer='walmart_data_new.csv')

In [None]:
# TODO: re-read this cell and recreate the figure once more.
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm

# Figure: histograms (with KDE) of three coded features plus the Purchase
# density overlaid with a fitted normal curve, on a 2x2 grid.
sns.set(style="whitegrid")

# The final margins come from tight_layout(rect=...) at the end of the cell,
# so no manual subplots_adjust is needed here.
fig, axis = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
fig.suptitle("Distribution of Selected Walmart Features", fontsize=16, color="#333333")

# Plot distribution plots for each specified column
sns.histplot(walmart['Occupation'], kde=True, ax=axis[0, 0], color="#900000", bins=20)
axis[0, 0].set_title("Occupation Distribution")
axis[0, 0].set_xlabel("Occupation")

# The column is cast to int before plotting.
# NOTE(review): this raises if the column holds non-numeric labels such as
# "4+" — confirm the CSV stores plain integers.
sns.histplot(walmart['Stay_In_Current_City_Years'].astype(int), kde=True, ax=axis[0, 1], color="#900000", bins=6)
axis[0, 1].set_title("Stay in Current City (Years)")
axis[0, 1].set_xlabel("Years")

sns.histplot(walmart['Marital_Status'], kde=True, ax=axis[1, 0], color="#900000", bins=3)
axis[1, 0].set_title("Marital Status Distribution")
axis[1, 0].set_xlabel("Marital Status")

# Plot Purchase distribution with normal fit
purchase_ax = axis[1, 1]
sns.histplot(walmart['Purchase'], kde=False, stat="density", ax=purchase_ax, color="#900000", bins=30)
mu, sigma = norm.fit(walmart['Purchase'])  # maximum-likelihood mean / std

# Evaluate the fitted pdf across the x-range the histogram already occupies.
xmin, xmax = purchase_ax.get_xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, sigma)

# Attach the label to the line itself instead of passing a positional label
# list to legend(): the positional form pairs labels with artists implicitly
# by creation order and is easy to get wrong.
purchase_ax.plot(
    x, p, 'k--', linewidth=2,
    label='Normal Fit (μ = {:.2f}, σ = {:.2f})'.format(mu, sigma),
)
purchase_ax.set_title("Purchase Distribution with Normal Fit")
purchase_ax.set_xlabel("Purchase Amount")
purchase_ax.legend()

# Display the mean and std
print("The mu (mean) is {:.2f} and sigma (standard deviation) is {:.2f} for the curve".format(mu, sigma))

# Show plot; rect leaves headroom for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Figure: one panel per column on a 4x2 grid — value counts drawn as bars for
# every column except Purchase, which is drawn as a 30-bin histogram.
fig, axes = plt.subplots(4, 2, figsize=(20, 24))
fig.suptitle("Bar Graphs", fontsize=20)

# Define columns to plot
columns = [
    'Gender', 'Age', 'Occupation', 'City_Category',
    'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category', 'Purchase'
]

# One colour per column, sampled from the Spectral colormap.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# matplotlib 3.7 and removed in 3.9; plt.get_cmap(name, lut) works on both
# older and current releases.
spectral = plt.get_cmap('Spectral', len(columns))
colors = [spectral(i) for i in range(len(columns))]

# Loop through each subplot
for ax, column, color in zip(axes.flatten(), columns, colors):
    # Same test as before: anything that is not a numeric Purchase column is
    # treated as categorical.
    if walmart[column].dtype == 'object' or column != 'Purchase':
        counts = walmart[column].value_counts()  # ordered by frequency, descending
        labels = counts.index.astype(str)
        ax.bar(labels, counts.values, color=color)
        ax.set_title(column, fontsize=14)
        ax.set_ylabel('Count', fontsize=12)
        # Pin tick positions before relabelling so the rotated labels line up
        # with the bars.
        ax.set_xticks(np.arange(len(counts.index)))
        ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=10)
    else:
        ax.hist(walmart[column], bins=30, color=color)
        ax.set_title(column, fontsize=14)
        ax.set_ylabel('Frequency', fontsize=12)

# rect leaves headroom for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Interactive 4x2 grid of Plotly histograms, one per column.
# Each entry pairs a DataFrame column with the title shown above its panel;
# panels are filled left-to-right, top-to-bottom in this order.
panels = [
    ('Gender', "Gender"),
    ('Age', "Age"),
    ('Occupation', "Occupation"),
    ('City_Category', "City Category"),
    ('Stay_In_Current_City_Years', "Stay In Current City Years"),
    ('Marital_Status', "Marital Status"),
    ('Product_Category', "Product Category"),
    ('Purchase', "Purchase"),
]
grid_cols = 2

fig = make_subplots(
    rows=4, cols=grid_cols,
    subplot_titles=tuple(title for _, title in panels),
)

for position, (column, _) in enumerate(panels):
    # Plotly grid coordinates are 1-based.
    row_idx, col_idx = divmod(position, grid_cols)
    fig.add_trace(go.Histogram(x=walmart[column]), row=row_idx + 1, col=col_idx + 1)

# Size, title, and hidden legend applied in a single layout update.
fig.update_layout(height=1200, width=1000, title_text="Count Plots", showlegend=False)
fig.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Figure: Purchase by Gender, split by a second demographic (hue) per panel.
sns.set(style="whitegrid", color_codes=True)

# Kept as a notebook-level name; the panels below size their own palettes.
spectral_palette = sns.color_palette("Spectral", n_colors=5)

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))
fig.subplots_adjust(hspace=0.4, wspace=0.3)

# (hue column, panel title), in left-to-right, top-to-bottom order.
hue_panels = [
    ('Age', 'Purchase by Gender and Age'),
    ('City_Category', 'Purchase by Gender and City Category'),
    ('Marital_Status', 'Purchase by Gender and Marital Status'),
    ('Stay_In_Current_City_Years', 'Purchase by Gender and Stay Duration'),
]

for ax, (hue_column, panel_title) in zip(axs.flat, hue_panels):
    # A fixed five-colour list makes seaborn cycle (repeat) colours whenever a
    # hue column has more than five levels, so two groups share a colour.
    # Sizing the palette to the hue's level count gives every group its own.
    # NOTE(review): Age presumably has more than five brackets — confirm.
    hue_palette = sns.color_palette("Spectral", n_colors=walmart[hue_column].nunique())
    sns.boxplot(data=walmart, x='Gender', y='Purchase', hue=hue_column, ax=ax, palette=hue_palette)
    ax.set_title(panel_title)

# Only the stay-duration panel gets a custom legend position and title.
axs[1, 1].legend(loc='upper left', title='Stay In Years')

# Super title for the entire figure
fig.suptitle("Boxplots of Purchase Behavior by Gender and Various Demographics", fontsize=16, y=1.02)

plt.show()

In [None]:
# Figures: Purchase boxplots against each attribute — the first six on a 3x2
# grid, the seventh on its own figure.
attrs = ['Gender', 'Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category']
sns.set(color_codes = True)
#Use seaborn here because it is easy for boxplots and histograms rather than Matplotlib

grid_attrs, last_attr = attrs[:-1], attrs[-1]

# The figure is 20 in tall with tight_layout, replacing the previous
# subplots_adjust(top=1.3) on a 16 in figure. `top` is a fraction of the
# figure height, so 1.3 put the axes above the canvas and the plot only
# displayed in full when the backend cropped to the tight bounding box.
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(20, 20))
for ax, attr in zip(axs.flat, grid_attrs):
    sns.boxplot(data=walmart, y='Purchase', x=attr, ax=ax)
    ax.set_title(f"Purchase vs {attr}", pad=12, fontsize=13)
fig.tight_layout()
plt.show()

# Last attribute on its own figure, titled like the grid panels.
last_fig, last_ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=walmart, y='Purchase', x=last_attr, ax=last_ax)
last_ax.set_title(f"Purchase vs {last_attr}", pad=12, fontsize=13)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Bootstrap the sampling distribution of mean total spend per customer,
# separately for male and female customers.

# Grouping by user and gender to calculate total amount spent per customer
amt_df = walmart.groupby(['User_ID', 'Gender'])[['Purchase']].sum()
avg_amt_df = amt_df.reset_index()

# Create gender-specific datasets (one row per customer)
male_df = avg_amt_df[avg_amt_df['Gender'] == 'M']
female_df = avg_amt_df[avg_amt_df['Gender'] == 'F']

# Average spend by gender
male_avg = male_df['Purchase'].mean()
female_avg = female_df['Purchase'].mean()

print("Average amount spent by Male customers: {:.2f}".format(male_avg))
print("Average amount spent by Female customers: {:.2f}".format(female_avg))

# Sampling parameters
male_sample_size = 3000
female_sample_size = 1500
num_repitions = 1000
male_means = []
female_means = []

# A seeded generator makes the resampling, and therefore the histograms,
# reproducible on every run. One generator is shared so successive draws differ.
bootstrap_rng = np.random.RandomState(42)

# Bootstrapping to estimate sampling distribution of the mean:
# resample customers with replacement and record the mean each time.
for _ in range(num_repitions):
    male_mean = male_df['Purchase'].sample(
        male_sample_size, replace=True, random_state=bootstrap_rng
    ).mean()
    female_mean = female_df['Purchase'].sample(
        female_sample_size, replace=True, random_state=bootstrap_rng
    ).mean()

    male_means.append(male_mean)
    female_means.append(female_mean)

# Plotting histograms
fig, axis = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
axis[0].hist(male_means, bins=100, color='steelblue')
axis[1].hist(female_means, bins=100, color='lightcoral')
# Titles are built from the constants so they cannot drift from the sizes used.
axis[0].set_title("Male - Distribution of Means, Sample Size: {}".format(male_sample_size))
axis[1].set_title("Female - Distribution of Means, Sample Size: {}".format(female_sample_size))
plt.show()


# Mean and standard deviation of the per-customer totals themselves
# (not of the bootstrap means).
print("\nMale - Sample mean: {:.2f}, Sample std: {:.2f}".format(male_df['Purchase'].mean(), male_df['Purchase'].std()))
print("Female - Sample mean: {:.2f}, Sample std: {:.2f}".format(female_df['Purchase'].mean(), female_df['Purchase'].std()))