In [1]:
from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import seaborn as sns

# Select matplotlib's interactive "notebook" backend for this kernel session.
%matplotlib notebook

In [None]:
## Load BlackFriday.csv (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="../data/BlackFriday.csv")

In [None]:
## Peek at the first five rows
df.head(n=5)

In [None]:
## Lower-case every column name (reassigns df.columns in place)

df.columns = list(map(str.lower, df.columns))
df.columns

In [None]:
## Check dtypes
## (df.info() also prints the non-null count of every column)

df.info()

In [None]:
## Frequency of each product_category_2 value, ordered by category value
## (value_counts leaves NaN out by default)

df["product_category_2"].value_counts(sort=False).sort_index()

In [None]:
## Replace NaN in product_category_2 and product_category_3 with 0

df[["product_category_2", "product_category_3"]] = df[
    ["product_category_2", "product_category_3"]
].fillna(0)

In [None]:
## Re-inspect the non-null counts and dtypes of every column
df.info()

In [None]:
## Frequency of each occupation value, ordered by occupation value

df["occupation"].value_counts(sort=False).sort_index()

In [None]:
## Convert user_id to str and product categories to int

# user_id is used only as a grouping/merge key in this notebook, so store it
# as real strings. astype(object) would merely box the original values
# without converting them, contradicting the stated intent.
df["user_id"] = df["user_id"].astype(str)

# These columns must be free of NaN before the cast (NaN cannot be
# represented as int); the notebook fills them with 0 in an earlier cell.
cols_to_int = ["product_category_2", "product_category_3"]

for col in cols_to_int:
    df[col] = df[col].astype(int)

In [None]:
## Show the dtype of every column
df.dtypes

In [None]:
## Number of distinct user_id values

df["user_id"].unique().size

In [None]:
## Rows per user_id, most frequent user first
srs_transaction_count = df["user_id"].value_counts(sort=True, ascending=False)
srs_transaction_count.head(n=5)

In [None]:
## plot histogram for number of bought items per customer

plt.figure()
# sns.distplot is deprecated in seaborn; with kde=False it only drew a count
# histogram, which matplotlib provides directly on any seaborn version.
plt.hist(srs_transaction_count.values, bins=100)
plt.xlabel("bought items per customer")
plt.ylabel("number of customers")
plt.show()

In [None]:
## Rows per age group, ordered by age-group label
age_counts = df["age"].value_counts(sort=False).sort_index()
age_counts

In [None]:
## plot a barplot of the age counts
plt.figure()
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while keywords work on every seaborn version.
sns.barplot(x=age_counts.index, y=age_counts.values)
plt.xlabel("age group")
plt.ylabel("number of purchases")
plt.show()

In [None]:
## How many customers were per age group

# Reduce to one age label per user with the built-in GroupBy.max (instead of
# apply(max), which depends on pandas' deprecated aliasing of Python builtins),
# then count the users in each age group.
customer_counts = df.groupby("user_id")["age"].max().value_counts().sort_index()

In [None]:
## barplot of the number of customers per age group
plt.figure()
# Keyword x / y: positional data arguments fail on seaborn >= 0.12.
sns.barplot(x=customer_counts.index, y=customer_counts.values)
plt.xlabel("age group")
plt.ylabel("number of customers")
plt.show()

In [None]:
## How much money did the customers spend per age group

# GroupBy.sum is the vectorised aggregation; apply(sum) ran through pandas'
# deprecated aliasing of the Python builtin.
age_spent = df.groupby("age")["purchase"].sum()
age_spent

In [None]:
## barplot of the total purchase amount per age group
plt.figure()
# Keyword x / y: positional data arguments fail on seaborn >= 0.12.
sns.barplot(x=age_spent.index, y=age_spent.values)
plt.xlabel("age group")
plt.ylabel("total purchase amount")
plt.show()

In [None]:
## Average purchase amount per age group

# GroupBy.mean replaces apply(np.mean), which newer pandas flags as a
# deprecated way of requesting the same built-in aggregation.
age_avg = df.groupby("age")["purchase"].mean()
age_avg

In [None]:
## barplot of the average purchase amount per age group
plt.figure()
# Keyword x / y: positional data arguments fail on seaborn >= 0.12.
sns.barplot(x=age_avg.index, y=age_avg.values)
plt.xlabel("age group")
plt.ylabel("average purchase amount")
plt.show()

In [None]:
## Rows (transactions) per city category, largest category first
city_srs = df["city_category"].value_counts(sort=True, ascending=False)
city_srs

In [None]:
## barplot of the transaction count per city category
plt.figure()
# Keyword x / y: positional data arguments fail on seaborn >= 0.12.
sns.barplot(x=city_srs.index, y=city_srs.values)
plt.xlabel("city category")
plt.ylabel("number of transactions")
plt.show()

In [None]:
## Build one row per customer: total purchase plus the customer's attributes

user_groups = df.groupby("user_id")

# Total amount spent by each user.
srs_purchase_sum = user_groups["purchase"].sum().reset_index()

# One value per attribute and user, taken as the column-wise maximum within
# the user's rows. GroupBy.max states this explicitly; apply(max) only gave a
# column-wise result through pandas' deprecated aliasing of the builtin, and
# a plain max() over a DataFrame iterates its column labels instead.
srs_categories = user_groups[
    ["marital_status", "gender", "age", "city_category", "stay_in_current_city_years"]
].max().reset_index()

df_unique = srs_categories.merge(srs_purchase_sum, on="user_id").sort_values("age")
df_unique.head()

In [None]:
## pointplot of per-customer purchase totals by age group, split by gender
plt.figure()
sns.pointplot(data=df_unique, x="age", y="purchase", hue="gender")
plt.show()

In [None]:
## boxplot of per-customer spending, grouped by city category and gender

plt.figure()
sns.boxplot(
    data=df_unique,
    x="city_category",
    y="purchase",
    hue="gender",
    palette="PRGn",
)
plt.show()

In [None]:
## boxplot of per-customer spending, grouped by years in current city and gender

plt.figure()
sns.boxplot(
    data=df_unique,
    x="stay_in_current_city_years",
    y="purchase",
    hue="gender",
    palette="PRGn",
)
plt.show()

In [None]:
## pointplot of the mean purchase amount for each product_category_1 value
plt.figure()
sns.pointplot(data=df, x="product_category_1", y="purchase", estimator=np.mean)
plt.show()

In [None]:
## Share of all rows (in percent) and mean purchase per product_category_1

category_groups = df.groupby("product_category_1")

# GroupBy.size counts the rows of each category without calling a Python
# lambda per group; dividing by len(df) and scaling by 100 gives the
# percentage share, exactly as before.
srs_category_ratio = (category_groups.size() / len(df) * 100).reset_index()
srs_category_mean = category_groups["purchase"].mean().reset_index()

df_category_1 = srs_category_ratio.merge(srs_category_mean, on="product_category_1")
# Column order after the merge: group key, ratio, mean purchase.
df_category_1.columns = ["category", "purchase_ratio", "avg_purchase"]
df_category_1.head()

In [None]:
## pairwise relationships between purchase share and average purchase
sns.pairplot(data=df_category_1.loc[:, ["purchase_ratio", "avg_purchase"]])
plt.show()

In [None]:
## regression jointplot: purchase share vs. average purchase per category
# `height` is the current name of the figure-size parameter; the old `size`
# spelling was renamed in seaborn 0.9 and is not accepted by current releases.
sns.jointplot(x="purchase_ratio", y="avg_purchase", data=df_category_1, kind="reg", height=7)
plt.show()

In [None]:
## Number of purchases and mean purchase amount per product_id

product_groups = df.groupby("product_id")

# GroupBy.size / GroupBy.mean are the built-in aggregations; apply(len) and
# apply(np.mean) computed the same values through slower or deprecated paths.
product_count = product_groups.size().reset_index()
product_price = product_groups["purchase"].mean().reset_index()

df_product = product_count.merge(product_price, on="product_id")
# Column order after the merge: group key, row count, mean purchase.
df_product.columns = ["product_id", "purchase_count", "avg_price"]
df_product.head()

In [None]:
## pairwise relationships between purchase count and average price per product
sns.pairplot(data=df_product.loc[:, ["purchase_count", "avg_price"]])
plt.show()

In [None]:
## regression jointplot: purchase count vs. average price per product
# `height` is the current name of the figure-size parameter; the old `size`
# spelling was renamed in seaborn 0.9 and is not accepted by current releases.
sns.jointplot(x="purchase_count", y="avg_price", data=df_product, kind="reg", height=7)
plt.show()