In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

Define paths

In [None]:
# Folder holding the raw CSV exports; it is addressed relative to the parent directory.
DATASET_DIR = 'dataset'

data_1_filename, data_2_filename = 'transactions_1.csv', 'transactions_2.csv'

# Both paths share one layout: <parent dir>/<DATASET_DIR>/<file name>.
data_1_filepath, data_2_filepath = (
    os.path.join('..', DATASET_DIR, csv_name)
    for csv_name in (data_1_filename, data_2_filename)
)

Load data

In [None]:
# Both files are read with identical options: the first CSV column becomes the row index.
csv_read_options = {"index_col": 0}
df_data_1 = pd.read_csv(data_1_filepath, **csv_read_options)
df_data_2 = pd.read_csv(data_2_filepath, **csv_read_options)

General data overview

In [None]:
# Preview the first rows of the first transactions file.
df_data_1.head()

In [None]:
# (rows, columns) of the first transactions file.
df_data_1.shape

In [None]:
# Preview the first rows of the second transactions file.
df_data_2.head()

In [None]:
# (rows, columns) of the second transactions file.
df_data_2.shape

In [None]:
# Replace the "date" column of each frame with its parsed pandas datetime version,
# so the .dt accessor and date comparisons can be used on it.
df_data_1["date"] = df_data_1["date"].pipe(pd.to_datetime)
df_data_2["date"] = df_data_2["date"].pipe(pd.to_datetime)

In [None]:
# Latest date in file 1 next to the earliest date in file 2, shown as a tuple
# (presumably to see whether the two files overlap in time).
df_data_1['date'].max(), df_data_2['date'].min()

In [None]:
# Date coverage of the two source files: one stacked histogram with 20 bins,
# one colour per file.
plt.hist(
    [df_data_1['date'], df_data_2['date']],
    bins=20,
    stacked=True,
    label=[data_1_filename, data_2_filename],
    color=['blue', 'orange']
)
# A figure should be readable on its own, so name it and label both axes.
plt.title('Transactions over time by source file')
plt.xlabel('Transaction date')
plt.ylabel('Number of transactions')
plt.legend()
plt.xticks(rotation=90)  # vertical tick labels keep the dates readable
plt.show()  # render the figure instead of echoing the xticks return value

In [None]:
# Stack the two files row-wise into one frame; each frame keeps its own index labels.
df_data = pd.concat(objs=[df_data_1, df_data_2], axis=0)

In [None]:
# Number of rows that exactly repeat an earlier row (all columns compared; the index is ignored).
# Assumption: these duplicates are not data errors - a given customer may simply have
# bought several cars - so they are kept.
df_data.duplicated().sum()

In [None]:
# Earliest and latest transaction date in the combined data.
df_data['date'].min(), df_data['date'].max()

In [None]:
# Distinct customer ids present in the combined data.
df_data['customer_id'].unique()

In [None]:
# Smallest and largest customer id among the distinct ids.
unique_customer_ids = df_data['customer_id'].unique()
unique_customer_ids.min(), unique_customer_ids.max()

### Solve tasks

Task 1: `Create an ordered (descending) plot that shows the total number of transactions per customer from the most active customer to the least active one.`

In [None]:
# One row per customer with its number of transactions, most active customer first.
df_transactions_per_customer = (
    df_data
    .groupby("customer_id")
    .size()
    .reset_index(name="transaction_count")
    .sort_values(by="transaction_count", ascending=False)
)

In [None]:
# Task 1: one bar per customer, in the (descending) order of the sorted frame.
# The ids are cast to str so matplotlib treats them as categories and keeps that
# order instead of placing the bars by numeric id.
plt.bar(df_transactions_per_customer["customer_id"].astype(str), df_transactions_per_customer["transaction_count"])
# A figure should be readable on its own, so name it and label both axes.
plt.title("Transactions per customer (most to least active)")
plt.xlabel("Customer id")
plt.ylabel("Number of transactions")
plt.xticks(rotation=90)  # vertical tick labels keep the ids readable
plt.show()  # render the figure instead of echoing the BarContainer repr

Task 2: `Given any product ID, create a plot to show its transaction frequency per month for the year 2018.`

In [None]:
# Keep only the transactions whose date falls in calendar year 2018.
df_data_2018 = df_data[df_data["date"].dt.year.eq(2018)]

In [None]:
# Distinct product ids in the combined data (all years, not only 2018), as a plain list;
# presumably the candidates for product_id.
df_data['product_id'].unique().tolist()

In [None]:
# Product whose 2018 monthly transaction counts are plotted for task 2.
product_id = "Opel"

In [None]:
# Every calendar month of 2018, so that months without a transaction appear as 0
# instead of silently vanishing from the x-axis.
months_2018 = pd.period_range("2018-01", "2018-12", freq="M")

# .copy() makes product_data an independent frame; adding a column to a bare
# filtered slice of df_data_2018 triggers SettingWithCopyWarning.
product_data = df_data_2018[df_data_2018["product_id"] == product_id].copy()
product_data["year_month"] = product_data["date"].dt.to_period("M")

# Transactions per month, padded to the full year; the column names stay
# "year_month" and "transaction_count".
monthly_transactions = (
    product_data.groupby("year_month")
    .size()
    .reindex(months_2018, fill_value=0)
    .rename_axis("year_month")
    .reset_index(name="transaction_count")
)

plt.plot(monthly_transactions["year_month"].astype(str), monthly_transactions["transaction_count"], marker="o", color="skyblue")
# A figure should be readable on its own, so name it and label both axes.
plt.title(f"Monthly transactions in 2018 for product {product_id}")
plt.xlabel("Month")
plt.ylabel("Number of transactions")
plt.xticks(rotation=45)
plt.show()  # render the figure instead of echoing the Line2D repr

Task 4: `At any time, what are the top 5 products that drove the highest sales over the last six months? Do you see a seasonality effect in this data set?`

In [None]:
# Reference date for task 4, given as a date string; the six-month window ends on this date.
chosen_date = '2019-01-01'

In [None]:
# End of the window as a UTC timestamp. chosen_date is overwritten here, so the
# tz check keeps the cell re-runnable: calling tz_localize on a Timestamp that is
# already tz-aware raises TypeError.
# NOTE(review): localising to UTC presumes the "date" column is timezone-aware - confirm.
chosen_date = pd.Timestamp(chosen_date)
if chosen_date.tzinfo is None:
    chosen_date = chosen_date.tz_localize('UTC')

# Window = [chosen_date - 6 calendar months, chosen_date], both ends inclusive.
six_months_earlier = chosen_date - pd.DateOffset(months=6)
df_data_last_six_months = df_data[(df_data["date"] >= six_months_earlier) & (df_data["date"] <= chosen_date)]

# Sales are measured as the number of transactions per product; keep the five largest.
product_sales = df_data_last_six_months.groupby("product_id").size().reset_index(name="transaction_count")
top_products = product_sales.sort_values(by="transaction_count", ascending=False).head(5)

In [None]:
# Task 4: transaction counts of the five best-selling products in the six-month window.
plt.bar(top_products["product_id"], top_products["transaction_count"], color="skyblue")
# A figure should be readable on its own, so name it and label both axes.
plt.title("Top 5 products by number of transactions (last six months)")
plt.xlabel("Product id")
plt.ylabel("Number of transactions")
plt.show()  # render the figure instead of echoing the BarContainer repr