In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objs as go

from datetime import date

In [None]:
# Load the purchase-history CSV from the Kaggle input directory.
DATA_PATH = '/kaggle/input/ecommerce-purchase-history-from-electronics-store/kz.csv'
data = pd.read_csv(DATA_PATH)

# Preview the first five rows (the default for head()).
data.head()

In [None]:
# Display the dtype pandas assigned to each column of the loaded frame.
data.dtypes

Cleaning the Data

In [None]:
# Use order_id as the row index; rebinding `data` instead of mutating in place.
data = data.set_index('order_id')

In [None]:
# Find the columns that contain at least one missing value, then show how many
# values are missing in each of them.
has_missing = data.isna().any()
null_columns = data.columns[has_missing]
data.loc[:, null_columns].isna().sum()

In [None]:
# Count distinct products and distinct users, then report both in one line.
n_unique_products, n_unique_users = (
    data[column].nunique() for column in ('product_id', 'user_id')
)
print(f'Number of unique users: {n_unique_users}. Number of unique products is: {n_unique_products}')

In [None]:
# Convert the event_time column to pandas datetimes so the `.dt` accessor
# can be used in the analysis cells below.
data = data.assign(event_time=pd.to_datetime(data['event_time']))

In [None]:
# Keep only the rows that have a category_code.
data = data.dropna(subset=['category_code'])

In [None]:
# Derive `category` from the LAST dot-separated segment of category_code.
# rsplit with n=1 splits only at the final dot, so for a code shaped like
# 'a.b.c' the result is 'c' (the most specific level, not the top-level one).
#
# `.str[-1]` is used instead of `expand=True)[1]` so that a code containing no
# dot is kept as its own category rather than becoming NaN (and so the cell
# does not raise KeyError if no code in the column contains a dot).
data['category'] = data['category_code'].str.rsplit('.', n=1).str[-1]

# The raw code is no longer needed once the category has been extracted.
data.drop(columns=['category_code'], inplace=True)

# Analyzing data

**Best performing brands**

In [None]:
# Ten brands with the highest summed price, largest first.
best_performing_brands = (
    data.groupby('brand')['price']
    .sum()
    .reset_index()
    .sort_values('price', ascending=False)
    .head(10)
)

In [None]:
# Vertical bar chart: one bar per brand, bar height = summed price.
fig = px.bar(
    data_frame=best_performing_brands,
    x='brand', y='price',
    title='Best performing brands',
    width=800, height=800,
)
fig.show()

In [None]:
# Ten brands with the most rows that have a non-null price, largest first.
# The count lands in the 'price' column, so it is renamed to 'times_sold'.
most_sold_brands = (
    data.groupby('brand')['price']
    .count()
    .reset_index()
    .sort_values('price', ascending=False)
    .head(10)
    .rename(columns={'price': 'times_sold'})
)

In [None]:
# Vertical bar chart: one bar per brand, bar height = number of times sold.
fig = px.bar(
    data_frame=most_sold_brands,
    x='brand', y='times_sold',
    title='Most sold brands',
    width=800, height=800,
)
fig.show()

In [None]:
# Ten categories with the most rows that have a non-null price, largest first.
# The count lands in the 'price' column, so it is renamed to 'times_sold'.
most_sold_categories = (
    data.groupby('category')['price']
    .count()
    .reset_index()
    .sort_values('price', ascending=False)
    .head(10)
    .rename(columns={'price': 'times_sold'})
)

In [None]:
# Vertical bar chart: one bar per category, bar height = number of times sold.
fig = px.bar(
    data_frame=most_sold_categories,
    x='category', y='times_sold',
    title='Most sold categories',
    width=800, height=800,
)
fig.show()

In [None]:
# Best time of day for purchases: the ten clock times with the highest summed price.
#
# The original note on this cell says many rows are stamped 1970-01-01 (around
# 12 AM), i.e. placeholder timestamps rather than real purchase times. Those
# rows are now excluded explicitly by year. The earlier approach dropped every
# time bucket whose total exceeded 900000, which depends on the scale of the
# data and would also discard any genuinely busy time.
valid_time_rows = data[data['event_time'].dt.year > 1970]

# NOTE(review): '%r' formats the full 12-hour clock time including seconds, so
# each bucket is a single second of the day; grouping by hour may be more
# informative - confirm the intended granularity.
best_time = (
    valid_time_rows
    .groupby(valid_time_rows['event_time'].dt.strftime('%r'))['price']
    .sum()
    .sort_values()
    .tail(10)  # ascending sort, so the tail holds the ten largest totals
)

In [None]:
# Horizontal bar chart of the summed price per time bucket in best_time.
fig = px.bar(
    data_frame=best_time,
    x="price",
    orientation='h',
    title="At what time most of the purchases were made",
    width=800, height=800,
)
fig.show()

In [None]:
# Monthly purchases: summed price per calendar month name, sorted ascending.
#
# Rows stamped in 1970 are excluded first. The time-of-day cell above notes
# that many rows carry a 1970-01-01 timestamp; left in, they would all be
# counted as January purchases and inflate that month.
dated_rows = data[data['event_time'].dt.year > 1970]
best_month = (
    dated_rows
    .groupby(dated_rows['event_time'].dt.strftime('%B'))['price']
    .sum()
    .sort_values()
)

In [None]:
# Horizontal bar chart of the summed price per month in best_month.
fig = px.bar(
    data_frame=best_month,
    x="price",
    orientation='h',
    title="In which months most of the purchases were made",
    width=800, height=800,
)
fig.show()

In [None]:
# How much money the top 20% of buyers (ranked by total spend) spent compared
# with the remaining 80% of buyers.
#
# Spend per user is aggregated and ranked once and then split in two, instead
# of repeating the same groupby/sort for each group.
spend_per_user = (
    data.groupby('user_id')['price']
    .sum()
    .reset_index()
    .sort_values('price', ascending=False)
)

# groupby drops missing user_ids, so the row count equals the number of unique users.
n_top_users = round(len(spend_per_user) * 0.2)

# head / iloc[n:] are complementary slices: every user lands in exactly one group.
most_active_users = spend_per_user.head(n_top_users)
least_active_users = spend_per_user.iloc[n_top_users:]

top_20_percent_buyers = most_active_users['price'].sum()
bottom_80_percent_buyers = least_active_users['price'].sum()

# One-row frame holding both totals, for the wide-form bar chart that follows.
last_data = pd.DataFrame(
    data={
        'most_active': [top_20_percent_buyers],
        'least_active': [bottom_80_percent_buyers],
    }
)

In [None]:
# Wide-form bar chart of last_data: the most_active and least_active totals
# are plotted from the frame's columns.
fig = px.bar(
    data_frame=last_data,
    title="20% of most active in comparison with other 80% of buyers",
    width=800, height=800,
)
fig.show()