In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from operator import attrgetter
import matplotlib.colors as mcolors

In [None]:
# Load the Online Retail II transactions.
#
# The dtype overrides must use the column names as they appear in the raw CSV.
# Later cells rename 'Customer ID' -> 'CustomerID' and group on 'Invoice', so
# those are the raw names; the previous keys ('CustomerID', 'InvoiceID') matched
# no column and the string overrides never took effect.
#
# `infer_datetime_format` is omitted: it is deprecated in pandas 2.x, where
# consistent-format inference is already the default for `parse_dates`.
df = pd.read_csv(
    '../input/online-retail-ii-uci/online_retail_II.csv',
    dtype={'Customer ID': str, 'Invoice': str},
    parse_dates=['InvoiceDate'],
)
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Summarize the frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Normalize the customer column name (drop the embedded space) for easier access.
df = df.rename(columns={'Customer ID': 'CustomerID'})

In [None]:
# (rows, columns) of the frame before rows without a customer are dropped.
df.shape

In [None]:
# Keep only rows that can be attributed to a customer.
df = df.dropna(subset=['CustomerID'])

In [None]:
# Number of distinct invoices (orders) per customer.
n_orders = df.groupby('CustomerID')['Invoice'].nunique()
# Share of customers with more than one order:
# customers having > 1 distinct invoice divided by the number of distinct customers.
mult_orders_perc = n_orders.gt(1).sum() / df['CustomerID'].nunique()
# The message below is Turkish for "X% of customers placed more than one order."
print(f"Müşterilerin %{100 * mult_orders_perc:.2f}'i birden fazla sipariş vermiştir.")

In [None]:
# Histogram (with a KDE overlay) of the number of distinct orders per customer.
#
# `n_orders` is a standalone Series indexed by customer, not a column of `df`,
# so it is passed directly as `x` without a `data=` frame.
# `displot` is figure-level and returns a FacetGrid rather than an Axes, hence
# the name `grid`; `FacetGrid.set` forwards the labels to the underlying axes.
grid = sns.displot(x=n_orders, kde=True, kind='hist')
grid.set(
    title='Distribution of number of orders per customer',
    xlabel='# of orders',
    ylabel='# of customers',
);

## Cohort Analysis

In [None]:
# Reduce the data to one row per (customer, invoice, timestamp): the line-item
# detail is not needed for the cohort analysis.
df = (
    df.loc[:, ['CustomerID', 'Invoice', 'InvoiceDate']]
      .drop_duplicates()
)

In [None]:
# Month in which each order was placed.
df['order_month'] = df['InvoiceDate'].dt.to_period('M')
# Cohort = month of the customer's earliest invoice, broadcast to all of that
# customer's rows via transform.
df['cohort'] = (
    df.groupby('CustomerID')['InvoiceDate']
      .transform('min')
      .dt.to_period('M')
)

In [None]:
# Count the distinct customers active in each (cohort, order month) pair.
df_cohort = (
    df.groupby(['cohort', 'order_month'])
      .agg(n_customers=('CustomerID', 'nunique'))
      .reset_index()
)
# Subtracting the two monthly Period columns yields offset objects; their `n`
# attribute is taken as the number of months elapsed since the cohort month.
df_cohort['period_number'] = (
    df_cohort['order_month'] - df_cohort['cohort']
).map(attrgetter('n'))

In [None]:
# Reshape to a cohort x period matrix of active-customer counts
# (rows: cohort month, columns: periods since the cohort month).
cohort_pivot = pd.pivot_table(
    df_cohort,
    index='cohort',
    columns='period_number',
    values='n_customers',
)

In [None]:
# Cohort size is taken from the first column of the pivot (the lowest period number).
cohort_size = cohort_pivot.iloc[:, 0]
# Divide every row by its cohort size to turn counts into retention rates.
retention_matrix = cohort_pivot.div(cohort_size, axis='index')

In [None]:
with sns.axes_style("white"):
    # Two panels sharing the y axis: a narrow one on the left for cohort sizes
    # and a wide one on the right for the retention heatmap.
    fig, ax = plt.subplots(
        1, 2,
        figsize=(12, 8),
        sharey=True,
        gridspec_kw={'width_ratios': [1, 11]},
    )
    size_ax, retention_ax = ax

    # Retention matrix: annotated percentages, with missing cells masked out.
    sns.heatmap(
        retention_matrix,
        mask=retention_matrix.isnull(),
        annot=True,
        fmt='.0%',
        cmap='RdYlGn',
        ax=retention_ax,
    )
    retention_ax.set_title('Monthly Cohorts: User Retention', fontsize=16)
    retention_ax.set(xlabel='# of periods', ylabel='')

    # Cohort size: a single-column heatmap on an all-white colormap, so only the
    # annotated numbers are visible next to the retention rows.
    cohort_size_df = cohort_size.to_frame(name='cohort_size')
    white_cmap = mcolors.ListedColormap(['white'])
    sns.heatmap(
        cohort_size_df,
        annot=True,
        cbar=False,
        fmt='g',
        cmap=white_cmap,
        ax=size_ax,
    )

    fig.tight_layout()