In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from zipfile import ZipFile
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import check_output
from scipy import stats
from scipy.stats import skew, norm, probplot, boxcox
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# 1. Read and Understand Data

In [None]:
# Every file under the dataset input directory is opened as a zip archive and
# unpacked into the writable working directory.
for dirname, _, filenames in os.walk('/kaggle/input/instacart-market-basket-analysis/'):
    for filename in filenames:
        # The context manager closes the archive even if extraction raises.
        with ZipFile(os.path.join(dirname, filename), mode='r') as archive:
            archive.extractall(path="/kaggle/working")

# List the working directory to confirm what was extracted.
print(check_output(["ls", "../working"]).decode("utf8"))

## 1.1 Order Data

In [None]:
# Load the order header table and the two order-line tables (prior / train).
order, order_products_prior, order_products_train = map(
    pd.read_csv,
    ['./orders.csv', './order_products__prior.csv', './order_products__train.csv'],
)

## 1.2 Related Data

In [None]:
# Load the lookup tables. The variable name `deparment` (sic) is kept because
# later cells reference it under that spelling.
deparment, product, aisle = (
    pd.read_csv(csv_path)
    for csv_path in ('./departments.csv', './products.csv', './aisles.csv')
)

#### Overview về dataset:

1. Dataset order chứa thông tin về đơn hàng như:

     - order_id: ID của đơn hàng.
     - user_id: ID của khách hàng.
     - eval_set: kiểu dataset của dòng dữ liệu. Ở đây có thể là: train, prior, test.
     - order_number: số thứ tự đơn hàng của khách hàng.
     - order_dow: ngày đặt hàng trong tuần.
     - order_hour_of_day: thời gian giờ đặt hàng trong ngày.
     - days_since_prior_order: khoảng cách thời gian (số ngày) so với lần đặt hàng trước.

2. Dataset order_products_prior và order_products_train: có cùng cấu trúc thông tin, chỉ khác là dành cho tập dữ liệu prior hay train:

    - order_id: ID của đơn hàng.
    - product_id: ID của sản phẩm.
    - add_to_cart_order: thứ tự thêm vào giỏ hàng của đơn hàng.
    - reordered: cờ (0/1) cho biết sản phẩm trong đơn hàng có phải là sản phẩm được đặt lại hay không.

## 1.3 Reduce data size

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes and return ``df``.

    The frame is modified in place; the returned object is the same frame.
    Memory usage before and after, and the relative saving, are printed.

    Conversion rules, per column:

    * object, string and categorical columns become ``category``;
    * signed / unsigned integer columns become the smallest signed /
      unsigned integer type whose range contains the column's min and max
      (bounds are inclusive, so 127 still fits ``int8``);
    * float columns become ``float16`` or ``float32`` when min and max lie
      strictly inside that type's range, otherwise ``float64``. Only the
      range is checked, so precision can be lost;
    * every other dtype (bool, datetime, nullable extension dtypes, ...) is
      left untouched instead of being coerced to a float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    use_float16 : bool, default True
        When False, ``float32`` is the smallest float type considered.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        is_text_like = (
            col_type == object
            or col_type.name == 'category'
            or pd.api.types.is_string_dtype(col_type)
        )
        if is_text_like:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Anything else
        # would be turned into a float by the range checks below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max are NaN/inf: use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a frame that reports zero bytes before optimisation.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# order = reduce_mem_usage(order)
# order_products_prior = reduce_mem_usage(order_products_prior)
# order_products_train = reduce_mem_usage(order_products_train)
# deparment = reduce_mem_usage(deparment)
# product = reduce_mem_usage(product)
# aisle = reduce_mem_usage(aisle)

In [None]:
# Summary statistics of the order table, one row per column.
order.describe().transpose()

In [None]:
# Report the (rows, columns) shape of every loaded table.
loaded_tables = [
    ('order', order),
    ('order_products_prior', order_products_prior),
    ('order_products_train', order_products_train),
    ('products', product),
    ('departments', deparment),
    ('aisles', aisle),
]
for table_label, table in loaded_tables:
    print('Data size of the {} Dataset is {}'.format(table_label, table.shape))

In [None]:
# Header row of one sample order.
order.loc[order['order_id'] == 2539329]

In [None]:
# Product lines of one sample order from the prior set.
order_products_prior.loc[order_products_prior['order_id'] == 3343014]

In [None]:
# Header row of one sample order whose lines are looked up in the train set next.
order.loc[order['order_id'] == 1187899]

In [None]:
# Product lines of the same sample order in the train set.
order_products_train.loc[order_products_train['order_id'] == 1187899]

In [None]:
# Replace missing days_since_prior_order values with 0 so every order has a
# numeric gap (presumably the NaNs are each customer's first order - confirm).
order['days_since_prior_order'] = order['days_since_prior_order'].fillna(0)


def _count_lines_by_reordered(order_products):
    """Count the reordered and first-time product lines of every order.

    Parameters
    ----------
    order_products : pandas.DataFrame
        Order lines with the columns ``order_id``, ``product_id`` and
        ``reordered`` (0 or 1).

    Returns
    -------
    pandas.DataFrame
        One row per ``order_id`` with the columns ``reordered_1`` (lines
        flagged 1) and ``reordered_0`` (lines flagged 0). An order that has
        no line of one kind gets 0 for that column.
    """
    per_flag = {}
    for flag in (1, 0):
        per_flag[flag] = (
            order_products[order_products['reordered'] == flag]
            .groupby('order_id')['product_id']
            .count()
            .rename('reordered_{}'.format(flag))
            .reset_index()
        )
    # The outer join keeps orders that only have one kind of line.
    merged = pd.merge(per_flag[1], per_flag[0], how='outer', on='order_id')
    return merged.fillna({'reordered_1': 0, 'reordered_0': 0})


# Line counts per order for the train and the prior order lines.
train_reordered = _count_lines_by_reordered(order_products_train)
prior_reordered = _count_lines_by_reordered(order_products_prior)

# Stack prior and train orders into one frame ordered by order_id.
df_reordered = (
    pd.concat([prior_reordered, train_reordered])
    .sort_values(by='order_id', ascending=True)
    .reset_index(drop=True)
)

# Attach the customer id and order attributes from the order table.
df_reordered = df_reordered.merge(
    order[['user_id', 'order_id', 'order_number', 'days_since_prior_order']],
    how='left',
    on='order_id',
)

df_reordered['total_line'] = df_reordered.reordered_0 + df_reordered.reordered_1

# One row per customer: summed line counts, number of orders and mean gap
# between orders. The column name 'fequency' (sic) is kept because later
# cells reference it under that spelling.
cus_orderdetail_df = (
    df_reordered.groupby(['user_id'])
    .agg({
        'reordered_0': 'sum',
        'reordered_1': 'sum',
        'total_line': 'sum',
        'order_number': 'count',
        'days_since_prior_order': 'mean',
    })
    .reset_index()
    .rename(columns={'days_since_prior_order': 'recency', 'order_number': 'fequency'})
)

In [None]:
# Summary statistics of the per-customer features, one row per column.
cus_orderdetail_df.describe().transpose()

In [None]:
def QQ_plot(data, measure):
    """Show the distribution of one column next to its normal probability plot.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column to inspect.
    measure : str
        Name of the column to plot.

    The left panel is the seaborn distribution plot of the column with a
    fitted normal curve; its title reports the fitted mu and sigma. The right
    panel is the scipy probability plot; its title reports the column's
    skewness and kurtosis. The figure is displayed with ``plt.show()`` and
    nothing is returned.
    """
    values = data[measure]
    canvas = plt.figure(figsize=(20, 7))

    # Parameters of the normal distribution fitted to the column.
    mu, sigma = norm.fit(values)

    sns.set(style='darkgrid', font_scale=1.0)

    # Left panel: distribution with the fitted normal curve.
    dist_ax = canvas.add_subplot(121)
    sns.distplot(values, fit=norm)
    dist_title = measure + ' Distribution ( mu = {:.2f} and sigma = {:.2f} )'.format(mu, sigma)
    dist_ax.set_title(dist_title, loc='center')
    dist_ax.set_xlabel(measure)
    dist_ax.set_ylabel('Frequency')

    # Right panel: probability (QQ) plot against the normal distribution.
    qq_ax = canvas.add_subplot(122)
    probplot(values, plot=qq_ax)
    qq_title = measure + ' Probability Plot (skewness: {:.6f} and kurtosis: {:.6f} )'.format(values.skew(), values.kurt())
    qq_ax.set_title(qq_title, loc='center')

    plt.tight_layout()
    plt.show()

In [None]:
# Plot every column except the first one (user_id, an identifier rather than
# a feature). Column labels are read directly instead of via a data row, so
# this also works on frames with fewer than two rows.
for column in cus_orderdetail_df.columns[1:]:
    QQ_plot(cus_orderdetail_df, column)

In [None]:
# cus_order_df = order.groupby(['user_id']).agg({
#     'order_number': 'count',
#     'days_since_prior_order': ['min','mean','median', 'max']
# })

# cus_order_df.columns = [ ' '.join(str(i) for i in col) for col in cus_order_df.columns]
# cus_order_df = cus_order_df.reset_index()

# 2. Data Processing

In [None]:
# Small offset added before the log for recency and reordered_1
# (presumably because those two columns can be zero - confirm).
error = 0.0001
cus_orderdetail_df['fequency_log'] = np.log(cus_orderdetail_df['fequency'])
cus_orderdetail_df['reordered_0_log'] = np.log(cus_orderdetail_df['reordered_0'])
cus_orderdetail_df['total_line_log'] = np.log(cus_orderdetail_df['total_line'])
cus_orderdetail_df['recency_log'] = np.log(cus_orderdetail_df['recency'] + error)
cus_orderdetail_df['reordered_1_log'] = np.log(cus_orderdetail_df['reordered_1'] + error)

feature_vector = ['fequency_log', 'recency_log', 'reordered_0_log', 'reordered_1_log', 'total_line_log']

# Explicit copy: the scaled values are written into X_subset only, never into
# a slice of cus_orderdetail_df (avoids chained-assignment ambiguity).
X_subset = cus_orderdetail_df[feature_vector].copy()

# Standardise every log feature; the fitted scaler is reused later to map
# cluster centres back to the log scale.
scaler = StandardScaler()
X_subset[feature_vector] = scaler.fit_transform(X_subset[feature_vector])

In [None]:
cl = 10        # largest number of clusters tried
corte = 0.1    # relative inertia drop below which adding a cluster stops paying off

anterior = 100000000000000  # inertia of the previous k (large start value)
cost = []
K_best = cl    # still equal to cl means "no elbow found yet"

# Settings shared by every KMeans fit in this cell; random_state makes the
# fits repeatable.
kmeans_kwargs = dict(init='k-means++', n_init=10, max_iter=300, tol=1e-04, random_state=101)

for k in range(1, cl + 1):
    model = KMeans(n_clusters=k, **kmeans_kwargs).fit(X_subset)
    labels = model.labels_

    # Inertia: sum of squared distances of samples to their closest centre.
    interia = model.inertia_
    if K_best == cl:
        # The first k whose relative improvement is below the cutoff marks
        # the previous k as the suggested number of clusters.
        if ((anterior - interia) / anterior) < corte:
            K_best = k - 1
    cost.append(interia)
    anterior = interia

# Elbow plot: inertia for every k tried.
plt.figure(figsize=(8, 6))
plt.scatter(range(1, cl + 1), cost, c='red')
plt.show()

# Refit with the suggested number of clusters.
print('The best K sugesst: ', K_best)
model = KMeans(n_clusters=K_best, **kmeans_kwargs).fit(X_subset)
labels = model.labels_

# Scatter of the first two scaled features, coloured by cluster label.
fig = plt.figure(figsize=(20, 5))
ax = fig.add_subplot(121)
ax.scatter(X_subset.iloc[:, 1], X_subset.iloc[:, 0], c=model.labels_.astype(float))
ax.set_xlabel(feature_vector[1])
ax.set_ylabel(feature_vector[0])

plt.show()

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score

# A list holds the silhouette coefficients for each k
# Mean silhouette coefficient for every k; it needs at least two clusters,
# so the sweep starts at 2.
silhouette_coefficients = []

for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300, tol=1e-04, random_state=101)
    labels_for_k = kmeans.fit_predict(X_subset)
    silhouette_coefficients.append(silhouette_score(X_subset, labels_for_k))

In [None]:
# Line plot of the mean silhouette coefficient against the number of clusters.
tested_cluster_counts = range(2, 11)
_sil_fig, sil_ax = plt.subplots(figsize=(15, 6))
sil_ax.plot(tested_cluster_counts, silhouette_coefficients)
sil_ax.set_xticks(tested_cluster_counts)
sil_ax.set_xlabel("Number of Clusters")
sil_ax.set_ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# For each candidate number of clusters keep the fitted centres, the mean
# silhouette score and the label of every customer.
cluster_centers = dict()

cluster_num = [2, 5, 8]
for n_clusters in cluster_num:

    clusterer = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300, tol=1e-04, random_state=101)
    cluster_labels = clusterer.fit_predict(X_subset)

    # The mean silhouette score is the average of the per-sample values, so
    # the expensive pairwise computation is done once instead of twice.
    sample_silhouette_values = silhouette_samples(X=X_subset, labels=cluster_labels)
    silhouette_avg = float(np.mean(sample_silhouette_values))

    cluster_centers[n_clusters] = {
        'cluster_center': clusterer.cluster_centers_,
        'silhouette_score': silhouette_avg,
        'labels': cluster_labels,
    }

In [None]:
# Original-scale names for the columns of the cluster centres, in the same
# order as feature_vector. Defined here because this is the first cell that
# needs it; without it a fresh-kernel run fails with a NameError.
features = ['fequency', 'recency', 'reordered_0', 'reordered_1', 'total_line']


def _cluster_center_frame(n_clusters):
    """Return the centres of one clustering on the original feature scale.

    The stored centres are un-standardised with ``scaler`` and exponentiated
    to undo the log transform (the small offset added before the log for
    some columns is not subtracted). The result has one row per cluster, an
    ``index`` column holding the cluster label, the feature columns, and a
    ``Number_Cluster`` column naming the clustering.
    """
    centers_log = scaler.inverse_transform(cluster_centers[n_clusters]['cluster_center'])
    frame = pd.DataFrame(np.exp(centers_log), columns=features)
    frame['Number_Cluster'] = 'Cluster_{}'.format(n_clusters)
    return frame.reset_index()


cluster_2 = _cluster_center_frame(2)
cluster_5 = _cluster_center_frame(5)
cluster_8 = _cluster_center_frame(8)

# Stack the three clusterings; the former row index is the cluster label.
cluster_center = (
    pd.concat([cluster_2, cluster_5, cluster_8])
    .rename(columns={'index': 'Cluster'})
)

In [None]:
# Write the cluster centre table, with a header row and without the row index.
cluster_center.to_csv(path_or_buf="cluster_center.csv", header=True, index=False)

In [None]:
features = ['fequency','recency','reordered_0','reordered_1','total_line']

# For every candidate clustering, print its silhouette score and its centres
# mapped back to the original feature scale.
for n_candidate in cluster_num:
    candidate = cluster_centers[n_candidate]
    print("for {} clusters the silhouette score is {:1.2f}".format(n_candidate, candidate['silhouette_score']))
    print("Centers of each cluster:")
    cent_transformed = scaler.inverse_transform(candidate['cluster_center'])
    centers_frame = pd.DataFrame(np.exp(cent_transformed), columns=features)
    print(centers_frame)
    print('-'*50)

In [None]:
# Store each customer's label under clusters_2 / clusters_5 / clusters_8.
for n_labels in (2, 5, 8):
    cus_orderdetail_df['clusters_{}'.format(n_labels)] = cluster_centers[n_labels]['labels']
display(cus_orderdetail_df.head())

In [None]:
# Show the per-customer frame, which now carries the clusters_2 / clusters_5 / clusters_8 label columns.
cus_orderdetail_df

In [None]:
"""
The list style using for matplotlib pyplot

['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 
'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 
'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 
'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 
'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']



"""

plt.style.use(['fivethirtyeight', 'bmh'])
fig = plt.figure(figsize=(20,7))

# One pie per clustering showing the share of customers in each cluster.
for subplot_position, n_pie_clusters in zip((131, 132, 133), (2, 5, 8)):
    f1 = fig.add_subplot(subplot_position)
    market = cus_orderdetail_df['clusters_{}'.format(n_pie_clusters)].value_counts()
    g = plt.pie(market, labels=market.index, autopct='%1.1f%%', shadow=True, startangle=90)
    plt.title('{} Clusters'.format(n_pie_clusters))

plt.show()

In [None]:
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode()
import matplotlib.mlab as mlab
import matplotlib.cm as cm

In [None]:
# Display names for the box traces, in cluster-label order (8 entries).
x_data = ['Cluster 0', 'Cluster 1','Cluster 2','Cluster 3','Cluster 4', 'Cluster 5', 'Cluster 6','Cluster 7']
# Semi-transparent RGBA fill colour for each entry of x_data, in the same order.
colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)', 'rgba(255, 65, 54, 0.5)', 
          'rgba(22, 80, 57, 0.5)', 'rgba(127, 65, 14, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']

# Original-scale customer columns that box_visual draws, one figure per column.
features = ['fequency','recency','reordered_0','reordered_1','total_line']
# Percentile that box_visual uses as the upper cut-off when trimming each cluster's values.
cutoff_quantile = 95


def box_visual(n_clusters):
    """Draw one plotly box-plot figure per feature, with one box per cluster.

    Reads the module-level ``cus_orderdetail_df`` (column
    ``clusters_<n_clusters>`` holds the labels), ``features``,
    ``cutoff_quantile``, ``x_data``, ``colors`` and ``cluster_centers``.

    Parameters
    ----------
    n_clusters : int
        Number of clusters of the clustering to show; labels are expected to
        be ``0 .. n_clusters - 1``.

    For each cluster, values at or above the ``cutoff_quantile`` percentile
    are dropped before plotting. If that would drop every value (for example
    a constant feature) the untrimmed values are kept, and an empty cluster
    yields an empty box instead of raising. Each figure is shown with
    ``fig.show()``; nothing is returned.
    """
    cluster_col = 'clusters_' + str(n_clusters)
    for field_to_plot in features:
        y_data = []
        ymax = 0
        for i in range(n_clusters):
            in_cluster = cus_orderdetail_df[cluster_col] == i
            y0 = cus_orderdetail_df.loc[in_cluster, field_to_plot].values
            if y0.size:
                trimmed = y0[y0 < np.percentile(y0, cutoff_quantile)]
                # The strict cut-off can remove everything when all values are
                # equal; fall back to the untrimmed values in that case.
                if trimmed.size:
                    y0 = trimmed
                ymax = max(ymax, y0.max())
            y_data.append(y0)

        traces = []
        for i, yd in enumerate(y_data):
            # Beyond the predefined names/colours, generate a name and cycle
            # the palette instead of silently dropping the cluster.
            trace_name = x_data[i] if i < len(x_data) else 'Cluster {}'.format(i)
            fill = colors[i % len(colors)]
            traces.append(go.Box(y=yd, name=trace_name, boxpoints=False, jitter=0.5, whiskerwidth=0.2, fillcolor=fill,
                marker=dict( size=1, ),
                line=dict(width=1),
            ))

        layout = go.Layout(
            title='Difference in {} with {} Clusters and {:1.2f} Score'.\
            format(field_to_plot, n_clusters, cluster_centers[n_clusters]['silhouette_score']),
            yaxis=dict( autorange=True, showgrid=True, zeroline=True,
                # Roughly ten grid steps, never a zero step for small ranges.
                dtick = max(1, int(ymax/10)),
                gridcolor='black', gridwidth=0.1, zerolinecolor='rgb(255, 255, 255)', zerolinewidth=2, ),
            margin=dict(l=40, r=30, b=50, t=50, ),
            paper_bgcolor='white',
            plot_bgcolor='white',
            showlegend=False
        )

        fig = go.Figure(data=traces, layout=layout)
        fig.show()

In [None]:
# Draw the per-feature box plots for the 2-cluster solution.
box_visual(2)