In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram

In [None]:
# Load the raw transaction data. The file is decoded as ISO-8859-1 (Latin-1)
# -- presumably because it is not valid UTF-8; confirm against the dataset.
data_path = '/kaggle/input/ecommerce-data/data.csv'
df = pd.read_csv(data_path, encoding='ISO-8859-1')

In [None]:
# Parse the invoice timestamps and derive the value of each row
# (Quantity x UnitPrice) before writing both back onto the frame.
invoice_timestamps = pd.to_datetime(df['InvoiceDate'])
line_totals = df['Quantity'] * df['UnitPrice']

df['InvoiceDate'] = invoice_timestamps
df['Sales'] = line_totals
df.head()

## Preliminary analysis


In [None]:
# Distinct-value counts for the three identifier columns.
# nunique() ignores missing values, so rows without a CustomerID are not counted.
n_customers = df['CustomerID'].nunique()
n_transactions = df['InvoiceNo'].nunique()
n_products = df['StockCode'].nunique()

print('Total number of customers is:', n_customers)
print('Total number of transactions that occured is:', n_transactions)
print('Total number of products sold is:', n_products)

In [None]:
# Print column names, dtypes and non-null counts to spot missing values.
df.info()

## Orders by Country

In [None]:
# Number of distinct values in the Country column.
n_countries = df['Country'].nunique()
print('Orders came from ', n_countries, 'different countries.')

In [None]:
# Count distinct customers per country.
#
# Rows without a CustomerID are removed first: drop_duplicates() treats all
# missing IDs as equal, so it would otherwise keep one such row and count it
# as a customer in the value_counts() below.
# Each remaining customer is attributed to the country of their first row.
# The chain builds a new frame, so `df` itself is left untouched.
customers_by_countries = (
    df.dropna(subset=['CustomerID'])
      .drop_duplicates(subset=['CustomerID'])
)
countries = customers_by_countries['Country'].value_counts()
countries

We can see that the largest number of customers comes from the United Kingdom.

## Monthly Sales analysis

In [None]:
# Tag every row with its invoice month as a "YYYY-MM" string, then total
# Sales per month. The 'monthly sales' column holds the month label, not an amount.
invoice_month = pd.to_datetime(df['InvoiceDate']).dt.strftime('%Y-%m')
df['monthly sales'] = invoice_month

monthly_sales = df.groupby('monthly sales', as_index=False)['Sales'].sum()
monthly_sales

In [None]:
# Plot total sales per month as a line with a marker on each month.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError.
sns.lineplot(data=monthly_sales, x='monthly sales', y='Sales')
sns.scatterplot(data=monthly_sales, x='monthly sales', y='Sales')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.title('Monthly sales')
# Rotate the month labels so they do not overlap; the trailing ';' hides the repr.
plt.xticks(rotation=45);

Sales alternately rose and fell from month to month at first, but from June 2011 onward they rose gradually, peaking in November 2011 (2011-11).

## Grouping customers


In [None]:
# One row per customer:
#   Sales     - sum of the Sales column over the customer's rows
#   InvoiceNo - number of non-null InvoiceNo rows (a row count, not a count of
#               distinct invoices)
#   Country   - first Country value in the customer's group
# groupby drops rows whose CustomerID is missing by default.
customers = df.groupby('CustomerID', as_index=False).agg(
    Sales=('Sales', 'sum'),
    InvoiceNo=('InvoiceNo', 'count'),
    Country=('Country', 'first'),
)
customers

In [None]:
# Replace each country name with an integer code so the column is numeric.
# NOTE(review): the codes are arbitrary labels, but the clustering below will
# treat them as magnitudes -- confirm this is intended.
encoder = LabelEncoder()
country_codes = encoder.fit(customers['Country']).transform(customers['Country'])
customers['Country'] = country_codes

In [None]:
# Elbow method: fit KMeans for k = 1..10 on the three customer features and
# record the inertia of each fit.
feature_columns = ['Sales', 'InvoiceNo', 'Country']
X = customers[feature_columns]

inertias = []
for n_clusters in range(1, 11):
    # random_state is fixed so repeated runs give the same inertia values.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    inertias.append(kmeans.inertia_)

print(pd.DataFrame({'inertia_values': inertias}))

In [None]:
# Plot inertia against the number of clusters to locate the elbow.
sns.set_style('darkgrid')
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError. The x values are derived from
# len(inertias) so the axis stays in sync with however many fits were recorded.
cluster_counts = list(range(1, len(inertias) + 1))
sns.lineplot(x=cluster_counts, y=inertias)
plt.xlabel('Number of clusters')
plt.ylabel('inertia values')
plt.title('Number of clusters vs inertia values');

The elbow of the inertia curve suggests that the optimal number of clusters is 3.

In [None]:
# Fit the final model with 3 clusters on the same feature matrix.
# fit() returns the estimator itself, so it can be chained onto the constructor.
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
kmeans

In [None]:
# Hierarchical clustering of the same features using Ward linkage on
# Euclidean distances.
Z = linkage(X, method='ward', metric='euclidean')

# Show only the last 12 merged clusters so the tree stays readable.
dendrogram(
    Z,
    truncate_mode='lastp',
    p=12,
    show_leaf_counts=False,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
)
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index')
plt.ylabel('distance')
plt.show()