In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the e-commerce CSV into `data`; the file is decoded as ISO-8859-1
# instead of pandas' default UTF-8.
data = pd.read_csv('/kaggle/input/ecommerce-data/data.csv', encoding = "ISO-8859-1")

In [None]:
data.head()

# Data Exploration

### Invoice Number

In [None]:
# Keep the first character of every invoice number in its own column.
data['No'] = data['InvoiceNo'].str.get(0)

In [None]:
data['No'].value_counts()

In [None]:
data[data['No'] == 'C'].head(10)

In [None]:
data[data['No'] == 'C'].Quantity.describe()

In [None]:
# Invoices whose number starts with C are returned articles

In [None]:
data[data['No'] == 'A']

In [None]:
# Bad Operation ?

### Stock Code

In [None]:
# Keep the first character of every stock code in its own column.
data['Code'] = data['StockCode'].str.get(0)

In [None]:
data['Code'].value_counts()

In [None]:
# Understanding letters code

In [None]:
data[data['Code'] == 'P'].Description.value_counts()

In [None]:
# P : POSTAGE Products

In [None]:
data[data['Code'] == 'D'].Description.value_counts()

In [None]:
data[(data['Code'] == 'D') & (data['StockCode'] != 'D') & (data['StockCode'] != 'DOT')]

In [None]:
# DCGS = Discount

In [None]:
data[data['Code'] == 'C'].Description.value_counts()

In [None]:
data[data['Code'] == 'C'].Country.value_counts()

In [None]:
data[data['Code'] == 'M'].Description.value_counts()

In [None]:
data[data['Code'] == 'B'].Description.value_counts()

In [None]:
data[data['Code'] == 'S'].Description.value_counts()

In [None]:
data[data['Code'] == 'A'].Description.value_counts()

In [None]:
data[data['Code'] == 'g'].Description.value_counts()

In [None]:
# G = Gift

In [None]:
data[data['Code'] == 'm'].Description.value_counts()

In [None]:
# Fold the lower-case 'm' prefix into the upper-case 'M' group.
data['Code'] = data['Code'].mask(data['Code'].eq('m'), 'M')

### Unit Price

In [None]:
data.UnitPrice.describe()

In [None]:
# Understanding negative values
data[data.UnitPrice < 0]

In [None]:
data[data.UnitPrice >= 0].UnitPrice.describe()

In [None]:
# Some articles are free (Unit Price = 0)

In [None]:
# Distribution of prices < 10 (negative prices excluded).
# sns.distplot is deprecated and scheduled for removal; with hist=False it only
# ever drew the kernel density estimate, which sns.kdeplot draws directly.
plt.figure(figsize=(10,6))
sns.kdeplot(x=data[(data.UnitPrice < 10) & (data.UnitPrice >= 0)].UnitPrice)

### Quantity

In [None]:
data.Quantity.describe()

In [None]:
# Distribution of strictly positive quantities.
# sns.distplot is deprecated and scheduled for removal; with hist=False it only
# ever drew the kernel density estimate, which sns.kdeplot draws directly.
plt.figure(figsize=(10,6))
sns.kdeplot(x=data[data.Quantity > 0].Quantity)

### Countries

In [None]:
data.Country.value_counts()[:10]

### Total Price

In [None]:
# Replace every quantity by its magnitude. The column is overwritten in place,
# so all later cells (TotalPrice, per-customer totals) can no longer tell a
# negative quantity from a positive one.
data['Quantity'] = data['Quantity'].abs()

In [None]:
# Value of each line: quantity multiplied by unit price.
data['TotalPrice'] = data['Quantity'].mul(data['UnitPrice'])

In [None]:
# Distribution of strictly positive total prices (the previous comment said
# "quantity", but the plotted column is TotalPrice).
# sns.distplot is deprecated and scheduled for removal; with hist=False it only
# ever drew the kernel density estimate, which sns.kdeplot draws directly.
plt.figure(figsize=(10,6))
sns.kdeplot(x=data[data.TotalPrice > 0].TotalPrice)

### Invoice Date

In [None]:
# Parse the InvoiceDate strings into datetimes, stored in a separate 'Date'
# column so the original text column stays available.
data['Date'] = pd.to_datetime(data['InvoiceDate'])

In [None]:
data['Date'].head(10)

In [None]:
# Split the timestamp into calendar parts, one column per part.
for part in ('year', 'month', 'day', 'hour'):
    data[part] = getattr(data['Date'].dt, part)

In [None]:
data.year.value_counts()

In [None]:
# Number of invoice lines per month.
# The vector is passed as x= explicitly: in current seaborn the first positional
# parameter of countplot is `data`, so a bare Series is no longer read as x.
plt.figure(figsize=(10,6))
sns.countplot(x=data.month)

In [None]:
# Majority in November and December (for Christmas)

In [None]:
# Number of invoice lines per day of the month.
# The vector is passed as x= explicitly: in current seaborn the first positional
# parameter of countplot is `data`, so a bare Series is no longer read as x.
plt.figure(figsize=(10,6))
sns.countplot(x=data.day)

In [None]:
# Number of invoice lines per hour of the day.
# The vector is passed as x= explicitly: in current seaborn the first positional
# parameter of countplot is `data`, so a bare Series is no longer read as x.
plt.figure(figsize=(10,6))
sns.countplot(x=data.hour)

# Data Featuring

In [None]:
# check missing values for each column 
data.isnull().sum().sort_values(ascending=False)

# Products Classification

In [None]:
# Keep only the rows that have a description. The explicit copy makes `prod` an
# independent frame, so the columns assigned to it later ('text',
# 'Cluster_prod') do not raise SettingWithCopyWarning or land on a temporary.
prod = data[~data.Description.isnull()].copy()

In [None]:
prod['Description'].head(10)

In [None]:
from wordcloud import WordCloud

# Word cloud built from every raw product description joined into one string.
all_descriptions = ' '.join(prod['Description'])
wordcloud = WordCloud(max_words=1000, margin=0).generate(all_descriptions)
fig, ax = plt.subplots(figsize=(15, 10))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()

In [None]:
# Copy Description into a working 'text' column, replacing any missing value
# with an empty string.

prod['text'] = prod['Description'].fillna('')

In [None]:
# Convert the working text column to lower case.

prod['text'] = prod['text'].str.lower()

In [None]:
# Remove English stop words from the working text column.

from nltk.corpus import stopwords

# A set gives constant-time membership tests; the plain list returned by NLTK
# was scanned once for every word of every description.
stop = set(stopwords.words('english'))

prod['text'] = prod['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop))

In [None]:
from wordcloud import WordCloud

# Word cloud built from the cleaned text column joined into one string.
all_text = ' '.join(prod['text'])
wordcloud = WordCloud(max_words=1000, margin=0).generate(all_text)
fig, ax = plt.subplots(figsize=(15, 10))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Turn every cleaned description into a TF-IDF vector; the vectorizer applies
# scikit-learn's built-in English stop-word list as well.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(prod['text'])

In [None]:
# Elbow method: record the within-cluster sum of squared distances (inertia)
# for k = 1..9.
# random_state pins the centroid initialisation so a rerun reproduces the same
# curve; n_init is spelled out because its default differs between
# scikit-learn releases.
ssd = []
K = range(1,10)
for k in K:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km = km.fit(X)
    ssd.append(km.inertia_)

In [None]:
# Elbow plot: inertia against the number of clusters.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K, ssd, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('ssd')
ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Best number of clusters is 5

In [None]:
# Create and fit the product model.
# random_state makes the cluster ids stable between runs, which the written
# interpretation of each cluster further down depends on; n_init is spelled out
# because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
model = kmeans.fit(X)

In [None]:
# Store the cluster label the fitted model assigned to each row next to its
# description.
pred = model.labels_
prod['Cluster_prod'] = pred

In [None]:
# Trim the frame to the three columns used to inspect the clusters.
prod = prod.loc[:, ['Description', 'text', 'Cluster_prod']]

In [None]:
prod.head()

In [None]:
# One word cloud per product cluster on a two-column grid. The number of rows
# is derived from the clusters actually present instead of a fixed 3x2 grid, so
# changing n_clusters upstream cannot overflow the layout.
cluster_ids = sorted(prod['Cluster_prod'].unique())
n_cols = 2
n_rows = -(-len(cluster_ids) // n_cols)  # ceiling division
fig = plt.figure(figsize = (20, 15))
for position, c in enumerate(cluster_ids, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    ax.set_title('Cluster %d'%c)
    cluster = prod[prod.Cluster_prod == c]
    wordcloud = WordCloud( max_words=1000,margin=0).generate(' '.join(cluster['text']))
    ax.imshow(wordcloud)
    ax.axis("off")

In [None]:
# Cluster A : Bags
# Cluster B : Signs
# Cluster C : Kitchen
# Cluster D : Decoration
# Cluster E : Retrospots

# Customers Classification

In [None]:
data.columns

In [None]:
# Columns carried over from the transaction table into the customer frame.
customer_columns = ['CustomerID', 'InvoiceDate', 'Quantity', 'UnitPrice', 'Country', 'TotalPrice']
cus = data[customer_columns]

In [None]:
# Discard the rows that carry no customer id.
cus = cus.dropna(subset=['CustomerID'])

### Total products bought

In [None]:
# Sum of Quantity per customer, exposed as TotalProducts.
cus_prod = (
    cus.groupby('CustomerID', as_index=False)['Quantity']
       .sum()
       .rename(columns={'Quantity': 'TotalProducts'})
)

In [None]:
# Attach the per-customer product total to every row of that customer.
cus = pd.merge(cus, cus_prod, how='inner', on='CustomerID')

### Number of transactions (1 per day)

In [None]:
# Keep only the part of InvoiceDate that precedes the first space.
cus['InvoiceDate'] = cus['InvoiceDate'].str.split(' ', n=1).str.get(0)

In [None]:
# One row per distinct (customer, invoice date) pair.
transactions = cus.loc[:, ['CustomerID', 'InvoiceDate']]
transactions = transactions[~transactions.duplicated()]

In [None]:
# Count the distinct invoice dates of each customer, exposed as Transactions.
transactions = (
    transactions.groupby('CustomerID', as_index=False)['InvoiceDate']
                .count()
                .rename(columns={'InvoiceDate': 'Transactions'})
)

In [None]:
# Attach the per-customer transaction count to every row of that customer.
cus = pd.merge(cus, transactions, how='inner', on='CustomerID')

In [None]:
# Collapse the frame to exactly one row per customer.
# Merely dropping the line-level columns left the per-line TotalPrice behind,
# so a customer kept one row per distinct line value and the clustering ran on
# line items instead of customers. TotalPrice is now summed per customer;
# TotalProducts and Transactions are already constant within a customer, and
# the first Country seen is kept.
# The resulting columns are the same as before, in the same order:
# CustomerID, Country, TotalPrice, TotalProducts, Transactions.
cus = (
    cus.groupby('CustomerID', as_index=False)
       .agg(
           Country=('Country', 'first'),
           TotalPrice=('TotalPrice', 'sum'),
           TotalProducts=('TotalProducts', 'first'),
           Transactions=('Transactions', 'first'),
       )
)

In [None]:
# Keep the first occurrence of every fully identical row.
cus = cus[~cus.duplicated()]

In [None]:
len(cus)

In [None]:
cus.head()

In [None]:
# Label encoder
from sklearn import preprocessing

# Replace each country name by the integer code the encoder assigns to it.
le = preprocessing.LabelEncoder()
country_names = cus['Country'].to_numpy()
le.fit(country_names)
cus['Country'] = le.transform(country_names)

In [None]:
# cus['No'] = le.fit_transform(cus.No.values)
# cus['Code'] = le.fit_transform(cus.Code.values)

In [None]:
# Elbow method on the customer features: record the inertia for k = 1..9.
# The feature columns are listed explicitly so the CustomerID identifier (and a
# Cluster_cus column left over from an earlier run) is not treated as a feature.
# NOTE(review): the features are still unscaled and Country is an arbitrary
# integer code — consider scaling / one-hot encoding.
# random_state makes the curve reproducible; n_init is spelled out because its
# default differs between scikit-learn releases.
cus_features = cus[['Country', 'TotalPrice', 'TotalProducts', 'Transactions']]
ssd = []
K = range(1,10)
for k in K:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km = km.fit(cus_features)
    ssd.append(km.inertia_)

In [None]:
# Elbow plot: inertia against the number of clusters.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K, ssd, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('ssd')
ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Best number of clusters is 3

In [None]:
# Create and fit the customer model.
# The feature columns are listed explicitly so the CustomerID identifier (and a
# Cluster_cus column left over from an earlier run) is not treated as a feature.
# random_state makes the cluster ids stable between runs; n_init is spelled out
# because its default differs between scikit-learn releases.
cus_features = cus[['Country', 'TotalPrice', 'TotalProducts', 'Transactions']]
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
model = kmeans.fit(cus_features)

In [None]:
# Store the cluster label the fitted model assigned to each row of `cus`.
pred = model.labels_
cus['Cluster_cus'] = pred

In [None]:
cus.head()

In [None]:
# Create PCA for data visualization / Dimensionality reduction to 2D graph
from sklearn.decomposition import PCA

# Fit the projection on the feature columns only. The whole frame was passed
# before, and by this point it also contains the Cluster_cus labels (plus the
# CustomerID identifier), so the labels being visualised leaked into the axes.
pca_features = cus[['Country', 'TotalPrice', 'TotalProducts', 'Transactions']]
pca = PCA(n_components=2)
pca_model = pca.fit_transform(pca_features)
cus_transform = pd.DataFrame(data = pca_model, columns = ['PCA1', 'PCA2'])
# `pred` holds one label per row of `cus`, in the same row order as pca_model.
cus_transform['Cluster_cus'] = pred

In [None]:
# Scatter of the two principal components, coloured by customer cluster.
# The palette length follows the number of clusters present instead of a
# hard-coded 3, and the title names what is plotted (customer clusters).
plt.figure(figsize=(10,10))
n_clusters = cus_transform['Cluster_cus'].nunique()
g = sns.scatterplot(data=cus_transform, x='PCA1', y='PCA2', palette=sns.color_palette(n_colors=n_clusters), hue='Cluster_cus')
title = plt.title('Customer Clusters with PCA')

In [None]:
# Mean of every column within each customer cluster, with the cluster label
# kept as an ordinary column.
customers = cus.groupby('Cluster_cus', as_index=False).mean()

In [None]:
customers.columns

In [None]:
customers[['Cluster_cus', 'CustomerID', 'Country', 'TotalPrice', 'TotalProducts', 'Transactions']]

In [None]:
# Cluster A : From UK : few transactions and products, small total price
# Cluster B : Regular with a good amount of products (weekly shopping)
# Cluster C : Big amount of products, maybe for stock

# Let's look at the products for these clusters

In [None]:
# Attach each customer's cluster label to the full transaction table.
# `cus` is not guaranteed to hold a single row per customer, and merging a
# non-unique lookup multiplies every transaction row by the number of matches.
# The lookup is reduced to one row per CustomerID first, and validate= makes
# pandas raise if that assumption is ever broken.
cluster_by_customer = cus[['CustomerID', 'Cluster_cus']].drop_duplicates(subset='CustomerID')
prod_cust = data.merge(cluster_by_customer, on='CustomerID', validate='many_to_one')

In [None]:
# Keep the first occurrence of every fully identical row.
prod_cust = prod_cust[~prod_cust.duplicated()]

In [None]:
# Build the cleaned text column in one pass: missing descriptions become empty
# strings, everything is lower-cased, and words found in `stop` are removed.
prod_cust['text'] = (
    prod_cust['Description']
    .fillna('')
    .str.lower()
    .apply(lambda desc: ' '.join([tok for tok in desc.split() if tok not in stop]))
)

In [None]:
# One word cloud per customer cluster, stacked vertically. The number of rows
# follows the clusters actually present instead of a hard-coded 3, so changing
# n_clusters upstream cannot overflow the layout.
cluster_ids = sorted(prod_cust['Cluster_cus'].unique())
fig = plt.figure(figsize = (20, 15))
for position, c in enumerate(cluster_ids, start=1):
    ax = fig.add_subplot(len(cluster_ids), 1, position)
    ax.set_title('Cluster %d'%c)
    cluster = prod_cust[prod_cust.Cluster_cus == c]
    wordcloud = WordCloud(max_words=1000,margin=0).generate(' '.join(cluster['text']))
    ax.imshow(wordcloud)
    ax.axis("off")