In [1]:
import pandas as pd
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize ## sent_tokenize,
import matplotlib.pyplot as plt
from nltk.corpus import stopwords       
import plotly.graph_objects as go

In [2]:
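## One-time setup: uncomment on a fresh environment to fetch the NLTK data used below
## (stop-word list, tokenizer models, WordNet; newer NLTK releases may name the tokenizer resource 'punkt_tab').
##nltk.download('stopwords'); nltk.download('punkt'); nltk.download('wordnet')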

# Connection to MongoDB

## Parameters

In [3]:
# MongoDB location; the port is kept as a string because it is only ever
# interpolated into the connection URL below.
dbHost, dbPort = 'localhost', '27017'

# Name of the database holding the collections read by this notebook.
dbName = "Retail"

# Connection string of the form mongodb://<host>:<port>/
dbURL = "mongodb://{0}:{1}/".format(dbHost, dbPort)


## Connector

In [4]:
### Connection to database

# Open a client against the configured URL and select the working database.
client = MongoClient(dbURL)
db = client[dbName]

# Handles on the two collections queried later in the notebook.
productCollection, transactionCollection = (
    db[collectionName] for collectionName in ("Product", "Transaction")
)

# Using jupyter notebook or another similar solution, try to qualify the products in order to create relevant groups based on their name and visualize them in order to show which group made the most transactions.

## Extract Data

In [5]:
# Aggregation over the Product collection:
#   1. $match   - drop documents whose StockCode is one of the listed codes
#                 (presumably non-product service/fee codes - TODO confirm)
#   2. $project - strip Mongo's _id field from the result
pipeline = [
    {'$match': {'StockCode': {'$nin': [
        'post', 'd', 'dot', 'm', 'bank charges', 'amazonfee', 'b', 'cruk', 's'
    ]}}},
    {'$project': {'_id': 0}},
]

# Materialise the cursor and load the documents into a DataFrame.
productDocuments = list(productCollection.aggregate(pipeline))
dfProduct = pd.DataFrame(productDocuments)

## Data Pre-processing

In [6]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

def preProcessing(text):
    """Normalise a product description before vectorisation.

    Steps, in order: tokenise, drop repeated tokens, drop English stop
    words, lemmatise each remaining token, and re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (e.g. None or NaN from a
        missing field) is treated as an empty description.

    Returns
    -------
    str
        Space-separated lemmatised tokens; '' when nothing is left.
    """
    # Missing descriptions would make the tokenizer raise; treat them as empty.
    if not isinstance(text, str):
        return ''

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output is deterministic (a set would reorder tokens between runs).
    unique_tokens = dict.fromkeys(nltk.word_tokenize(text))

    # Stop-word filtering happens on the de-duplicated tokens ...
    kept_tokens = [word for word in unique_tokens if word not in stop_words]

    # ... and it is these filtered tokens (not the raw token list) that get
    # lemmatised, so the two cleaning steps above actually take effect.
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_tokens)

In [7]:
# Overwrite each description with its normalised form (the raw text is not kept).
dfProduct['Description'] = dfProduct['Description'].apply(preProcessing)

## TF-IDF

In [8]:
# TF-IDF model with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn vocabulary and IDF weights from the descriptions, then build the
# document-term matrix X with one row per row of dfProduct.
vectorizer.fit(dfProduct['Description'])
X = vectorizer.transform(dfProduct['Description'])

## Elbow method to estimate the number of clusters

In [22]:
"""
%%time
# Elbow Method for K means# Import ElbowVisualizer
from yellowbrick.cluster import KElbowVisualizer
model = KMeans(init='k-means++', max_iter=100, n_init=10)
# k is range of number of clusters.
visualizer = KElbowVisualizer(model, k=(2,30), timings= True)
visualizer.fit(X)        # Fit data to visualizer
visualizer.show()        # Finalize and render figure'''
"""

"\n%%time\n# Elbow Method for K means# Import ElbowVisualizer\nfrom yellowbrick.cluster import KElbowVisualizer\nmodel = KMeans(init='k-means++', max_iter=100, n_init=10)\n# k is range of number of clusters.\nvisualizer = KElbowVisualizer(model, k=(2,30), timings= True)\nvisualizer.fit(X)        # Fit data to visualizer\nvisualizer.show()        # Finalize and render figure'''\n"

## K-MEANS

### Model

In [10]:
# Number of clusters (presumably picked from the elbow analysis - TODO confirm).
true_k = 21

# k-means++ seeding is random: without a fixed random_state the cluster ids,
# and therefore the "best selling group" reported further down, change on
# every run. Pinning the seed makes Restart & Run All reproducible.
model = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=100,
    n_init=10,
    random_state=42,
)
model.fit(X)

KMeans(max_iter=100, n_clusters=21)

### Centroids

In [11]:
# For every cluster, feature indices sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# Vocabulary terms indexed by feature position. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2; prefer the replacement
# and fall back only on older installations that lack it.
terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)

##  Number of transactions per group

In [12]:
def categorize(text):
    """Return the KMeans cluster id predicted for a single description.

    The text is normalised with preProcessing, projected into the fitted
    TF-IDF space, and passed to the fitted model; the label of that single
    sample is returned.
    """
    features = vectorizer.transform([preProcessing(text)])
    return model.predict(features)[0]

In [13]:
### Categorize each product
# X was built row-for-row from dfProduct['Description'], which is already
# preprocessed. Predicting on X in one call avoids running preProcessing a
# second time on normalised text (as categorize() would) and avoids one
# transform/predict round-trip per product.
dfProduct['Category'] = model.predict(X)

### Extract Transaction data

In [14]:
# Aggregation over the Transaction collection:
#   1. $match - drop documents whose StockCode is one of the listed codes
#               (presumably fees, adjustments and gift vouchers - TODO confirm)
#   2. $group - one document per StockCode with the summed Quantity
pipeline = [
    {'$match': {'StockCode': {'$nin': [
        'post', 'd', 'dot', 'm', 'bank charges', 'amazonfee', 'b', 'cruk', 's',
        'gift_0001_50', 'gift_0001_40', 'gift_0001_30', 'gift_0001_20', 'gift_0001_10',
    ]}}},
    {'$group': {'_id': '$StockCode', 'Quantity': {'$sum': '$Quantity'}}},
]

# Materialise the cursor; the resulting frame has columns _id (the StockCode) and Quantity.
transactionDocuments = list(transactionCollection.aggregate(pipeline))
dfTransaction = pd.DataFrame(transactionDocuments)

### Map each transaction to its group

In [15]:
def mapGroup(stockCode, dfProduct):
    """Look up the category assigned to a stock code.

    Parameters
    ----------
    stockCode
        Value to search for in the 'StockCode' column.
    dfProduct : pandas.DataFrame
        Frame with at least the columns 'StockCode' and 'Category'.

    Returns
    -------
    The 'Category' of the first row whose 'StockCode' equals stockCode, or
    NaN when no row matches (previously this raised IndexError). A NaN
    category is ignored by a default pandas groupby.
    """
    matches = dfProduct.loc[dfProduct['StockCode'] == stockCode, 'Category']
    if matches.empty:
        # Stock code sold but absent from the product frame.
        return np.nan
    return matches.iloc[0]
   
# _id holds the StockCode produced by the $group stage; attach each code's category.
dfTransaction['Category'] = dfTransaction['_id'].apply(mapGroup, args=(dfProduct,))

### Calculate number of transactions per category

In [16]:
# Total quantity per category, largest first. Quantity is selected explicitly:
# since pandas 2.0 a bare groupby().sum() also aggregates non-numeric columns,
# which would concatenate every StockCode string in the _id column.
dfCategory = (
    dfTransaction
    .groupby('Category')[['Quantity']]
    .sum()
    .sort_values('Quantity', ascending=False)
)

In [20]:
# One pie slice per category, sized by the category's summed Quantity.
categoryPie = go.Pie(labels=dfCategory.index, values=dfCategory['Quantity'])

fig = go.Figure(data=[categoryPie])
fig.update_layout(title_text='% of transaction per group', legend_title_text='Groups')
fig.show()

In [18]:
# dfCategory is sorted by descending Quantity, so its first index label is the top group.
summaryTemplate = "The best selling group is the group {0} "
bestSellingGroup = dfCategory.index[0]

print(summaryTemplate.format(bestSellingGroup))

The best selling group is the group 0 


In [19]:
"""
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
         print(' %s' % terms[ind])
"""

'\nfor i in range(true_k):\n    print("Cluster %d:" % i)\n    for ind in order_centroids[i, :10]:\n         print(\' %s\' % terms[ind])\n'