In [34]:
import numpy as np
import pandas as pd

# Load the cleaned dataset; the path is resolved relative to the working directory.
TICKETS_CSV = 'Cleaned-P&P data.csv'
df = pd.read_csv(TICKETS_CSV)
# Preview the first rows as a quick check on the columns that were read.
df.head()

Unnamed: 0,class,desc,inc_num,short_des
0,P,"good afternoon , \r\n\r\n this customer charge...",INC0809260,y2684992
1,R,please remove delivery pass removed attached s...,INC0808628,delivery pass needs removing on staff accounts...
2,O,set ( acustrss ) 1 account holder receive stat...,INC0807303,g7121564
3,R,please delivery pass removed attached staff ac...,INC0807253,delivery pass needs removing on staff accounts...
4,P,customer : d4957244 \r\n website : jd williams...,INC0806836,customer being charged for delivery


In [4]:
# Rows whose `class` is 'O' or 'E' are removed before any modelling.
# A vectorized mask collects their index labels; unlike unpacking whole rows
# positionally, it keeps working when the frame gains extra columns and the
# cell is run again.
EXCLUDED_CLASSES = ['O', 'E']
blanks = df.index[df['class'].isin(EXCLUDED_CLASSES)].tolist()
# Drop in place so every later cell sees the filtered frame under the same name.
df.drop(blanks, inplace=True)

In [5]:
# Prefix each description with its short description so that both end up in
# the text column that is vectorized later.
# NOTE: `desc` is overwritten in place, so running this cell a second time
# prepends `short_des` again; reload the data before re-running.
combined_text = df["short_des"] + '. ' + df["desc"]
df["desc"] = combined_text
# Number of rows remaining in the frame.
len(df)

244

#### Pre-processing

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Vocabulary pruning thresholds: ignore terms found in more than 95% of the
# documents, and terms found in fewer than 2 documents.
MAX_DOC_FREQUENCY = 0.95
MIN_DOC_COUNT = 2
tfidf = TfidfVectorizer(max_df=MAX_DOC_FREQUENCY, min_df=MIN_DOC_COUNT)

In [8]:
# Learn the vocabulary from the text column and build the TF-IDF
# document-term matrix in one step.
descriptions = df['desc']
dtm = tfidf.fit_transform(descriptions)

#### NMF

In [9]:
from sklearn.decomposition import NMF

In [10]:
# Two topics are requested; the fixed seed makes the factorization repeatable.
N_TOPICS = 2
SEED = 42
nmf_model = NMF(n_components=N_TOPICS, random_state=SEED)

In [11]:
# Fit the NMF model on the TF-IDF document-term matrix. fit() returns the
# estimator, so the cell output is the fitted model's repr.
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=2, random_state=42, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [12]:
# Size of the learned vocabulary, i.e. the number of columns in `dtm`.
vocabulary_terms = tfidf.get_feature_names()
len(vocabulary_terms)

375

In [13]:
import random
# Print every term in the learned vocabulary.
# The list is fetched once and iterated directly, so all terms are covered
# whatever the vocabulary size, and the list is not rebuilt on each iteration.
feature_names = tfidf.get_feature_names()
for feature_name in feature_names:
    print(feature_name)

01
02
03
04
05
06
07
08
09
10
10th
11
12
12th
139
14
14th
15th
16
161236
17
17th
18
19
1st
2017
2018
2019
2020
20th
22
23
236123
23rd
24
24th
25
27
27th
28
28th
29
29th
30th
3506
3521
3544
3553
3594
3603
3620
3643
3651
3657
3806
3829
384a
3913
3rd
44
4th
50
8256
95
99
995
9th
able
account
accountno
accounts
accountwith
activate
activated
activating
active
add
added
adding
advance
advise
advisor
affected
after
afternoon
ahmed
all
ambrose
amount
angela
anita
annual
applicable
application
applied
apply
april
arron
assist
attached
august
based
be
being
bought
brands
brown
browser
but
buying
called
can
cancel
cancellation
cancelled
care
catherine
cc
charge
charged
charges
charging
check
chris
citrix
claire
clothing
co
code
coles
com
connolly
contact
contactno
country
cust
customer
customerid
customerno
customers
date
day
dba
dec
del
deliver
deliveries
description
despite
details
diagnosis
digital
edwards
eligible
error
errormessage
even
everytime
expiry
ext
fashion
fault
feb
february
fg
fix

In [14]:
# Number of topic rows in the fitted component matrix.
nmf_model.components_.shape[0]

2

In [15]:
# Term weights of the first topic (row 0 of the fitted components).
single_topic = nmf_model.components_[0]
# Vocabulary indices ordered from the lowest weight to the highest.
np.argsort(single_topic)

array([172, 179, 180, 184, 188, 191, 192, 193, 194, 196, 197, 198, 199,
       201, 178, 202, 204, 205, 206, 207, 211, 217, 218, 222, 224, 226,
       227, 228, 229, 203, 177, 174, 173, 134, 136, 137, 138, 140, 141,
       367, 143, 147, 148, 149, 150, 152, 154, 155, 156, 157, 158, 159,
       160, 161, 162, 163, 165, 365, 168, 170, 363, 344, 230, 231, 232,
       233, 293, 294, 302, 304, 307, 308, 309, 312, 313, 314, 315, 316,
       318, 319, 320, 350, 349, 324, 325, 326, 329, 330, 331, 332, 334,
       347, 340, 341, 343, 353, 133, 354, 287, 234, 361, 360, 240, 243,
       245, 246, 247, 359, 250, 251, 254, 255, 358, 258, 261, 262, 263,
       264, 266, 267, 268, 269, 355, 273, 274, 275, 279, 281, 288, 132,
       345,  21,  64,  65,  66,  67, 372, 371,  27,  71,  72,  63,  26,
        78,  80,  82,  83,  20,  88,  89,  17,  93,  24,  94,  61,  58,
        35,  36,  37,  38,  33,  39,  40,  42,  43, 130,  44,  47,  49,
        50,  51,  53,  54,  55,  57,  28,  46,  13,  34, 110, 11

In [16]:
# Weight of the term that ranks lowest for this topic (the value shown is the
# weight, not the word). The index is taken from the sort order instead of
# being hard-coded, so the cell stays valid when the vocabulary or the fitted
# model changes.
least_representative_index = single_topic.argsort()[0]
single_topic[least_representative_index]

0.0

In [17]:
# Vocabulary indices of the ten highest-weighted terms of this topic, ordered
# from the lowest of those weights to the highest.
ranked_indices = single_topic.argsort()
top_word_indices = ranked_indices[-10:]

In [18]:
# Print the words behind the selected indices. The vocabulary list is fetched
# once up front because get_feature_names() builds a new list on every call.
vocabulary = tfidf.get_feature_names()
for index in top_word_indices:
    print(vocabulary[index])

orders
on
needs
req
dba
supra
removing
pass
accounts
staff


In [19]:
# For every topic, list its 15 highest-weighted words (lowest of the 15 first).
# The vocabulary list is fetched once up front; looking it up inside the
# comprehension would rebuild it for every single word.
vocabulary = tfidf.get_feature_names()
for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([vocabulary[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['thank', 'thanks', 'please', 'removed', 'attached', 'orders', 'on', 'needs', 'req', 'dba', 'supra', 'removing', 'pass', 'accounts', 'staff']


THE TOP 15 WORDS FOR TOPIC #1
['website', 'account', 'being', 'issue', 'free', 'order', 'pass', 'getting', 'not', 'working', 'number', 'for', 'charged', 'subscription', 'customer']




In [20]:
# Display the summary repr of the TF-IDF document-term matrix.
dtm

<244x375 sparse matrix of type '<class 'numpy.float64'>'
	with 4454 stored elements in Compressed Sparse Row format>

In [21]:
# Shape of the document-term matrix: (documents, vocabulary terms).
dtm.shape

(244, 375)

In [22]:
# Project every document onto the fitted topics; each row holds one document's
# weight for each topic.
topic_results = nmf_model.transform(dtm)

In [23]:
# Shape of the topic-weight array: (documents, topics).
topic_results.shape

(244, 2)

In [24]:
# Topic weights of the document at row POSITION 11. `topic_results` is a plain
# array, so this is positional and need not match index label 11 in `df`
# (rows were dropped from `df` earlier, leaving gaps in its index).
topic_results[11]

array([0.04586123, 0.06781565])

In [25]:
# Assign each document the topic on which it has the largest weight.
dominant_topic = np.argmax(topic_results, axis=1)
df['Topic'] = dominant_topic

In [26]:
# Preview the first ten rows together with the newly assigned topic.
df.head(10)

Unnamed: 0,class,desc,inc_num,short_des,Topic
0,P,"y2684992. y2684992. good afternoon , \r\n\r\n ...",INC0809260,y2684992,1
1,R,delivery pass needs removing on staff accounts...,INC0808628,delivery pass needs removing on staff accounts...,0
3,R,delivery pass needs removing on staff accounts...,INC0807253,delivery pass needs removing on staff accounts...,0
4,P,customer being charged for delivery. customer ...,INC0806836,customer being charged for delivery,1
5,P,website/web shop issues. website/web shop issu...,INC0805994,website/web shop issues,1
6,P,delivery passes. delivery passes. hi \r\n\r\n ...,INC0805875,delivery passes,0
7,R,delivery pass needs removing on staff account ...,INC0803134,delivery pass needs removing on staff account ...,0
8,R,delivery pass needs removing on staff accounts...,INC0798911,delivery pass needs removing on staff accounts...,0
9,P,customer being charged for delivery. customer ...,INC0793541,customer being charged for delivery,1
10,R,delivery pass needs removing on orders supra d...,INC0793103,delivery pass needs removing on orders supra d...,0


In [27]:
# Human-readable name for each topic index.
tpdict = {
    0: 'Remove Delivery Pass',
    1: 'Add Delivery pass',
}
# Numeric code for each class, so that it can be compared with `Topic` directly.
cldict = {
    'R': 0,
    'P': 1,
}
df['Topic Label'] = df['Topic'].map(tpdict)
df['Class Label'] = df['class'].map(cldict)

In [28]:
# Preview the frame with the new label columns; head() keeps the output short
# instead of rendering the whole frame.
df.head(10)

Unnamed: 0,class,desc,inc_num,short_des,Topic,Topic Label,Class Label
0,P,"y2684992. y2684992. good afternoon , \r\n\r\n ...",INC0809260,y2684992,1,Add Delivery pass,1
1,R,delivery pass needs removing on staff accounts...,INC0808628,delivery pass needs removing on staff accounts...,0,Remove Delivery Pass,0
3,R,delivery pass needs removing on staff accounts...,INC0807253,delivery pass needs removing on staff accounts...,0,Remove Delivery Pass,0
4,P,customer being charged for delivery. customer ...,INC0806836,customer being charged for delivery,1,Add Delivery pass,1
5,P,website/web shop issues. website/web shop issu...,INC0805994,website/web shop issues,1,Add Delivery pass,1
6,P,delivery passes. delivery passes. hi \r\n\r\n ...,INC0805875,delivery passes,0,Remove Delivery Pass,1
7,R,delivery pass needs removing on staff account ...,INC0803134,delivery pass needs removing on staff account ...,0,Remove Delivery Pass,0
8,R,delivery pass needs removing on staff accounts...,INC0798911,delivery pass needs removing on staff accounts...,0,Remove Delivery Pass,0
9,P,customer being charged for delivery. customer ...,INC0793541,customer being charged for delivery,1,Add Delivery pass,1
10,R,delivery pass needs removing on orders supra d...,INC0793103,delivery pass needs removing on orders supra d...,0,Remove Delivery Pass,0


In [29]:
# List every row whose assigned topic disagrees with its encoded class,
# printing: incident number, class, index label.
# Columns are selected by name, so this keeps working when columns are added
# or reordered (unpacking whole rows positionally would not).
mismatches = df.loc[df['Topic'] != df['Class Label'], ['inc_num', 'class']]
for row_label, inc_num, ticket_class in mismatches.itertuples():
    print(inc_num, ticket_class, row_label)

INC0805875 P 6
INC0791140 R 12
INC0770868 R 41
INC0699015 P 115
INC0685865 R 126
INC0667087 P 154
INC0647183 R 177
INC0630640 R 186
INC0628447 P 187
INC0628144 R 188
INC0625595 R 191
INC0624883 R 192
INC0624810 R 193
INC0622781 R 197
INC0622283 R 199
INC0622277 R 200
INC0622190 R 202
INC0612636 P 208
INC0597850 R 217
INC0578947 R 229
INC0529468 R 244


In [32]:
# Inspect one of the mismatched rows (index label 197). Leaving the frame as
# the cell's last expression uses the notebook's rich table display, which
# print() would replace with wrapped plain text.
df.loc[[197]]

    class                                               desc     inc_num  \
197     R  delivery pass cancellation. delivery pass canc...  INC0622781   

                      short_des  Topic        Topic Label  Class Label  
197  delivery pass cancellation      1  Add Delivery pass            0  


In [33]:
# Full description text of the row with index label 197, fetched with a single
# label-based lookup.
df.loc[197, 'desc']

'delivery pass cancellation. delivery pass cancellation. s1259052 - please delivery subscription service removed?thanks !'