In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import pandas as pd
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import spacy
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/SaswatiS/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/SaswatiS/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
df=pd.read_csv('ticketmaster.csv')
print(df.shape)
df.head()

(9964, 15)


Unnamed: 0,line_num,title,createdTime,status,priority,extId,user.identity.name,user.identity.displayName,user.emailId,user.department,extId.1,integrationId,user.identity.name.1,user.identity.displayName.1,user.emailId.1
0,0.0,Scan my computer now,1574306000.0,Closed,High,stuart.thomson@acme.com,Stuart Thomson,Stuart Thomson,stuart.thomson@acme.com,HR,gearldine_gellinger@gellinger.com,1.0,Gearldine,Gearldine Gellinger,gearldine_gellinger@gellinger.com
1,1.0,how to find address book,1579539000.0,Closed,Low,peter.macron@acme.com,Peter Macron,Peter Macron,peter.macron@acme.com,Engineering,lnewville@newville.com,1.0,Lenna,Lenna Newville,lnewville@newville.com
2,2.0,Solution 2,1577330000.0,Closed,Medium,victoria.lewis@acme.com,Victoria Lewis,Victoria Lewis,victoria.lewis@acme.com,Legal,virgina_tegarden@tegarden.com,1.0,Virgina,Virgina Tegarden,virgina_tegarden@tegarden.com
3,3.0,real time scanning wont turn on,1575271000.0,Closed,Low,vincent.lee@acme.com,Vincent Lee,Vincent Lee,vincent.lee@acme.com,Culture,tfrankel@aol.com,1.0,Tiera,Tiera Frankel,tfrankel@aol.com
4,4.0,Refund,1588172000.0,Closed,Low,kim.anderson@acme.com,Kim Anderson,Kim Anderson,kim.anderson@acme.com,Product & Services,norah.waymire@gmail.com,1.0,Norah,Norah Waymire,norah.waymire@gmail.com


In [3]:
df.dropna(inplace=True)

In [4]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [5]:
processed_docs = []

for doc in df['title']:
    processed_docs.append(preprocess(doc))
    
print(processed_docs[:2])

[[('Scan', 'JJ'), ('my', 'PRP$'), ('computer', 'NN'), ('now', 'RB')], [('how', 'WRB'), ('to', 'TO'), ('find', 'VB'), ('address', 'JJ'), ('book', 'NN')]]


In [6]:
# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
            
    return result

In [7]:
#Preprocess

df['clean_title']= df['title'].apply(lambda x: preprocess(x))

In [8]:
sentences = [x for x in df.clean_title]
print(sentences[1])

['address', 'book']


In [9]:
def apply_tag(index):
    return [(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[int(index)])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

In [10]:
df['tags']= df['line_num'].apply(lambda x: apply_tag(x))
df.head()

Unnamed: 0,line_num,title,createdTime,status,priority,extId,user.identity.name,user.identity.displayName,user.emailId,user.department,extId.1,integrationId,user.identity.name.1,user.identity.displayName.1,user.emailId.1,clean_title,tags
0,0.0,Scan my computer now,1574306000.0,Closed,High,stuart.thomson@acme.com,Stuart Thomson,Stuart Thomson,stuart.thomson@acme.com,HR,gearldine_gellinger@gellinger.com,1.0,Gearldine,Gearldine Gellinger,gearldine_gellinger@gellinger.com,[scan],"[(scan, PROPN, scan)]"
1,1.0,how to find address book,1579539000.0,Closed,Low,peter.macron@acme.com,Peter Macron,Peter Macron,peter.macron@acme.com,Engineering,lnewville@newville.com,1.0,Lenna,Lenna Newville,lnewville@newville.com,"[address, book]","[(address, NOUN, address), (book, NOUN, book)]"
2,2.0,Solution 2,1577330000.0,Closed,Medium,victoria.lewis@acme.com,Victoria Lewis,Victoria Lewis,victoria.lewis@acme.com,Legal,virgina_tegarden@tegarden.com,1.0,Virgina,Virgina Tegarden,virgina_tegarden@tegarden.com,[solution],"[(solution, NOUN, solution)]"
3,3.0,real time scanning wont turn on,1575271000.0,Closed,Low,vincent.lee@acme.com,Vincent Lee,Vincent Lee,vincent.lee@acme.com,Culture,tfrankel@aol.com,1.0,Tiera,Tiera Frankel,tfrankel@aol.com,"[real, time, scanning, wont, turn]","[(real, ADJ, real), (time, NOUN, time), (scann..."
4,4.0,Refund,1588172000.0,Closed,Low,kim.anderson@acme.com,Kim Anderson,Kim Anderson,kim.anderson@acme.com,Product & Services,norah.waymire@gmail.com,1.0,Norah,Norah Waymire,norah.waymire@gmail.com,[refund],"[(refund, NOUN, refund)]"


In [11]:
# df.dropna(inplace=True)

In [12]:
def cleanup_propn(sentence):
    ret_list = []
    for i in sentence:
#         print(i)
        if 'PROPN'in i:
            ret_list.append(i)
            
    return ret_list
        
    



In [13]:
df['PropN_cleanup'] = df['tags'].apply(lambda x: cleanup_propn(x))

df.head()


Unnamed: 0,line_num,title,createdTime,status,priority,extId,user.identity.name,user.identity.displayName,user.emailId,user.department,extId.1,integrationId,user.identity.name.1,user.identity.displayName.1,user.emailId.1,clean_title,tags,PropN_cleanup
0,0.0,Scan my computer now,1574306000.0,Closed,High,stuart.thomson@acme.com,Stuart Thomson,Stuart Thomson,stuart.thomson@acme.com,HR,gearldine_gellinger@gellinger.com,1.0,Gearldine,Gearldine Gellinger,gearldine_gellinger@gellinger.com,[scan],"[(scan, PROPN, scan)]","[(scan, PROPN, scan)]"
1,1.0,how to find address book,1579539000.0,Closed,Low,peter.macron@acme.com,Peter Macron,Peter Macron,peter.macron@acme.com,Engineering,lnewville@newville.com,1.0,Lenna,Lenna Newville,lnewville@newville.com,"[address, book]","[(address, NOUN, address), (book, NOUN, book)]",[]
2,2.0,Solution 2,1577330000.0,Closed,Medium,victoria.lewis@acme.com,Victoria Lewis,Victoria Lewis,victoria.lewis@acme.com,Legal,virgina_tegarden@tegarden.com,1.0,Virgina,Virgina Tegarden,virgina_tegarden@tegarden.com,[solution],"[(solution, NOUN, solution)]",[]
3,3.0,real time scanning wont turn on,1575271000.0,Closed,Low,vincent.lee@acme.com,Vincent Lee,Vincent Lee,vincent.lee@acme.com,Culture,tfrankel@aol.com,1.0,Tiera,Tiera Frankel,tfrankel@aol.com,"[real, time, scanning, wont, turn]","[(real, ADJ, real), (time, NOUN, time), (scann...",[]
4,4.0,Refund,1588172000.0,Closed,Low,kim.anderson@acme.com,Kim Anderson,Kim Anderson,kim.anderson@acme.com,Product & Services,norah.waymire@gmail.com,1.0,Norah,Norah Waymire,norah.waymire@gmail.com,[refund],"[(refund, NOUN, refund)]",[]


In [15]:
len(df[df["PropN_cleanup"].str.len() != 0])

1535

In [17]:
# df[df["PropN_cleanup"].str.len() ==4]

In [18]:
def pick_first_noun(sentence):
    ret_list = []
    for i in sentence:
        
        ret_list.append(i[0])
            
    str1 = ' '.join(ret_list)
    
    return str1

In [19]:
df['PropN_words'] = df['PropN_cleanup'].apply(lambda x: pick_first_noun(x))

df.head()

Unnamed: 0,line_num,title,createdTime,status,priority,extId,user.identity.name,user.identity.displayName,user.emailId,user.department,extId.1,integrationId,user.identity.name.1,user.identity.displayName.1,user.emailId.1,clean_title,tags,PropN_cleanup,PropN_words
0,0.0,Scan my computer now,1574306000.0,Closed,High,stuart.thomson@acme.com,Stuart Thomson,Stuart Thomson,stuart.thomson@acme.com,HR,gearldine_gellinger@gellinger.com,1.0,Gearldine,Gearldine Gellinger,gearldine_gellinger@gellinger.com,[scan],"[(scan, PROPN, scan)]","[(scan, PROPN, scan)]",scan
1,1.0,how to find address book,1579539000.0,Closed,Low,peter.macron@acme.com,Peter Macron,Peter Macron,peter.macron@acme.com,Engineering,lnewville@newville.com,1.0,Lenna,Lenna Newville,lnewville@newville.com,"[address, book]","[(address, NOUN, address), (book, NOUN, book)]",[],
2,2.0,Solution 2,1577330000.0,Closed,Medium,victoria.lewis@acme.com,Victoria Lewis,Victoria Lewis,victoria.lewis@acme.com,Legal,virgina_tegarden@tegarden.com,1.0,Virgina,Virgina Tegarden,virgina_tegarden@tegarden.com,[solution],"[(solution, NOUN, solution)]",[],
3,3.0,real time scanning wont turn on,1575271000.0,Closed,Low,vincent.lee@acme.com,Vincent Lee,Vincent Lee,vincent.lee@acme.com,Culture,tfrankel@aol.com,1.0,Tiera,Tiera Frankel,tfrankel@aol.com,"[real, time, scanning, wont, turn]","[(real, ADJ, real), (time, NOUN, time), (scann...",[],
4,4.0,Refund,1588172000.0,Closed,Low,kim.anderson@acme.com,Kim Anderson,Kim Anderson,kim.anderson@acme.com,Product & Services,norah.waymire@gmail.com,1.0,Norah,Norah Waymire,norah.waymire@gmail.com,[refund],"[(refund, NOUN, refund)]",[],


In [30]:
#Top proper nouns identified

df.PropN_words.str.split(expand = True).stack().value_counts()[:50]

acme              721
scan              114
product_launch     56
robinson           37
alex               33
jackson            33
peter              33
antivirus          28
thomson            27
anna               27
anderson           24
jacob              24
smith              24
trojan             24
mcfee              23
johnson            23
scott              22
maria              22
lisa               21
lopez              21
login              19
juan               17
dell               17
saina              17
technician         17
bradley            16
archer             16
mcaffe             16
victoria           15
taylor             15
mitchell           15
karun              15
john               14
wag_bpfl           14
gibson             14
elisabeth          13
malware            13
demos              12
parker             12
brandon            12
google             11
wag_bpgv           11
andrew             11
eric               11
hola               11
erica     

#### ACME, Trojan, Mcfee, Dell are a few notable products that show up in the top 40 Proper noun entity