In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

import pandas as pd
import numpy as np
import scipy
from nltk.corpus import stopwords
import re
import os

In [12]:
# Fetch the averaged-perceptron tagger data that nltk.pos_tag loads.
nltk.download(info_or_id='averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/scar3crow/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [17]:
## Load the first sheet of the source workbook.
data_1 = pd.ExcelFile('R2_RNN_Tranzact_Rohan_1_Regrouped_with_supplier.xlsx')
tranzact_data_1 = data_1.parse(0)

## Keep only the four columns used below and give them short names.
col = ['to_company_name', 'supplier_item_id.2', 'product', 'supplier']
tranzact_data_2 = tranzact_data_1[col].rename(
    columns=dict(zip(col, ['OEM', 'item', 'product', 'supplier']))
)

## Rows for Cesare Bonetti International Pvt. Ltd. are unlabelled, so they
## are split off into their own dataframe; everything else is kept for training.
is_cesare = tranzact_data_2['OEM'] == 'CESARE BONETTI INTERNATIONAL PVT. LTD.'

tranzact_data_cesare_international = tranzact_data_2.loc[is_cesare].reset_index(drop=True)
tranzact_data_2_revised = tranzact_data_2.loc[~is_cesare].reset_index(drop=True)


def _lower_if_str(value):
    """Return ``value`` lower-cased when it is exactly a ``str``, else unchanged."""
    return value.lower() if type(value) is str else value


## Lower-case the text columns; non-string cells (e.g. NaN) pass through untouched.
for text_col in ('item', 'product'):
    tranzact_data_2_revised[text_col] = tranzact_data_2_revised[text_col].map(_lower_if_str)

## Drop every row that has a missing value in any column, then renumber the rows.
tranzact_data_3 = tranzact_data_2_revised.dropna().reset_index(drop=True)

## Cleaning the data: build a normalised 'feature' text column from 'item'.

# Kept from the original cell: globally silences pandas' SettingWithCopyWarning.
pd.options.mode.chained_assignment = None

# Unit tokens that are dropped from the final feature text.
l_ist = ['mm', 'ft']

## Join all hyphenated words ("o-ring" -> "oring").
# regex=True is passed explicitly everywhere: pandas >= 2.0 treats the pattern
# as a literal string by default, which would silently disable these rules.
tranzact_data_3['item'] = tranzact_data_3['item'].str.replace(r'\b-\b', '', regex=True)

## Replace underscores '_' with a space (\W below does not match '_').
punctuation = ['_']

for i in punctuation:
    # Literal replacement, so adding e.g. '.' to the list stays safe.
    tranzact_data_3['item'] = tranzact_data_3['item'].str.replace(i, " ", regex=False)

# Ordered (pattern, replacement) rules applied to 'item' to produce 'feature'.
# The two dimension rules used to contain the invalid escape '\m', which raises
# "re.error: bad escape \m" on Python >= 3.7; a plain 'm' is what was meant.
feature_rules = [
    (r'\W', ' '),           # remove punctuation
    (r'\b\d+\b', ''),       # remove stand-alone integers
    (r'\s+', ' '),          # collapse runs of whitespace to a single space
    (r'\b\d+mm\b', ' '),    # remove 'mm' dimensions such as '25mm'
    (r'\bm\d+\b', ' '),     # remove thread sizes such as 'm8'
    (r'\b\w\b', ''),        # remove stand-alone single characters
    (r'\s+', ' '),          # collapse whitespace again (once is sufficient)
]

feature_text = tranzact_data_3['item']
for rule_pattern, rule_replacement in feature_rules:
    feature_text = feature_text.str.replace(rule_pattern, rule_replacement, regex=True)

# Drop the unit tokens; split()/join also trims leading and trailing spaces.
tranzact_data_3['feature'] = feature_text.apply(
    lambda text: ' '.join([token for token in text.split() if token not in l_ist]).lower()
)

## Removing single-word features (a cleaned feature without a space is one word) :

tranzact_data_3 = tranzact_data_3[tranzact_data_3['feature'].str.contains(' ', regex=False)]
tranzact_data_3 = tranzact_data_3.reset_index(drop=True)

## Final file for classification :

col_1 = ['product', 'feature', 'supplier']
# Explicit copy: a column is added below, and writing into a slice of
# tranzact_data_3 only worked quietly because the warning is disabled globally.
tranzact_data = tranzact_data_3[col_1].copy()

## Removing duplicate (item + supplier) combinations :

# Helper column kept for inspection / backward compatibility with later cells.
tranzact_data['find_duplicate'] = tranzact_data['feature'].map(str) + \
                                            tranzact_data['supplier'].map(str)

# De-duplicate on the two columns themselves: the concatenated key has no
# separator, so distinct pairs such as ('ab', 'c') and ('a', 'bc') would collide.
tranzact_data = (
    tranzact_data
    .drop_duplicates(subset=['feature', 'supplier'], keep='first')
    .reset_index(drop=True)
)

tranzact_data.head()

Unnamed: 0,product,feature,supplier,find_duplicate
0,gas,new cylinder gas,Stallion Enterprises,new cylinder gasStallion Enterprises
1,cooling,accumulator model aas connection emerson make,Neelam Enterprises,accumulator model aas connection emerson makeN...
2,cooling,accumulator flokool psi fksa596,T J CONTROLS,accumulator flokool psi fksa596T J CONTROLS
3,cooling,accumulator connection,H J International,accumulator connectionH J International
4,cooling,accumulator 52167s,ANAND REFRIGERATION,accumulator 52167sANAND REFRIGERATION


In [5]:
def preprocess(sent):
    """Tokenize ``sent`` with NLTK and return the part-of-speech tagged tokens.

    The input string is passed to ``nltk.word_tokenize`` and the resulting
    tokens are passed to ``nltk.pos_tag``, whose result is returned as-is.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)


In [20]:
# POS-tag the cleaned feature text of the row labelled 1 and display the tags.
sample_feature = tranzact_data.loc[1, 'feature']
sent = preprocess(sample_feature)
sent


[('accumulator', 'NN'),
 ('model', 'NN'),
 ('aas', 'JJ'),
 ('connection', 'NN'),
 ('emerson', 'NN'),
 ('make', 'VBP')]

In [21]:
# Chunk grammar: an NP chunk is an optional determiner (DT), followed by any
# number of adjectives (JJ), followed by exactly one singular noun (NN).
pattern = 'NP: {<DT>?<JJ>*<NN>}'

# Build a regexp chunk parser from the grammar and apply it to the tagged
# tokens in `sent`; the printed tree groups the tokens that matched NP.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  (NP accumulator/NN)
  (NP model/NN)
  (NP aas/JJ connection/NN)
  (NP emerson/NN)
  make/VBP)


In [28]:
# nltk.ne_chunk needs two resources: the chunker model itself and the
# 'words' corpus it consults. Fetch both so the ne_chunk cell does not
# fail with a second LookupError once the chunker has been installed.
nltk.download('words')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/scar3crow/nltk_data...


KeyboardInterrupt: 

In [24]:
from pprint import pprint

from nltk.chunk import conlltags2tree, ne_chunk, tree2conlltags

# Flatten the chunk tree `cs` into (token, POS tag, IOB chunk tag) triples
# and pretty-print them one per line.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)


[('accumulator', 'NN', 'B-NP'),
 ('model', 'NN', 'B-NP'),
 ('aas', 'JJ', 'B-NP'),
 ('connection', 'NN', 'I-NP'),
 ('emerson', 'NN', 'B-NP'),
 ('make', 'VBP', 'O')]


In [25]:
# Tokenize and POS-tag the same sample row via the notebook's preprocess()
# helper, then run NLTK's named-entity chunker over the tagged tokens.
tagged_tokens = preprocess(tranzact_data['feature'][1])
ne_tree = ne_chunk(tagged_tokens)
print(ne_tree)

LookupError: 
**********************************************************************
  Resource [93mmaxent_ne_chunker[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('maxent_ne_chunker')
  [0m
  Attempted to load [93mchunkers/maxent_ne_chunker/PY3/english_ace_multiclass.pickle[0m

  Searched in:
    - '/home/scar3crow/nltk_data'
    - '/home/scar3crow/Dropbox/WorkStation-Subrata/python/venv1/nltk_data'
    - '/home/scar3crow/Dropbox/WorkStation-Subrata/python/venv1/share/nltk_data'
    - '/home/scar3crow/Dropbox/WorkStation-Subrata/python/venv1/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************
