In [None]:
# Import Libraries
import pandas as pd
import io
import numpy as np
import re

import nltk
nltk.download('wordnet')
nltk.download('wordnet_ic')
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.corpus import wordnet
from itertools import chain
from nltk.corpus import stopwords


from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df = pd.read_csv('latest_ticket_data.csv')

In [None]:
def get_only_chars(line):

    clean_line = ""

    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ") #replace hyphens with spaces
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    for char in line:
        if char in 'qwertyuiopasdfghjklzxcvbnm ':
            clean_line += char
        else:
            clean_line += ' '

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    if clean_line[0] == ' ':
        clean_line = clean_line[1:]
    return clean_line

In [None]:
df['Description'] = df['Description'].apply(lambda x: get_only_chars(x))

In [None]:
df['Tokenized']=[nltk.word_tokenize(i) for i in df['Description']]

In [None]:
df['pos_tagged'] = [nltk.pos_tag(i) for i in df['Tokenized']]

In [None]:
df['NN_tagged'] = df['pos_tagged'].apply(lambda item:[w for w,t in item if t=='NN'])

In [None]:
df['NN_Description'] = df.NN_tagged.map(lambda x: ' '.join(x))

In [None]:
stop_words = stopwords.words('english')
custom_stop_words = ['hi', 'since', 'please', 'best', 'regards', 'thank', 'thanks', 'hello', 'sent', 'great', 'dear', 'help', 'kind']
time_words = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december',
              'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'today' , 'yesterday', 'tomorrow',
              'hour', 'hours', 'time', 'times', 'timelines', 'date', 'day', 'days', 'am', 'pm', 'morning', 'noon', 'afternoon', 'evening',
              'night', 'winter', 'summer', 'rain', 'cold']

def remove_stop_words(text):
    pattern = re.compile(r'\b(' + r'|'.join(stop_words) + r')\b\s*')
    text = pattern.sub('', text)
    return text

def remove_custom_words(text):
    pattern = re.compile(r'\b(' + r'|'.join(custom_stop_words) + r')\b\s*')
    text = pattern.sub('', text)
    return text

def remove_time_words(text):
    pattern = re.compile(r'\b(' + r'|'.join(time_words) + r')\b\s*')
    text = pattern.sub('', text)
    return text

df['NN_Description'] = df["NN_Description"].map(lambda x: remove_stop_words(x))
df['NN_Description'] = df["NN_Description"].map(lambda x: remove_custom_words(x))
df['NN_Description'] = df["NN_Description"].map(lambda x: remove_time_words(x))

In [None]:
LE = LabelEncoder()
df['label'] = LE.fit_transform(df['Category'])

In [None]:
def Get_Label_Synonyms(label):
  result=label.split(" ")
  wordcount=len(result)
  synlist = []
  if(wordcount == 1):
    if(wordnet.synsets(label)):
      synonyms = wordnet.synsets(label)
      synlist = list(set(chain.from_iterable([word.lemma_names() for word in synonyms])))
  else:
    for j in range(0,wordcount):
      label_j = []
      if(wordnet.synsets(result[j])):
        synonyms = wordnet.synsets(result[j])
        label_j = list(set(chain.from_iterable([word.lemma_names() for word in synonyms])))
        synlist = synlist + label_j
  synlist = [item.replace("_"," ") for item in synlist]
  return(synlist)

In [None]:
labelList = df['Category'].unique().tolist()

In [None]:
def getSynonymKeywords(labelList):
  length=len(labelList)
  topics=[]
  for i in range(0,length):
    label_i = Get_Label_Synonyms(labelList[i])
    t = ' '.join(str(x) for x in label_i)
    u = ' '.join(set(t.split()))
    topics.append(u)
  return(topics)

In [None]:
def getKeywords(labelList,keywordsType):
  topics=[]
  if(keywordsType=='synonym'):
    topics=getSynonymKeywords(labelList)
    return(topics)

In [None]:
topics = getKeywords(labelList,'synonym')
print(topics)

['lotion covering diligence program practical applications coating application programme', 'database', 'web net mesh electronic meshwork network meshing', 'sustenance sustainment user drug criminal care maintenance upkeep substance exploiter sustentation alimony abuser', 'protection system measures measure department certificate surety security']


In [None]:
# Generate samples that contains K samples of each class
def gen_sample(df, sample_size, num_classes):

    df_1 = df[(df["label"]<num_classes + 1)].reset_index().drop(["index"], axis=1).reset_index().drop(["index"], axis=1)
    train = df_1[df_1["label"] == np.unique(df_1['label'])[0]].sample(sample_size)

    train_index = train.index.tolist()

    for i in range(1,num_classes):
        train_2 = df_1[df_1["label"] == np.unique(df_1['label'])[i]].sample(sample_size)
        train = pd.concat([train, train_2], axis=0)
        train_index.extend(train_2.index.tolist())

    test = df_1[~df_1.index.isin(train_index)]

    return train, test

In [None]:
def WordnetShortestPath_labelscore(a,topics):
  lowest_netavg=100
  lowest_label=0
  label=-1
  #print(a)
  words = nltk.word_tokenize(a)
  for z in topics:
    total=0
    counter=0
    #print(z)
    for x in z:
      count=0
      avg=0
      sum=0
      for y in words:
        if(wordnet.synsets(x) and wordnet.synsets(y)):
          syn1 = wordnet.synsets(x)[0]
          syn2 = wordnet.synsets(y)[0]
          if(syn1.pos() == 'n' and syn2.pos() == 'n'):
            #print("Shortest path between ",x," and ",y," is: ", syn1.shortest_path_distance(syn2))
            sum=sum+syn1.shortest_path_distance(syn2)
            count=count+1
      if(count==0):
        avg=0
      else:
        avg=sum/count
      total=total+avg
      counter=counter+1
      #print(sum)
      #print(count)
      #print(avg)
    if(counter==0):
      netavg=0
    else:
      netavg=total/counter
    #print(counter)
    #print("Total score for ",z," is: ",total)
    #print("Net average for ",z," is: ",netavg)
    label=label+1
    if(netavg<lowest_netavg):
      lowest_netavg=netavg
      lowest_label=label
      #print("label: ",label," has shortest path value: ",lowest_netavg)
  return lowest_label,lowest_netavg

In [None]:
def WordnetLeacock_labelscore(a,topics):
  lowest_netavg=100
  lowest_label=0
  label=-1
  #print(a)
  words = nltk.word_tokenize(a)
  for z in topics:
    total=0
    counter=0
    #print(z)
    for x in z:
      count=0
      avg=0
      sum=0
      for y in words:
        if(wordnet.synsets(x) and wordnet.synsets(y)):
          syn1 = wordnet.synsets(x)[0]
          syn2 = wordnet.synsets(y)[0]
          if(syn1.pos() == 'n' and syn2.pos() == 'n'):
            #print("Shortest path between ",x," and ",y," is: ", syn1.shortest_path_distance(syn2))
            sum=sum+syn1.lch_similarity(syn2)
            count=count+1
      if(count==0):
        avg=0
      else:
        avg=sum/count
      total=total+avg
      counter=counter+1
      #print(sum)
      #print(count)
      #print(avg)
    if(counter==0):
      netavg=0
    else:
      netavg=total/counter
    #print(counter)
    #print("Total score for ",z," is: ",total)
    #print("Net average for ",z," is: ",netavg)
    label=label+1
    if(netavg<lowest_netavg):
      lowest_netavg=netavg
      lowest_label=label
      #print("label: ",label," has shortest path value: ",lowest_netavg)
  return lowest_label,lowest_netavg

In [None]:
def WordnetRES_labelscore(a,topics):
  highest_netavg=0
  highest_label=0
  label=-1
  #print(a)
  words = nltk.word_tokenize(a)
  for z in topics:
    total=0
    counter=0
    #print(z)
    for x in z:
      count=0
      avg=0
      sum=0
      for y in words:
        if(wordnet.synsets(x) and wordnet.synsets(y)):
          syn1 = wordnet.synsets(x)[0]
          syn2 = wordnet.synsets(y)[0]
          if(syn1.pos() == 'n' and syn2.pos() == 'n'):
            sum=sum+syn1.res_similarity(syn2, brown_ic)
            count=count+1
      if(count==0):
        avg=0
      else:
        avg=sum/count
      total=total+avg
      counter=counter+1
      #print(sum)
      #print(count)
      #print(avg)
    if(counter==0):
      netavg=0
    else:
      netavg=total/counter
    #print(counter)
    #print("Total score for ",z," is: ",total)
    #print("Net average for ",z," is: ",netavg)
    label=label+1
    if(netavg>highest_netavg):
      highest_netavg=netavg
      highest_label=label
      #print("label: ",label," has shortest path value: ",lowest_netavg)
  return highest_label,highest_netavg

In [None]:
def WordnetLIN_labelscore(a,topics):
  highest_netavg=0
  highest_label=0
  label=-1
  #print(a)
  words = nltk.word_tokenize(a)
  for z in topics:
    total=0
    counter=0
    #print(z)
    for x in z:
      count=0
      avg=0
      sum=0
      for y in words:
        if(wordnet.synsets(x) and wordnet.synsets(y)):
          syn1 = wordnet.synsets(x)[0]
          syn2 = wordnet.synsets(y)[0]
          if(syn1.pos() == 'n' and syn2.pos() == 'n'):
            #print("Shortest path between ",x," and ",y," is: ", syn1.wup_similarity(syn2))
            sum=sum+syn1.lin_similarity(syn2, brown_ic)
            count=count+1
      if(count==0):
        avg=0
      else:
        avg=sum/count
      total=total+avg
      counter=counter+1
      #print(sum)
      #print(count)
      #print(avg)
    if(counter==0):
      netavg=0
    else:
      netavg=total/counter
    #print(counter)
    #print("Total score for ",z," is: ",total)
    #print("Net average for ",z," is: ",netavg)
    label=label+1
    if(netavg>highest_netavg):
      highest_netavg=netavg
      highest_label=label
      #print("label: ",label," has shortest path value: ",lowest_netavg)
  return highest_label,highest_netavg

In [None]:
def WordnetJCN_labelscore(a,topics):
  highest_netavg=0
  highest_label=0
  label=-1
  #print(a)
  words = nltk.word_tokenize(a)
  for z in topics:
    total=0
    counter=0
    #print(z)
    for x in z:
      count=0
      avg=0
      sum=0
      for y in words:
        if(wordnet.synsets(x) and wordnet.synsets(y)):
          syn1 = wordnet.synsets(x)[0]
          syn2 = wordnet.synsets(y)[0]
          if(syn1.pos() == 'n' and syn2.pos() == 'n'):
            #print("Shortest path between ",x," and ",y," is: ", syn1.wup_similarity(syn2))
            sum=sum+syn1.jcn_similarity(syn2, brown_ic)
            count=count+1
      if(count==0):
        avg=0
      else:
        avg=sum/count
      total=total+avg
      counter=counter+1
      #print(sum)
      #print(count)
      #print(avg)
    if(counter==0):
      netavg=0
    else:
      netavg=total/counter
    #print(counter)
    #print("Total score for ",z," is: ",total)
    #print("Net average for ",z," is: ",netavg)
    label=label+1
    if(netavg>highest_netavg):
      highest_netavg=netavg
      highest_label=label
      #print("label: ",label," has shortest path value: ",lowest_netavg)
  return highest_label,highest_netavg

In [None]:
def WordnetWUP_labelscore(a,topics):
  highest_netavg=0
  highest_label=0
  label=-1
  #print(a)
  words = nltk.word_tokenize(a)
  for z in topics:
    total=0
    counter=0
    #print(z)
    for x in z:
      count=0
      avg=0
      sum=0
      for y in words:
        if(wordnet.synsets(x) and wordnet.synsets(y)):
          syn1 = wordnet.synsets(x)[0]
          syn2 = wordnet.synsets(y)[0]
          if(syn1.pos() == 'n' and syn2.pos() == 'n'):
            #print("Shortest path between ",x," and ",y," is: ", syn1.wup_similarity(syn2))
            sum=sum+syn1.wup_similarity(syn2)
            count=count+1
      if(count==0):
        avg=0
      else:
        avg=sum/count
      total=total+avg
      counter=counter+1
      #print(sum)
      #print(count)
      #print(avg)
    if(counter==0):
      netavg=0
    else:
      netavg=total/counter
    #print(counter)
    #print("Total score for ",z," is: ",total)
    #print("Net average for ",z," is: ",netavg)
    label=label+1
    if(netavg>highest_netavg):
      highest_netavg=netavg
      highest_label=label
      #print("label: ",label," has shortest path value: ",lowest_netavg)
  return highest_label,highest_netavg

In [None]:
def ZeroShotWordnetModel(text, labelList, keywordstobeGenerated='Yes', keywordsList=[], keywordsType='synonym', posfilterType='NN', measureType='ShortestPath'):
  if not text and not labelList:
    print("Input Text and List of Label/Category names required")
    return()
  else:
    if(keywordstobeGenerated=='Yes'):
      topics = getKeywords(labelList,keywordsType)
    else:
      topics = labelList
    if(posfilterType=='NN'):
      if(measureType=='WuPalmer'):
        return(WordnetWUP_labelscore(text,topics))
      if(measureType=='Resnik'):
        return(WordnetRES_labelscore(text,topics))
      if(measureType=='JCN'):
        return(WordnetJCN_labelscore(text,topics))
      if(measureType=='Lin'):
        return(WordnetLIN_labelscore(text,topics))
      if(measureType=='Leacock'):
        return(WordnetLeacock_labelscore(text,topics))
      if(measureType=='ShortestPath'):
        return(WordnetShortestPath_labelscore(text,topics))


In [None]:
data, rest = gen_sample(df, 100, 5)

In [None]:
df[['NN_Leacock_Label', 'NN_Leacock_Value']] = df['NN_Description'].apply(lambda x: pd.Series(ZeroShotWordnetModel(x, labelList,measureType='Leacock')))

In [None]:
df["NN_Leacock_Label"] = df["NN_Leacock_Label"].astype(int)

In [None]:
accuracy_score(df['label'], df['NN_Leacock_Label'])

0.20066666666666666

In [None]:
df[['NN_Lin_Label', 'NN_Lin_Value']] = df['NN_Description'].apply(lambda x: pd.Series(ZeroShotWordnetModel(x, labelList,measureType='Lin')))

In [None]:
df["NN_Lin_Label"] = df["NN_Lin_Label"].astype(int)

In [None]:
accuracy_score(df['label'], df['NN_Lin_Label'])

0.22066666666666668

In [None]:
df[['NN_JCN_Label', 'NN_JCN_Value']] = df['NN_Description'].apply(lambda x: pd.Series(ZeroShotWordnetModel(x, labelList,measureType='JCN')))

In [None]:
df["NN_JCN_Label"] = df["NN_JCN_Label"].astype(int)

In [None]:
accuracy_score(df['label'], df['NN_JCN_Label'])

0.21133333333333335

In [None]:
df[['NN_Resnik_Label', 'NN_Resnik_Value']] = df['NN_Description'].apply(lambda x: pd.Series(ZeroShotWordnetModel(x, labelList,measureType='Resnik')))

In [None]:
df["NN_Resnik_Label"] = df["NN_Resnik_Label"].astype(int)

In [None]:
accuracy_score(df['label'], df['NN_Resnik_Label'])

0.248

In [None]:
df[['NN_WUP_Label', 'NN_WUP_Value']] = df['NN_Description'].apply(lambda x: pd.Series(ZeroShotWordnetModel(x, labelList,measureType='WuPalmer')))

In [None]:
df["NN_WUP_Label"] = df["NN_WUP_Label"].astype(int)

In [None]:
accuracy_score(df['label'], df['NN_WUP_Label'])

0.228

In [None]:
df[['NN_WSP_Label', 'NN_WSP_Value']] = df['NN_Description'].apply(lambda x: pd.Series(ZeroShotWordnetModel(x, labelList)))

In [None]:
df["NN_WSP_Label"] = df["NN_WSP_Label"].astype(int)

In [None]:
accuracy_score(df['label'], df['NN_WSP_Label'])

0.20833333333333334