In [1]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PRATHAM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from nltk.tokenize import word_tokenize
    
article = "The first ever iPhone was launched in 2007 by Apple Inc. The iPhone revolutionized the mobile phone industry and brought a new level of technology to the market. The iPhone had a large impact on the way people use their phones and changed the way people communicate with each other. Today, there are several different models of iPhones available with even more advanced features and capabilities"
article_tokens = word_tokenize(article)
print(article_tokens)

['The', 'first', 'ever', 'iPhone', 'was', 'launched', 'in', '2007', 'by', 'Apple', 'Inc', '.', 'The', 'iPhone', 'revolutionized', 'the', 'mobile', 'phone', 'industry', 'and', 'brought', 'a', 'new', 'level', 'of', 'technology', 'to', 'the', 'market', '.', 'The', 'iPhone', 'had', 'a', 'large', 'impact', 'on', 'the', 'way', 'people', 'use', 'their', 'phones', 'and', 'changed', 'the', 'way', 'people', 'communicate', 'with', 'each', 'other', '.', 'Today', ',', 'there', 'are', 'several', 'different', 'models', 'of', 'iPhones', 'available', 'with', 'even', 'more', 'advanced', 'features', 'and', 'capabilities']


In [3]:
#lowering the case :

lower_article_tokens = [token.lower() for token in article_tokens]

In [4]:
#remove punctuations:

import string

def remove_punctuation(text):
    # create a translation table to remove punctuation and special characters
    table = str.maketrans('', '', string.punctuation)
    
    # remove the punctuation and special characters from the text
    cleaned_text = text.translate(table)
    
    return cleaned_text

# remove punctuation and special characters from article_tokens
p_removed_article_tokens = [remove_punctuation(token) for token in lower_article_tokens]

In [5]:
#removing stopwords

from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

filtered_article_tokens = [word for word in p_removed_article_tokens if word.lower() not in stop_words]
# filtered_highlights_tokens = [word for word in p_removed_highlights_tokens if word.lower() not in stop_words]


In [6]:
#stemming

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

article_stemmed_words = [stemmer.stem(word) for word in filtered_article_tokens]
# highlights_stemmed_words = [stemmer.stem(word) for word in filtered_article_tokens]

In [7]:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

article_lemmatized_words = [lemmatizer.lemmatize(word) for word in article_stemmed_words]
# highlights_lemmatized_words = [lemmatizer.lemmatize(word) for word in highlights_stemmed_words]


In [8]:
#WSD implementation

import nltk
from nltk.corpus import wordnet

def adapted_lesk(word, context, pos=None):
    word = word.lower()
    context = [w.lower() for w in context]
    synsets = wordnet.synsets(word)
    if pos is None:
        signatures = synsets
    else:
        signatures = [synset for synset in synsets if synset.pos() == pos]
    best_sense = None
    max_overlap = 0
    for signature in signatures:
        definition = signature.definition().split()
        definition = [w.lower() for w in definition if w.isalpha()]
        definition_synonyms = []
        for syn in signature.lemmas():
            definition_synonyms.append(syn.name().lower())
        definition_synonyms.append(signature.lemma_names()[0].lower())
        overlap = set(definition).intersection(context)
        if len(overlap) > max_overlap:
            max_overlap = len(overlap)
            best_sense = signature
        if len(overlap) == max_overlap:
            definition_len = len(definition)
            if best_sense:
                best_sense_len = len(best_sense.definition().split())
                if definition_len < best_sense_len:
                    best_sense = signature
            else:
                best_sense = signature
    return best_sense


disambiguated_article_tokens = []
for token in article_tokens:
    sense = adapted_lesk(token, article_lemmatized_words)
    if sense is not None:
        disambiguated_article_tokens.append(sense)

# disambiguated_highlights_tokens = []
# for token in highlights_tokens:
#     sense = adapted_lesk(token, highlights_lemmatized_words)
#     if sense is not None:
#         disambiguated_highlights_tokens.append(sense)

article_sense = []
for word in article_lemmatized_words:
    sense = adapted_lesk(word, article)
    if sense is not None:
        article_sense.append(sense)
    
# highlights_sense = []
# for word in highlights_lemmatized_words:
#     sense = adapted_lesk(word, highlights)
#     if sense is not None:
#         highlights_sense.append(sense)


In [9]:
article_sense

[Synset('first.n.02'),
 Synset('ever.r.03'),
 Synset('launch.v.03'),
 Synset('iraqi_national_congress.n.01'),
 Synset('rotation.n.03'),
 Synset('phone.n.02'),
 Synset('lend.v.01'),
 Synset('newfangled.s.01'),
 Synset('charge.v.17'),
 Synset('market.v.03'),
 Synset('impact.n.02'),
 Synset('way.n.05'),
 Synset('use.n.03'),
 Synset('phone.n.02'),
 Synset('chang_jiang.n.01'),
 Synset('way.n.05'),
 Synset('nowadays.r.01'),
 Synset('discerp.v.02'),
 Synset('differ.v.01'),
 Synset('model.n.02'),
 Synset('avail.n.01'),
 Synset('tied.s.05')]

In [10]:
# FOR ARTICLE

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

def get_wordnet_pos(pos_tag):
    if pos_tag[1].startswith('J'):
        return (pos_tag[0], wn.ADJ)
    elif pos_tag[1].startswith('V'):
        return (pos_tag[0], wn.VERB)
    elif pos_tag[1].startswith('N'):
        return (pos_tag[0], wn.NOUN)
    elif pos_tag[1].startswith('R'):
        return (pos_tag[0], wn.ADV)
    else:
        return (pos_tag[0], wn.NOUN)

def preprocess(article):
    # Named Entity Recognition
    entities = nltk.ne_chunk(nltk.pos_tag(word_tokenize(article)))
    entity_list = []
    for entity in entities:
        if hasattr(entity, 'label'):
            entity_list.append((entity.label(), " ".join(c[0] for c in entity.leaves())))
    # Frequency Distribution
    words = nltk.word_tokenize(article)
    fdist = nltk.FreqDist(words)
    # Taxonomy
    tagged_words = pos_tag(words)
    wnl = WordNetLemmatizer()
    lemmatized_words = [wnl.lemmatize(*get_wordnet_pos(tagged_word)) for tagged_word in tagged_words]
    # θd
    theta_d = {}
    for word, pos in tagged_words:
        synonyms = wn.synsets(word, pos=wn.NOUN if pos.startswith("N") else (wn.ADJ if pos.startswith("J") else (wn.VERB if pos.startswith("V") else (wn.ADV if pos.startswith("R") else wn.NOUN))))
        theta_d[word] = set(syn.lemmas()[0].name() for syn in synonyms)
    # θf
    theta_f = {}
    for word, pos in tagged_words:
        theta_f[word] = fdist[word] / float(len(words))
    return (entities, entity_list, fdist, lemmatized_words, theta_d, theta_f)


article_data = preprocess(article)


named_entities = article_data[1]
Frequency_Distribution = article_data[2].most_common(10)
Taxonomy = article_data[3]
fi_d = article_data[4]
fi_f = article_data[5]

# Print Named Entities
print("Named Entities:", article_data[1])
# Print Frequency Distribution
print("\nFrequency Distribution:", article_data[2].most_common(10))
# Print Taxonomy
print("\nTaxonomy:", article_data[3])
# Print θd
print("\nθd:", article_data[4])
# Print θf
print("\nθf:", article_data[5])


Named Entities: [('ORGANIZATION', 'iPhone'), ('PERSON', 'Apple Inc'), ('ORGANIZATION', 'iPhone'), ('ORGANIZATION', 'iPhone'), ('ORGANIZATION', 'iPhones')]

Frequency Distribution: [('the', 4), ('The', 3), ('iPhone', 3), ('.', 3), ('and', 3), ('a', 2), ('of', 2), ('way', 2), ('people', 2), ('with', 2)]

Taxonomy: ['The', 'first', 'ever', 'iPhone', 'be', 'launch', 'in', '2007', 'by', 'Apple', 'Inc', '.', 'The', 'iPhone', 'revolutionize', 'the', 'mobile', 'phone', 'industry', 'and', 'bring', 'a', 'new', 'level', 'of', 'technology', 'to', 'the', 'market', '.', 'The', 'iPhone', 'have', 'a', 'large', 'impact', 'on', 'the', 'way', 'people', 'use', 'their', 'phone', 'and', 'change', 'the', 'way', 'people', 'communicate', 'with', 'each', 'other', '.', 'Today', ',', 'there', 'be', 'several', 'different', 'model', 'of', 'iPhones', 'available', 'with', 'even', 'more', 'advanced', 'feature', 'and', 'capability']

θd: {'The': set(), 'first': {'beginning', 'first', 'inaugural'}, 'ever': {'ever', 'alw

In [11]:
Frequency_Distribution

[('the', 4),
 ('The', 3),
 ('iPhone', 3),
 ('.', 3),
 ('and', 3),
 ('a', 2),
 ('of', 2),
 ('way', 2),
 ('people', 2),
 ('with', 2)]

In [13]:
text = article
E = named_entities
F = Frequency_Distribution
T = Taxonomy
θf = 2
θd = 3


def Level_driven_text_Generalization(text, wsdText, F, T, θd, θf):
    genText = text
    tokens = text.split()
    for i, token in enumerate(tokens):
        if i >= len(wsdText):
            break
        c = wsdText[i]
        synsets = wordnet._synset_from_pos_and_offset(c.pos(), c.offset())
        Pc = synsets.hypernym_paths()
        dc = synsets.max_depth()
        fc = next((f[1] for f in F if f[0] == synsets.lemmas()[0].name()), 0)
        for path in Pc:
            for c in path:
                dc = c.max_depth()
                fc = next((f[1] for f in F if f[0] == c.lemmas()[0].name()), 0)
                if fc >= θf or dc <= θd:
                    break
            if fc >= θf or dc <= θd:
                break
        if c.lemmas()[0].name() != token:
            genText = genText.replace(token, c.lemmas()[0].name())
            F = [(f[0], f[1]-1) if f[0] == token else f for f in F]
            F = [(f[0], f[1]+1) if f[0] == c.lemmas()[0].name() else f for f in F]
    return genText

def Named_entities_driven_text_Generalization(text, E, F, θf):
    genText = text
    for namedEntity, entity in E:
        if next((f[1] for f in F if f[0] == entity), 0) < θf:
            genText = genText.replace(namedEntity, entity)
    return genText

def WSD_Based_Combination_of_NEG_and_LG(text, wsdText, E, F, T, θf, θd):
    genText = Named_entities_driven_text_Generalization(text, E, F, θf)
    genText = Level_driven_text_Generalization(genText, wsdText, F, T, θd, θf)
    return genText

result = WSD_Based_Combination_of_NEG_and_LG(text, article_sense, E, F, T, θf, θd)
print(result)


entity move move entity entity entity chentitynge tiedfentityngled move entityct entity entity entity entity entity entity nowentitydentityys chentityngedustry differ entity entity tied level of technology to entity mentityrket. entity entity hentityd entity lentityrge impentityct on entity wentityy people use entityir nowentitydentityyss differ chentitynged entity wentityy people communicentityte with eentitych oentityr. Todentityy, entityre entityre smoveentityl different models of entitys entityventityilentityble with even more entitydventitynced feentitytures differ centitypentitybilities


  Function _synset_from_pos_and_offset() has been deprecated.  Use
  public method synset_from_pos_and_offset() instead
  synsets = wordnet._synset_from_pos_and_offset(c.pos(), c.offset())
