In [1]:
!pip3 install yake



In [2]:
# import libraries
import numpy as np
import pandas as pd
import torch
import yake
import json
import re

import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import spacy
nlp = spacy.load('en_core_web_sm')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# We need 3 keywords per line of a poem, so the extractor is configured with:
#   n = 1        -> single-word (unigram) keywords only
#   top = 3      -> at most the 3 best-ranked keywords per extract_keywords() call
#   dedupLim = 1 -> de-duplication threshold handed to YAKE
# NOTE(review): YAKE describes dedupLim as a similarity threshold where LOWER
# values suppress near-duplicate keywords more aggressively. A value of 1
# looks like it disables de-duplication rather than enforcing diversity, and
# it may also influence the order of the (keyword, score) tuples returned by
# extract_keywords() -- confirm against the installed yake version before
# changing it, because later cells index into those tuples.

algorithm = yake.KeywordExtractor(n = 1,
                                  top =3,
                                  dedupLim= 1)

In [4]:
# Input CSV paths, one per dataset.
file_1, file_2, file_3 = 'ready_poems.csv', 'news_summaries.csv', 'roc.csv'

# Read each CSV into its own DataFrame, in the same order as the paths above.
df_1, df_2, df_3 = (pd.read_csv(csv_path) for csv_path in (file_1, file_2, file_3))

In [5]:
# Collect every df_3 column whose name contains "sentence" (case-insensitive).
sentence_columns = list(filter(lambda name: 'sentence' in name.lower(), df_3.columns))

# Join those columns row by row, separated by single spaces, into a new 'text' column.
df_3['text'] = df_3[sentence_columns].apply(' '.join, axis=1)

In [6]:
# Harmonize column names: the cells below select 'title' and 'text' from each frame.
# The result is rebound instead of using inplace=True, so the cell does not
# mutate a frame that an earlier cell may still reference; renaming a column
# that is already renamed is a no-op, which keeps the cell safe to re-run.
df_1 = df_1.rename(columns={'content': 'text'})
df_2 = df_2.rename(columns={'headlines': 'title'})
df_3 = df_3.rename(columns={'storytitle': 'title'})

In [7]:
# Keep only the two columns used downstream, then preview the first rows.
df_1 = df_1.loc[:, ['title', 'text']]
df_1.head()

Unnamed: 0,title,text
0,Do not go gentle into that good night,do not go gentle into that good night NEWLINE ...
1,How Do I Love Thee? (Sonnet 43),how do i love thee let me count the ways NEWLI...
2,Shall I compare thee to a summer’s day? (Sonne...,shall i compare thee to a summers day NEWLINE ...
3,If—,if you can keep your head when all about you N...
4,Nothing Gold Can Stay,natures first green is gold NEWLINE her hardes...


In [8]:
# Keep only the two columns used downstream, then preview the first rows.
df_2 = df_2.loc[:, ['title', 'text']]
df_2.head()

Unnamed: 0,title,text
0,"CBI books Rajasthan CM's brother, 14 others in...",The CBI has registered a case against Rajastha...
1,Fan shares selfie taken by SRK with his father...,Twitter user Rohan Mukherjee shared that his f...
2,European Commission recommends Ukraine be gran...,The European Commission on Friday recommended ...
3,Jos Buttler smashes England's second-fastest O...,England's Jos Buttler on Friday smashed the se...
4,Paytm CEO Vijay Shekhar buys 1.7 lakh shares o...,Paytm Founder and CEO Vijay Shekhar Sharma has...


In [9]:
# Keep only the two columns used downstream, then preview the first rows.
df_3 = df_3.loc[:, ['title', 'text']]
df_3.head()

Unnamed: 0,title,text
0,David Drops the Weight,David noticed he had put on a lot of weight re...
1,Frustration,Tom had a very short temper. One day a guest m...
2,Marcus Buys Khakis,Marcus needed clothing for a business casual e...
3,Different Opinions,Bobby thought Bill should buy a trailer and ha...
4,Overcoming shortcomings,John was a pastor with a very bad memory. He t...


**Generating Keywords for Dataset 1: Poetry**

In [25]:
# Title and text columns of the poetry frame, consumed by the filtering loop below.
titles, texts = df_1['title'], df_1['text']

In [26]:
def has_proper_noun(text):
    """Return True if any token of ``text`` is POS-tagged as a proper noun.

    Parameters
    ----------
    text : str or list of str
        Either raw text, which is tokenized with ``word_tokenize``, or an
        already tokenized sequence of words, which is tagged as-is.

    Returns
    -------
    bool
        True when at least one token is tagged ``NNP`` or ``NNPS``.
    """
    # The filtering loop in this notebook calls this function with the output
    # of word_tokenize(); word_tokenize() itself expects a string, so only
    # tokenize when a string was given and accept token lists unchanged.
    words = word_tokenize(text) if isinstance(text, str) else list(text)
    if not words:
        return False  # nothing to tag

    # 'NNP' / 'NNPS' are the tags checked for singular / plural proper nouns.
    return any(tag in ('NNP', 'NNPS') for _, tag in pos_tag(words))

In [27]:
# Filter the (text, title) pairs and collect simple length statistics.
tokens = []       # whitespace-token count of every kept text
count = 0         # number of kept pairs
text_data = []    # cleaned texts that passed the filters
title_data = []   # cleaned titles, aligned index-by-index with text_data
sentences = []    # number of '.'-separated segments of every kept text

for sen, tit in zip(texts, titles):
    # Non-string cells (e.g. missing values) cannot be cleaned with re.sub();
    # skip them explicitly instead of hiding every error behind a bare except.
    if not isinstance(sen, str) or not isinstance(tit, str):
        continue

    # Normalize the title: letters and whitespace only, lower-cased.
    tit = re.sub(r'[^a-zA-Z\s]', '', tit).lower()
    tokenized = word_tokenize(tit)

    # Drop titles with two tokens or fewer and titles containing a proper noun.
    # has_proper_noun() tokenizes its argument itself, so it is given the
    # title string rather than the already tokenized list.
    if len(tokenized) <= 2 or has_proper_noun(tit):
        continue

    # Keep letters, whitespace and periods, then turn each ' NEWLINE' marker
    # into a period so every poem line ends like a sentence.
    sen = re.sub(r'[^a-zA-Z\s.]', '', sen)
    sen = sen.replace(' NEWLINE', '.')
    token_len = len(sen.split())
    sen_len = len(sen.split('.'))

    # Keep only texts with 5 to 50 period-separated segments.
    if 5 <= sen_len <= 50:
        count += 1
        tokens.append(token_len)
        text_data.append(sen)
        title_data.append(tit)
        sentences.append(sen_len)


print(count)
print('average tokens of story:', np.mean(tokens))
print('average sentences of story:', np.mean(sentences))
print('average tokens per sentence:', np.mean(tokens)/np.mean(sentences))

[Synset('bash.n.02'), Synset('do.n.02'), Synset('doctor_of_osteopathy.n.01'), Synset('make.v.01'), Synset('perform.v.01'), Synset('do.v.03'), Synset('do.v.04'), Synset('cause.v.01'), Synset('practice.v.01'), Synset('suffice.v.01'), Synset('do.v.08'), Synset('act.v.02'), Synset('serve.v.09'), Synset('do.v.11'), Synset('dress.v.16'), Synset('do.v.13')]
[Synset('not.r.01')]
[Synset('go.n.01'), Synset('adam.n.03'), Synset('crack.n.09'), Synset('go.n.04'), Synset('travel.v.01'), Synset('go.v.02'), Synset('go.v.03'), Synset('become.v.01'), Synset('go.v.05'), Synset('run.v.05'), Synset('run.v.03'), Synset('proceed.v.04'), Synset('go.v.09'), Synset('go.v.10'), Synset('sound.v.02'), Synset('function.v.01'), Synset('run_low.v.01'), Synset('move.v.13'), Synset('survive.v.01'), Synset('go.v.16'), Synset('die.v.01'), Synset('belong.v.03'), Synset('go.v.19'), Synset('start.v.09'), Synset('move.v.15'), Synset('go.v.22'), Synset('go.v.23'), Synset('blend.v.02'), Synset('go.v.25'), Synset('fit.v.02'), 

KeyboardInterrupt: ignored

In [54]:
# Sanity check: titles and texts are appended together, so both lengths should match.
len(title_data), len(text_data)

(7000, 7000)

In [42]:
# Cap the poetry data at the first 7000 (title, text) pairs.
title_data, text_data = title_data[:7000], text_data[:7000]

In [55]:
# Sanity check after truncation: both lists should still have the same length.
len(text_data), len(title_data)

(7000, 7000)

In [57]:
# Build one {'Theme': title, 'keywords': [[kw, ...], ...]} record per poem and
# persist the records as JSON.
poems = []
poetry_out_path = 'poetry_yake_keywords.json'

for sen, tit in zip(text_data, title_data):
    final_keywords = []
    for sent in sen.split('.'):
        if not sent.strip():
            continue  # blank segment: nothing to extract

        # Stored as `extracted`, not `yake`, so the imported yake module is not
        # overwritten (which would break re-running the extractor setup cell).
        extracted = algorithm.extract_keywords(sent)

        # Each result pairs a keyword string with a numeric score.
        # NOTE(review): the position of the keyword inside the tuple appears to
        # depend on the yake version / dedupLim setting, so take whichever
        # member is the string instead of hard-coding an index.
        keywords = [kw[0] if isinstance(kw[0], str) else kw[1] for kw in extracted]

        # Only keep segments that produced at least two keywords.
        if len(keywords) >= 2:
            final_keywords.append(keywords)

    poems.append({'Theme': tit, 'keywords': final_keywords})

    # Periodic checkpoint (after record 1, 501, 1001, ...); the context manager
    # makes sure the file is flushed and closed every time.
    if len(poems) % 500 == 1:
        with open(poetry_out_path, 'w') as out_file:
            json.dump(poems, out_file)

# Final write: without it, the records added after the last checkpoint would
# never reach the file.
with open(poetry_out_path, 'w') as out_file:
    json.dump(poems, out_file)

**Generating Keywords for Dataset 2: News**

In [10]:
# Title and text columns of the news frame, consumed by the filtering loop below.
titles, texts = df_2['title'], df_2['text']

In [11]:
def has_proper_noun(text):
    """Return True if any token of ``text`` is POS-tagged as a proper noun.

    Parameters
    ----------
    text : str or list of str
        Either raw text, which is tokenized with ``word_tokenize``, or an
        already tokenized sequence of words, which is tagged as-is.

    Returns
    -------
    bool
        True when at least one token is tagged ``NNP`` or ``NNPS``.
    """
    # The filtering loop in this notebook calls this function with the output
    # of word_tokenize(); word_tokenize() itself expects a string, so only
    # tokenize when a string was given and accept token lists unchanged.
    words = word_tokenize(text) if isinstance(text, str) else list(text)
    if not words:
        return False  # nothing to tag

    # 'NNP' / 'NNPS' are the tags checked for singular / plural proper nouns.
    return any(tag in ('NNP', 'NNPS') for _, tag in pos_tag(words))

In [12]:
# Filter the (text, title) pairs and collect simple length statistics.
tokens = []       # whitespace-token count of every kept text
count = 0         # number of kept pairs
text_data = []    # cleaned texts that passed the filters
title_data = []   # cleaned titles, aligned index-by-index with text_data
sentences = []    # number of '.'-separated segments of every kept text

for sen, tit in zip(texts, titles):
    # Non-string cells (e.g. missing values) cannot be cleaned with re.sub();
    # skip them explicitly instead of hiding every error behind a bare except.
    if not isinstance(sen, str) or not isinstance(tit, str):
        continue

    # Normalize the title: letters and whitespace only, lower-cased.
    tit = re.sub(r'[^a-zA-Z\s]', '', tit).lower()
    tokenized = word_tokenize(tit)

    # Drop titles with two tokens or fewer and titles containing a proper noun.
    # has_proper_noun() tokenizes its argument itself, so it is given the
    # title string rather than the already tokenized list.
    if len(tokenized) <= 2 or has_proper_noun(tit):
        continue

    # Keep letters, whitespace and periods only.
    sen = re.sub(r'[^a-zA-Z\s.]', '', sen)
    # Replace the upper-case ' NEWLINE' marker BEFORE lower-casing: once the
    # text is lower-cased the marker can no longer match.
    sen = sen.replace(' NEWLINE', '.').lower()
    token_len = len(sen.split())
    sen_len = len(sen.split('.'))

    # Keep only texts with 5 to 50 period-separated segments.
    if 5 <= sen_len <= 50:
        count += 1
        tokens.append(token_len)
        text_data.append(sen)
        title_data.append(tit)
        sentences.append(sen_len)


print(count)
print('average tokens of story:', np.mean(tokens))
print('average sentences of story:', np.mean(sentences))
print('average tokens per sentence:', np.mean(tokens)/np.mean(sentences))

15887
average tokens of story: 57.77856108768175
average sentences of story: 7.377163718763769
average tokens per sentence: 7.832083343998772


In [13]:
# Sanity check: titles and texts are appended together, so both lengths should match.
len(title_data), len(text_data)

(15887, 15887)

In [15]:
# Cap the news data at the first 5000 (title, text) pairs.
title_data, text_data = title_data[:5000], text_data[:5000]

In [16]:
# Sanity check after truncation: both lists should still have the same length.
len(text_data), len(title_data)

(5000, 5000)

In [17]:
# Build one {'Theme': title, 'keywords': [[kw, ...], ...]} record per news
# item and persist the records as JSON.
news = []
news_out_path = 'news_yake_keywords.json'

for sen, tit in zip(text_data, title_data):
    final_keywords = []
    for sent in sen.split('.'):
        if not sent.strip():
            continue  # blank segment: nothing to extract

        # Stored as `extracted`, not `yake`, so the imported yake module is not
        # overwritten (which would break re-running the extractor setup cell).
        extracted = algorithm.extract_keywords(sent)

        # Each result pairs a keyword string with a numeric score.
        # NOTE(review): the position of the keyword inside the tuple appears to
        # depend on the yake version / dedupLim setting, so take whichever
        # member is the string instead of hard-coding an index.
        keywords = [kw[0] if isinstance(kw[0], str) else kw[1] for kw in extracted]

        # Only keep segments that produced at least two keywords.
        if len(keywords) >= 2:
            final_keywords.append(keywords)

    news.append({'Theme': tit, 'keywords': final_keywords})

    # Periodic checkpoint (after record 1, 501, 1001, ...); the context manager
    # makes sure the file is flushed and closed every time.
    if len(news) % 500 == 1:
        with open(news_out_path, 'w') as out_file:
            json.dump(news, out_file)

# Final write: without it, the records added after the last checkpoint would
# never reach the file.
with open(news_out_path, 'w') as out_file:
    json.dump(news, out_file)

**Generating Keywords for Dataset 3: ROC**

In [18]:
# Title and text columns of the ROC frame, consumed by the filtering loop below.
titles, texts = df_3['title'], df_3['text']

In [19]:
def has_proper_noun(text):
    """Return True if any token of ``text`` is POS-tagged as a proper noun.

    Parameters
    ----------
    text : str or list of str
        Either raw text, which is tokenized with ``word_tokenize``, or an
        already tokenized sequence of words, which is tagged as-is.

    Returns
    -------
    bool
        True when at least one token is tagged ``NNP`` or ``NNPS``.
    """
    # The filtering loop in this notebook calls this function with the output
    # of word_tokenize(); word_tokenize() itself expects a string, so only
    # tokenize when a string was given and accept token lists unchanged.
    words = word_tokenize(text) if isinstance(text, str) else list(text)
    if not words:
        return False  # nothing to tag

    # 'NNP' / 'NNPS' are the tags checked for singular / plural proper nouns.
    return any(tag in ('NNP', 'NNPS') for _, tag in pos_tag(words))

In [20]:
# Filter the (text, title) pairs and collect simple length statistics.
tokens = []       # whitespace-token count of every kept text
count = 0         # number of kept pairs
text_data = []    # cleaned texts that passed the filters
title_data = []   # cleaned titles, aligned index-by-index with text_data
sentences = []    # number of '.'-separated segments of every kept text

for sen, tit in zip(texts, titles):
    # Non-string cells (e.g. missing values) cannot be cleaned with re.sub();
    # skip them explicitly instead of hiding every error behind a bare except.
    if not isinstance(sen, str) or not isinstance(tit, str):
        continue

    # Normalize the title: letters and whitespace only, lower-cased.
    tit = re.sub(r'[^a-zA-Z\s]', '', tit).lower()
    tokenized = word_tokenize(tit)

    # Drop titles with two tokens or fewer and titles containing a proper noun.
    # has_proper_noun() tokenizes its argument itself, so it is given the
    # title string rather than the already tokenized list.
    if len(tokenized) <= 2 or has_proper_noun(tit):
        continue

    # Keep letters, whitespace and periods only.
    sen = re.sub(r'[^a-zA-Z\s.]', '', sen)
    # Replace the upper-case ' NEWLINE' marker BEFORE lower-casing: once the
    # text is lower-cased the marker can no longer match.
    sen = sen.replace(' NEWLINE', '.').lower()
    token_len = len(sen.split())
    sen_len = len(sen.split('.'))

    # Keep only texts with 5 to 50 period-separated segments.
    if 5 <= sen_len <= 50:
        count += 1
        tokens.append(token_len)
        text_data.append(sen)
        title_data.append(tit)
        sentences.append(sen_len)


print(count)
print('average tokens of story:', np.mean(tokens))
print('average sentences of story:', np.mean(sentences))
print('average tokens per sentence:', np.mean(tokens)/np.mean(sentences))

14587
average tokens of story: 44.09350791800919
average sentences of story: 5.923424967436759
average tokens per sentence: 7.443921069382559


In [21]:
# Sanity check: titles and texts are appended together, so both lengths should match.
len(title_data), len(text_data)

(14587, 14587)

In [22]:
# Cap the ROC data at the first 7000 (title, text) pairs.
title_data, text_data = title_data[:7000], text_data[:7000]

In [23]:
# Sanity check after truncation: both lists should still have the same length.
len(text_data), len(title_data)

(7000, 7000)

In [24]:
# Build one {'Theme': title, 'keywords': [[kw, ...], ...]} record per ROC
# story and persist the records as JSON.
roc = []
roc_out_path = 'roc_yake_keywords.json'

for sen, tit in zip(text_data, title_data):
    final_keywords = []
    for sent in sen.split('.'):
        if not sent.strip():
            continue  # blank segment: nothing to extract

        # Stored as `extracted`, not `yake`, so the imported yake module is not
        # overwritten (which would break re-running the extractor setup cell).
        extracted = algorithm.extract_keywords(sent)

        # Each result pairs a keyword string with a numeric score.
        # NOTE(review): the position of the keyword inside the tuple appears to
        # depend on the yake version / dedupLim setting, so take whichever
        # member is the string instead of hard-coding an index.
        keywords = [kw[0] if isinstance(kw[0], str) else kw[1] for kw in extracted]

        # Only keep segments that produced at least two keywords.
        if len(keywords) >= 2:
            final_keywords.append(keywords)

    roc.append({'Theme': tit, 'keywords': final_keywords})

    # Periodic checkpoint (after record 1, 501, 1001, ...); the context manager
    # makes sure the file is flushed and closed every time.
    if len(roc) % 500 == 1:
        with open(roc_out_path, 'w') as out_file:
            json.dump(roc, out_file)

# Final write: without it, the records added after the last checkpoint would
# never reach the file.
with open(roc_out_path, 'w') as out_file:
    json.dump(roc, out_file)