In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
data_path = "/content/gdrive/My Drive/Data" 
model_path = "/content/gdrive/My Drive/Models"

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [None]:
data = pd.read_csv(data_path + "/ner_dataset.csv", encoding="latin1").fillna(method="ffill")

In [None]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [None]:
class SentGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(), s["POS"].values.tolist(), s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sents = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence:", self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

In [None]:
getter = SentGetter(data)

In [None]:
sents = [[col[0] for col in sent] for sent in getter.sents]
labels = [[col[2] for col in sent] for sent in getter.sents]

In [None]:
print(sents[0])
print(labels[0])

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [None]:
def replace_tags(labels):
  return [['B' if l.startswith('B') else 'I' if l.startswith('I') else 'O' for l in label] for label in labels]

In [None]:
labels = replace_tags(labels)

In [None]:
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O']


In [None]:
tag_values = list(set([l for label in labels for l in label]))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [None]:
tag2idx

{'B': 0, 'I': 2, 'O': 1, 'PAD': 3}

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
n_gram_range = (1, 2)
stop_words = "english"

count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words, lowercase=False).fit([" ".join(sents[0])])
candidates = count.get_feature_names()

In [None]:
candidates

['British',
 'British troops',
 'Iraq',
 'Iraq demand',
 'London',
 'London protest',
 'Thousands',
 'Thousands demonstrators',
 'country',
 'demand',
 'demand withdrawal',
 'demonstrators',
 'demonstrators marched',
 'marched',
 'marched London',
 'protest',
 'protest war',
 'troops',
 'troops country',
 'war',
 'war Iraq',
 'withdrawal',
 'withdrawal British']

In [None]:
pip install transformers



In [None]:
import torch
from transformers import BertTokenizer, BertConfig

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [None]:
from transformers import BertModel

In [None]:
bert = BertModel.from_pretrained('bert-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




In [None]:
doc_emb = bert(torch.tensor([tokenizer.encode(sents[0])]))[1]

In [None]:
torch.tensor()

ValueError: ignored

In [None]:
candidates_emb = bert(torch.tensor([tokenizer.encode(c) for c in candidates]))[1]

ValueError: ignored

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import itertools

In [None]:
top_n = 20
distances = cosine_similarity(doc_emb.detach().numpy(), candidates_emb.detach().numpy())
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

IndexError: ignored

In [None]:
distances.argsort()[0][-10:]

array([15,  2, 12, 22, 24,  9,  5, 10,  0,  1])

In [None]:
len(candidates)

23

In [None]:
keywords = [candidates[i] for i in distances.argsort()[0][-10:]]

IndexError: ignored

In [None]:
pip install sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f5/5a/6e41e8383913dd2ba923cdcd02be2e03911595f4d2f9de559ecbed80d2d3/sentence-transformers-0.3.9.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 3.4MB/s 
[?25hCollecting transformers<3.6.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 7.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 40.7MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K  

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import itertools

In [None]:
def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    # Calculate distances and extract keywords
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    distances_candidates = cosine_similarity(candidate_embeddings, 
                                            candidate_embeddings)

    # Get top_n words as candidates based on cosine similarity
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [candidates[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Calculate the combination of words that are the least similar to each other
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [None]:
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose best keyword/keyphras
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR
        mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [None]:
doc = """Are Some Kids Overscheduled?
Experts debate pros, cons of highly programmed childhood THURSDAY, March 31 (HealthDay News) - From sports practices to music lessons to community service, American kids always seem to have plenty to keep them busy. But whether they're actually too busy - reaching a tipping point detrimental to their mental and physical health - remains a topic of debate. The subject of overscheduled children has been on scientists' radar for at least a decade, said Andrea Mata, a doctoral student at Kent State University whose recent study on highly involved children was scheduled for presentation Thursday at a symposium in Montreal run by the Society for Research in Child Development. ""I think it's a hot topic right now,"" Mata said. ""There's definitely a mix of viewpoints. So I think a lot more research is needed to find out what's going on."" The SRCD symposium will examine which children and adolescents become overscheduled, what happens at high levels of extracurricular involvement, and how factors such as school grades and aggression levels are affected. Between 70 percent and 83 percent of American children and teens claim to take part in at least one extracurricular pursuit, spending an average of five to nine hours per week in structured activities, according to the SRCD. Only 5 percent to 7 percent, however, devote more than 20 hours per week to these activities. Jean Twenge, author of the book Generation Me and a professor of psychology at San Diego State University, said data gathered between the 1950s and the 1990s indicated overscheduling rose during that period and then leveled off. ""Are kids really overscheduled? It's not the average experience, but that doesn't mean it's not a problem,"" Twenge said. ""Parents worry about keeping up, but it's certain types of parents who worry about it."" Twenge said the ever-mounting competition for admission to the nation's top colleges compels some parents and kids to fill every spare hour with impressive-looking endeavors. Mata's study followed 1354 children from birth through age 15, dividing them into groups based on how involved they were outside of school and home. The 43 children in the highest activity level averaged 129 minutes per week of structured activities at kindergarten, which increased to 254 minutes weekly by fifth grade. Highly involved children were more likely to be girls from more affluent families, Mata said, and their mothers had attained higher education levels. This group had higher grades and lower levels of delinquency, among other behavioral and academic measurements, compared to less-involved children, she said. ""We're looking at it in a much more positive way,"" Mata said. ""These highly involved kids are highly adaptive and high-functioning."" Linda Balog, former executive director of the Child and Adolescent Stress Management Institute at State University of New York at Brockport, said parents should ask their children how they feel about their extracurricular pursuits and whether they feel overwhelmed and stressed. ""We see some kids forced into organized sports at early ages and then get so burned out that they opt not to play in high school,"" said Balog, an associate professor of health sciences who's teaching a course on child and adolescent stress. ""Sometimes parents live through their children - a sort of surrogate self,"" she added. ""I think we have to err on the side of backing off a bit . . . as opposed to everything being organized and structured."" Experts note that research presented at meetings is considered preliminary until it is published in a peer-reviewed journal. """

In [None]:
n_gram_range = (1, 1)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
candidates = count.get_feature_names()

In [None]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

100%|██████████| 245M/245M [00:34<00:00, 7.16MB/s]


In [None]:
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

In [None]:
top_n = 20
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

In [None]:
keywords

['competition',
 'hour',
 'montreal',
 'parents',
 '1990s',
 'symposium',
 'university',
 'mothers',
 'week',
 'decade',
 'academic',
 'psychology',
 '1950s',
 '1354',
 'professor',
 'doctoral',
 'healthday',
 'scientists',
 'weekly',
 'thursday']

In [None]:
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=10, nr_candidates=20)

['competition',
 'montreal',
 'parents',
 'psychology',
 '1950s',
 '1354',
 'healthday',
 'scientists',
 'weekly',
 'thursday']

In [None]:
mmr(doc_embedding, candidate_embeddings, candidates, top_n=30, diversity=0.7)

['thursday',
 'kindergarten',
 'scientists',
 'burned',
 'healthday',
 '1950s',
 'montreal',
 'mothers',
 '1990s',
 'psychology',
 'competition',
 'american',
 'book',
 'affluent',
 'news',
 'symposium',
 'weekly',
 'teens',
 'live',
 'march',
 'university',
 'teaching',
 '1354',
 'delinquency',
 'outside',
 'radar',
 'doctoral',
 'srcd',
 'busy',
 'recent']