In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset


ds = load_dataset("lucadiliello/newsqa")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
ds

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers', 'key', 'labels'],
        num_rows: 74160
    })
    validation: Dataset({
        features: ['context', 'question', 'answers', 'key', 'labels'],
        num_rows: 4212
    })
})

In [3]:

df_train = pd.DataFrame(ds["train"])
df_val = pd.DataFrame(ds["validation"])


In [4]:
df_train

Unnamed: 0,context,question,answers,key,labels
0,"NEW DELHI, India (CNN) -- A high court in nort...",What was the amount of children murdered?,[19],da0e6b66e04d439fa1ba23c32de07e50,"[{'end': [295], 'start': [294]}]"
1,"NEW DELHI, India (CNN) -- A high court in nort...",When was Pandher sentenced to death?,[February.],724f6eb9a2814e4fb2d7d8e4de846073,"[{'end': [269], 'start': [261]}]"
2,"NEW DELHI, India (CNN) -- A high court in nort...",The court aquitted Moninder Singh Pandher of w...,[rape and murder],d64cbb90e5134081acfa83d3e702408c,"[{'end': [638], 'start': [624]}]"
3,"NEW DELHI, India (CNN) -- A high court in nort...",who was acquitted,[Moninder Singh Pandher],fd7177ee6f1f4d62becd983a0305f503,"[{'end': [216], 'start': [195]}]"
4,"NEW DELHI, India (CNN) -- A high court in nort...",who was sentenced,[Moninder Singh Pandher],cd25c69f631349748ccdeccaace66463,"[{'end': [216], 'start': [195]}]"
...,...,...,...,...,...
74155,"OAKLAND, California (CNN) -- Fifth-grader Chri...",What happened to Christopher Rodriguez?,"[was hit by a stray bullet, paralyzing him for...",c0ac3ef6afb94dbe8e1666b8bfbf5237,"[{'end': [259], 'start': [209]}]"
74156,"OAKLAND, California (CNN) -- Fifth-grader Chri...",What did Christopher Rodriguez love?,[music.],683cfaf6ec1c47189172300b4aaa3f91,"[{'end': [985], 'start': [980]}]"
74157,"OAKLAND, California (CNN) -- Fifth-grader Chri...",WIll the boy walk again?,[paralyzed for life],0da315692fd04023b7205fab7aeeb26e,"[{'end': [701], 'start': [684]}]"
74158,"OAKLAND, California (CNN) -- Fifth-grader Chri...",What did the suspect allegedly rob?,[Chevron gas station],74a4fa2d8548463f8fdf9a370d7ea5ff,"[{'end': [423], 'start': [405]}]"


# **TEXT PREPROCESSING AND TOKENIZATION**

In [5]:
df_train['text'] = df_train['context'] + df_train['question']
df_train["text"] = df_train["text"].apply(str.lower)

tokenization and stopwords removal

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt_tab')
from nltk import word_tokenize

stop = set(stopwords.words('english'))
stop.update(['.', ',', '!', '?', ';', ':', '-', '--', '(', ')', '[', ']', '{', '}',
    '"', "'", '’', '“', '”', '…', '—', '/', '\\', '|', '@', '#', '$', '%',
    '^', '&', '*', '_', '+', '=', '<', '>', '~', '`'])

def tokenize(text):
  tokens = word_tokenize(text)
  tokens = [word for word in tokens if word not in stop]
  return tokens

df_train['tokenized_text'] = df_train['text'].apply(tokenize)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
df_train['tokenized_text'][0]



['new',
 'delhi',
 'india',
 'cnn',
 'high',
 'court',
 'northern',
 'india',
 'friday',
 'acquitted',
 'wealthy',
 'businessman',
 'facing',
 'death',
 'sentence',
 'killing',
 'teen',
 'case',
 'dubbed',
 '``',
 'house',
 'horrors',
 "''",
 'moninder',
 'singh',
 'pandher',
 'sentenced',
 'death',
 'lower',
 'court',
 'february',
 'teen',
 'one',
 '19',
 'victims',
 'children',
 'young',
 'women',
 'one',
 'gruesome',
 'serial',
 'killings',
 'india',
 'recent',
 'years',
 'allahabad',
 'high',
 'court',
 'acquitted',
 'moninder',
 'singh',
 'pandher',
 'lawyer',
 'sikandar',
 'b.',
 'kochar',
 'told',
 'cnn',
 'pandher',
 'domestic',
 'employee',
 'surinder',
 'koli',
 'sentenced',
 'death',
 'february',
 'lower',
 'court',
 'rape',
 'murder',
 '14-year-old',
 'high',
 'court',
 'upheld',
 'koli',
 "'s",
 'death',
 'sentence',
 'kochar',
 'said',
 'two',
 'arrested',
 'two',
 'years',
 'ago',
 'body',
 'parts',
 'packed',
 'plastic',
 'bags',
 'found',
 'near',
 'home',
 'noida',
 '

lemmatization

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize(text):
  pos_tags = pos_tag(text)
  return [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
df_train['tokenized_text'] = df_train['tokenized_text'].apply(lemmatize)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [10]:
df_train['tokenized_text'][0]


['new',
 'delhi',
 'india',
 'cnn',
 'high',
 'court',
 'northern',
 'india',
 'friday',
 'acquit',
 'wealthy',
 'businessman',
 'face',
 'death',
 'sentence',
 'kill',
 'teen',
 'case',
 'dub',
 '``',
 'house',
 'horror',
 "''",
 'moninder',
 'singh',
 'pandher',
 'sentence',
 'death',
 'lower',
 'court',
 'february',
 'teen',
 'one',
 '19',
 'victim',
 'child',
 'young',
 'woman',
 'one',
 'gruesome',
 'serial',
 'killing',
 'india',
 'recent',
 'year',
 'allahabad',
 'high',
 'court',
 'acquit',
 'moninder',
 'singh',
 'pandher',
 'lawyer',
 'sikandar',
 'b.',
 'kochar',
 'tell',
 'cnn',
 'pandher',
 'domestic',
 'employee',
 'surinder',
 'koli',
 'sentence',
 'death',
 'february',
 'lower',
 'court',
 'rape',
 'murder',
 '14-year-old',
 'high',
 'court',
 'uphold',
 'koli',
 "'s",
 'death',
 'sentence',
 'kochar',
 'say',
 'two',
 'arrest',
 'two',
 'year',
 'ago',
 'body',
 'part',
 'pack',
 'plastic',
 'bag',
 'find',
 'near',
 'home',
 'noida',
 'new',
 'delhi',
 'suburb',
 'hom

In [11]:
df_train[['tokenized_text']].to_csv('tokenized_only.csv', index=False)


In [3]:
tokenized_text = pd.read_csv('tokenized_only.csv')

In [5]:
corpus = tokenized_text['tokenized_text'].tolist()


In [18]:
import ast

# Convert stringified lists back to Python lists
tokenized_text["tokenized_text"] = tokenized_text["tokenized_text"].apply(ast.literal_eval)

# Verify
print(type(tokenized_text["tokenized_text"].iloc[0]))
print(tokenized_text["tokenized_text"].iloc[0][:20])


<class 'list'>
['new', 'delhi', 'india', 'cnn', 'high', 'court', 'northern', 'india', 'friday', 'acquit', 'wealthy', 'businessman', 'face', 'death', 'sentence', 'kill', 'teen', 'case', 'dub', '``']


In [20]:
corpus = tokenized_text['tokenized_text'].tolist()

In [21]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import multiprocessing

class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training progress'''
    def __init__(self):
        self.epoch = 0
    def on_epoch_begin(self, model):
        print(f"Epoch #{self.epoch} start")
    def on_epoch_end(self, model):
        print(f"Epoch #{self.epoch} end\n")
        self.epoch += 1

model = Word2Vec(
    sentences=corpus,
    vector_size=200,
    window=5,
    min_count=3,
    workers=multiprocessing.cpu_count() - 1,
    sg=0,
    epochs=10,
    compute_loss=True,
    callbacks=[EpochLogger()]
)

Epoch #0 start
Epoch #0 end

Epoch #1 start
Epoch #1 end

Epoch #2 start
Epoch #2 end

Epoch #3 start
Epoch #3 end

Epoch #4 start
Epoch #4 end

Epoch #5 start
Epoch #5 end

Epoch #6 start
Epoch #6 end

Epoch #7 start
Epoch #7 end

Epoch #8 start
Epoch #8 end

Epoch #9 start
Epoch #9 end



In [22]:
model.save("word2vec_model.model")

In [23]:
print(len(model.wv))               # total unique words in vocab
print(list(model.wv.index_to_key[90:120]))  # first 20 words

94925
['thursday', 'member', 'wednesday', 'still', 'try', 'national', 'company', 'release', 'area', 'place', 'percent', 'issue', 'international', 'security', 'good', 'ask', 'news', 'lead', 'become', 'school', 'former', 'start', 'live', 'another', 'end', 'begin', 'much', 'plan', 'hold', 'believe']


In [24]:
model.wv.most_similar("court")
model.wv.most_similar("murder")

[('slay', 0.576947033405304),
 ('manslaughter', 0.5182812213897705),
 ('kidnap', 0.5176623463630676),
 ('acquit', 0.5035039186477661),
 ('felony', 0.49729272723197937),
 ('convict', 0.495853990316391),
 ('killing', 0.4890891909599304),
 ('robbery', 0.4834452271461487),
 ('first-degree', 0.47746261954307556),
 ('stab', 0.47710496187210083)]

In [25]:
top_words = model.wv.index_to_key[:50]
word_vectors = [model.wv[word] for word in top_words]
df_embeddings = pd.DataFrame(word_vectors, index=top_words)

In [26]:
df_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
``,-0.36117,1.516741,1.028939,-1.3771,0.133204,0.522527,-0.418094,-2.46211,-1.892542,0.150559,...,-0.888105,0.070932,1.560771,0.705938,1.579257,-2.039714,-0.627125,0.829076,-0.717064,-0.557131
'',-0.2553,1.681324,0.662874,-1.420188,0.212264,0.052463,0.254534,-2.108332,-1.171206,0.943193,...,-1.363351,0.496121,0.136198,0.821429,2.130521,-2.058803,-1.264767,1.741799,-1.209875,-0.077624
say,0.141538,0.835803,0.237558,0.390784,-0.331544,1.214197,1.792119,-2.567497,-0.901627,0.175093,...,-0.239709,0.761795,0.317263,0.595268,1.313167,-2.014267,-1.155276,2.193691,-0.706406,-1.379522
's,0.585275,0.84989,1.037752,-0.07872,0.097959,-0.196285,0.838804,-0.354621,-0.636093,-0.895439,...,-1.369237,-0.325578,-0.529117,0.273979,-0.187799,-1.788205,-0.870013,1.917431,0.319509,-2.405939
cnn,0.812658,0.41865,1.880888,2.932276,0.506234,-0.704398,0.090168,0.152963,-1.706315,-0.413297,...,-0.070025,1.803373,-0.532503,-1.667773,2.033447,-0.576031,-1.554168,1.951517,-0.095915,-3.200901
people,-0.211461,2.086711,-0.211525,-4.16775,0.796409,1.430381,-2.710118,-5.342401,-0.746095,-2.399751,...,0.434005,-0.148265,0.577547,1.304122,0.330665,0.683191,-1.4468,-0.116551,-0.089309,1.464846
year,-2.659751,2.238245,0.089244,0.559302,0.760954,1.249498,-0.794953,-1.581326,3.069119,-1.556584,...,-0.333062,0.713334,2.620226,0.045591,-2.963086,0.472321,3.341413,3.419948,-0.298975,3.581706
one,-0.377224,3.027302,1.717523,-0.653973,-0.208785,-1.154179,-0.99795,-1.992374,-1.198942,-0.85864,...,-1.708911,0.082929,0.560814,-0.050011,-1.132393,-0.174504,0.300299,1.783088,0.142857,-1.135865
n't,-1.631325,0.684061,1.639793,-2.537292,2.135305,0.90956,-2.159183,-2.868125,-1.997759,1.238164,...,-0.15938,-0.962293,1.497992,-0.058427,4.060013,-2.773873,-1.8447,1.191974,-1.028937,0.243959
would,-0.891937,0.673722,-0.224039,-2.313442,1.807164,-0.119215,-1.131726,-2.148105,-1.765224,-1.265452,...,-1.754354,-0.80781,0.559151,1.635334,-0.408908,-3.92958,-1.437317,-0.126264,-1.295444,1.448561


In [27]:
df_embeddings.to_csv("top50_word_embeddings.csv")