In [3]:
import pandas as pd
import numpy as np

import re
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words as nltk_words
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize

import spacy
import en_core_web_sm

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saxen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saxen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saxen\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\saxen\AppData\Roaming\nltk_data...
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\saxen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\saxen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [4]:
nlp = en_core_web_sm.load()

# Hypothesis:
- remove all the punctuation and stopwords 
- remove all the punctuation, stopwords and lemmatize
-------------- 
_potential problem is that number of keywords is pretty high. Can tackle that in 2 ways:
split the data into 2 data points with random sampling, or just by half, or using manual labelling. Or just drop random sub-set of words, or not random_
- named entity extraction
--------------
_potential problem, same entity for many outputs, e.g. ('War in Ukraine: Taking cover in a town under attack',
 'Jeremy Bowen was on the frontline in Irpin, as residents came under Russian fire while trying to flee.')
 (the only entity at the beginning is Ukraine). When keep only >= 3 entities, dataset becomes pretty small_

In [6]:
data = pd.read_csv('bbc_news.csv')
data.head(5)

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [7]:
data = data.drop_duplicates(subset=['title']).drop_duplicates(subset=['guid']).drop_duplicates(subset=['description']).reset_index(drop=True)
data.head(5)

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [8]:
STOPWORDS = stopwords.words('english')
pd.Series(STOPWORDS)

0             i
1            me
2            my
3        myself
4            we
         ...   
174     weren't
175         won
176       won't
177      wouldn
178    wouldn't
Length: 179, dtype: object

In [9]:
data.loc[100, 'title']



In [10]:
lemmatizer = WordNetLemmatizer()
sent = data.loc[100, 'title']
pos_tag(word_tokenize(sent))

[('Cost', 'NN'),
 ('of', 'IN'),
 ('living', 'NN'),
 (':', ':'),
 ('UK', 'NNP'),
 ('faces', 'VBZ'),
 ('biggest', 'JJS'),
 ('income', 'NN'),
 ('squeeze', 'NN'),
 ('in', 'IN'),
 ('nearly', 'RB'),
 ('50', 'CD'),
 ('years', 'NNS')]

In [11]:
def transform(sentence):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in pos_tag(word_tokenize(sentence)):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            lemma = word
        else:
            lemma = lemmatizer.lemmatize(word, wntag)
        lemmas.append(lemma)
    return ' '.join(lemmas)

In [12]:
def clear(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(f'[{re.escape(string.punctuation)}]', '', sentence)
    sentence = re.sub(r'\s+0+[0-9]*', '', sentence)
    return ' '.join(filter(lambda w: w not in STOPWORDS, nltk.tokenize.word_tokenize(sentence)))

In [13]:
def remove_artifacts(sentence):
    sentence = re.sub(r'\s*[0-9]+', '', sentence)
    return sentence

In [14]:
transform(clear(data.loc[100, 'title']))

'cost living warn uk face biggest income squeeze nearly 50 year'

In [15]:
doc = nlp(data.loc[100, 'title'])
[(X.text, X.label_) for X in doc.ents]

[('UK', 'GPE'), ('nearly 50 years', 'DATE')]

# Punctuation and stopwords

In [16]:
data['combined'] = data['title'] + ' ' + data['description']
data['combined']

0        Ukraine: Angry Zelensky vows to punish Russian...
1        War in Ukraine: Taking cover in a town under a...
2        Ukraine war 'catastrophic for global food' One...
3        Manchester Arena bombing: Saffie Roussos's par...
4        Ukraine conflict: Oil price soars to highest l...
                               ...                        
32053    Bell leads England to ODI clean sweep over New...
32054    Murrays and all-British battles headline day f...
32055    Highlights: Raducanu dominant in win over Mert...
32056    Chasing the 'Ghost' - a superstar struck down ...
32057    Le Pen's far-right at the gates of power The N...
Name: combined, Length: 32058, dtype: object

In [17]:
data_ps = pd.DataFrame()
data_ps['combined'] = data['combined'].apply(clear)
data_ps

Unnamed: 0,combined
0,ukraine angry zelensky vows punish russian atr...
1,war ukraine taking cover town attack jeremy bo...
2,ukraine war catastrophic global food one world...
3,manchester arena bombing saffie roussoss paren...
4,ukraine conflict oil price soars highest level...
...,...
32053,bell leads england odi clean sweep new zealand...
32054,murrays allbritish battles headline day four a...
32055,highlights raducanu dominant win mertens emma ...
32056,chasing ghost superstar struck lightning john ...


In [18]:
data_ps['combined'].apply(lambda x: len(x.split(' '))).describe()

count    32058.000000
mean        18.601940
std          4.098361
min          4.000000
25%         16.000000
50%         18.000000
75%         20.000000
max         41.000000
Name: combined, dtype: float64

In [19]:
data_ps.to_csv('punctuation_stopwords.csv', index=False)

# Punctuation, stopwords and lemmatization

In [20]:
data_psl = pd.DataFrame()
data_psl['combined'] = data['combined'].apply(clear).apply(transform)
data_psl

Unnamed: 0,combined
0,ukraine angry zelensky vow punish russian atro...
1,war ukraine take cover town attack jeremy bowe...
2,ukraine war catastrophic global food one world...
3,manchester arena bomb saffie roussoss parent h...
4,ukraine conflict oil price soar highest level ...
...,...
32053,bell lead england odi clean sweep new zealand ...
32054,murray allbritish battle headline day four and...
32055,highlight raducanu dominant win mertens emma r...
32056,chase ghost superstar strike lightning john wh...


In [21]:
data_psl.to_csv('punctuation_stopwords_lemmatization.csv', index=False)

# Named Entity Extraction

In [22]:
data_nee = pd.DataFrame()
data_nee['combined'] = data['combined'].apply(lambda x: ' '.join([e.text for e in nlp(x).ents])).apply(remove_artifacts)
data_nee

Unnamed: 0,combined
0,Ukraine Angry Zelensky Russian Ukrainian
1,Ukraine Jeremy Bowen Irpin Russian
2,Ukraine One
3,Manchester Arena Roussos Manchester Arena
4,Ukraine
...,...
32053,Bell England New Zealand Lauren Bell England N...
32054,British Andy Murray Wimbledon Thursday Jamie t...
32055,Mertens Emma Raducanu- Elise Mertens third Wim...
32056,John White the Cup Winners Cup


In [23]:
data_nee = data_nee[data_nee['combined'].apply(lambda x: len(x.split(' '))) >= 2]
data_nee

Unnamed: 0,combined
0,Ukraine Angry Zelensky Russian Ukrainian
1,Ukraine Jeremy Bowen Irpin Russian
2,Ukraine One
3,Manchester Arena Roussos Manchester Arena
5,Ukraine PM Boris Johnson Canadian Dutch
...,...
32053,Bell England New Zealand Lauren Bell England N...
32054,British Andy Murray Wimbledon Thursday Jamie t...
32055,Mertens Emma Raducanu- Elise Mertens third Wim...
32056,John White the Cup Winners Cup


In [24]:
cv = CountVectorizer(analyzer='word')
cv_data = cv.fit_transform(data_nee['combined'])
dtm_data = pd.DataFrame(cv_data.toarray(), columns=cv.get_feature_names_out())
dtm_data.index = data_nee.index
dtm_data.head(5)

Unnamed: 0,aa,aaa,aaliyah,aamir,aanoch,aardman,aaron,aarons,aarwangen,aastha,...,zuu,zuwara,zverev,zwelithini,zwerner,zúñiga,élysée,élysées,úsuga,šefčovič
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
data_nee.to_csv('named_entity_extraction.csv', index=False)