# Extracting news articles from the official UFC website

In [281]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import string
import nltk

In [308]:
# Fetch the NLTK resources used further down: 'stopwords' feeds the stop-word
# removal step and 'wordnet' is requested for the WordNetLemmatizer step.
nltk.download(['stopwords', 'wordnet'])

[nltk_data] Downloading package stopwords to /home/sanjiv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sanjiv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Web scraping

In [283]:
# Target number of articles; listing pages are fetched while fewer links than this are collected.
article_num = 12
# Accumulates the site-relative article URLs found on the listing pages.
news_links = []
# Only hrefs that start with "/news/" are treated as article links.
link_pattern = re.compile("^/news/.*")

In [284]:
# getting urls for the individual news' pages
#
# Walks the paginated "trending" listing and appends every new "/news/..." href
# to news_links until at least article_num links are collected or the listing
# stops yielding new links.

p = 0
while len(news_links) < article_num:
  r = requests.get(
    'https://www.ufc.com/trending/all?filters%5B0%5D=type%3Aarticle&created=1&page={}'.format(p),
    timeout=30,  # never hang the notebook on a stalled connection
  )
  r.raise_for_status()  # stop on HTTP errors instead of parsing an error page
  soup = BeautifulSoup(r.content, 'html.parser')

  container = soup.find(class_='views-element-container')
  if container is None:
    # no listing container on this page: nothing to parse, so stop paging
    break
  news_cards = [ele.find('a') for ele in container.find_all(class_='grid-item')]

  added = 0
  for n in news_cards:
    # cards without an anchor, and anchors without an href, are skipped
    link = n.get('href') if n is not None else None
    if link and link_pattern.match(link) and link not in news_links:
      news_links.append(link)
      added += 1

  if added == 0:
    # a page that contributes nothing new means the listing is exhausted;
    # without this guard the loop would never terminate
    break
  p += 1

In [285]:
# Number of collected links; it can exceed article_num because all matching links of a page are appended.
len(news_links)

14

In [286]:
# scraping content from each article page
#
# For each of the first article_num links, download the page and extract the
# category labels, title, body text and tags. Missing elements yield an empty
# string (title/body) or an empty list (categories/tags).

articles = []

for lnk in news_links[:article_num]:

  r = requests.get('https://www.ufc.com{}'.format(lnk), timeout=30)
  # fail loudly: parsing an error page would silently add an all-empty row
  r.raise_for_status()
  soup = BeautifulSoup(r.content, 'html.parser')

  category_box = soup.find('div', class_='field--name-category')
  title_box = soup.find('div', class_='field--name-node-title')
  body_box = soup.find('div', class_='field--name-body-structured')
  tag_box = soup.find('div', class_='field__items l-flex--2col-1to2')

  # key order is kept as categories, title, body, tags so the resulting
  # DataFrame columns stay in the same order
  article = {
    'categories': [] if category_box is None else [
      c.get_text() for c in category_box.find_all('div', class_='field__item')
    ],
    'title': '' if title_box is None else title_box.get_text(),
    'body': '' if body_box is None else body_box.get_text(),
    'tags': [] if tag_box is None else [
      t.get_text() for t in tag_box.find_all('div', class_='e-link--tag')
    ],
  }

  articles.append(article)

## Text cleaning

In [287]:
# one row per scraped article; columns come from the record keys
df = pd.DataFrame(articles)

In [288]:
# Peek at the raw scrape. No character encoding issues seen in the texts,
# but the fields still carry newlines and padding that are cleaned below.
df.head()

Unnamed: 0,categories,title,body,tags
0,[Athletes],\n SERGHEI SPIVAC IS ALL FOCUS\n\n,\n \nThe 28-year-old “Polar Bear” from Moldova...,[]
1,[Dana White's Contender Series],\n Week 4 Preview | Dana White's Contender Se...,\n \nLast week marked the first episode of the...,[\n\n card preview\n \n]
2,[Fight Coverage],\n Fight by Fight Preview | UFC Fight Night: ...,"\n \nSimilar to last September, French heavywe...","[\n\n UFC Paris\n \n, \n\n ..."
3,[Results],\n Road To UFC Final Results & Scorecards | S...,\n \nHere's how it all went down Sunday mornin...,"[\n\n Road To UFC\n \n, \n\n ..."
4,[Road To UFC],\n Updates To Road To UFC Season 2 \n\n,\n \nHaraguchi will now fight Jae Hyun Park in...,[\n\n Road To UFC\n \n]


In [289]:
def escape_n_strip(s):
  """Replace escape characters with spaces, then trim both ends.

  Every backspace, newline, carriage return, tab and backslash character in
  `s` becomes a single space; leading and trailing whitespace is then stripped.
  """
  to_space = dict.fromkeys(map(ord, '\b\n\r\t\\'), ' ')
  return s.translate(to_space).strip()

# shrink large whitespaces into one
def shrink_space(s):
  """Collapse every run of whitespace in `s` into a single space.

  Leading and trailing runs are collapsed too, not removed.
  """
  # raw string: '\s' in a normal literal is an invalid escape sequence
  return re.sub(r'\s+', ' ', s)

# collect characters that are neither alphanumeric nor ASCII punctuation
def find_impurities(s):
  """Return the set of characters in `s` that are not alphanumeric and not in string.punctuation."""
  known_punct = string.punctuation
  return {ch for ch in s if not (ch.isalnum() or ch in known_punct)}

# remove numbers
def remove_numbers(s):
  """Replace every run of digits in `s` with a single space."""
  # raw string: '\d' in a normal literal is an invalid escape sequence
  return re.sub(r'\d+', ' ', s)

# remove words in list 'r' from string 's'
def remove_words(s, r):
  """Drop every space-separated token of `s` that is contained in `r`.

  `s` is split on single spaces, so consecutive spaces produce empty tokens
  that are kept unless '' is in `r`. The surviving tokens are re-joined with
  single spaces.
  """
  if isinstance(r, str):
    # keep the substring semantics that `in` has for a plain string
    blocked = r
  else:
    try:
      # hash the words once instead of scanning the collection for every token
      blocked = frozenset(r)
    except TypeError:
      # unhashable entries: fall back to the collection's own membership test
      blocked = r
  return " ".join(word for word in s.split(" ") if word not in blocked)

In [290]:
# column names of the scraped frame
df.columns

Index(['categories', 'title', 'body', 'tags'], dtype='object')

In [291]:
# per-column non-null counts and dtypes before cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   categories  12 non-null     object
 1   title       12 non-null     object
 2   body        12 non-null     object
 3   tags        12 non-null     object
dtypes: object(4)
memory usage: 512.0+ bytes


In [292]:
# remove escape characters and collapse whitespace in every column

# text columns: clean each string directly
for text_col in ('title', 'body'):
  df[text_col] = df[text_col].apply(lambda raw: shrink_space(escape_n_strip(raw)))

# list columns: apply the same cleanup to every entry of each list
for list_col in ('categories', 'tags'):
  df[list_col] = [
    [shrink_space(escape_n_strip(entry)) for entry in entries]
    for entries in df[list_col]
  ]

In [293]:
# re-inspect the frame after the whitespace cleanup
df.head()

Unnamed: 0,categories,title,body,tags
0,[Athletes],SERGHEI SPIVAC IS ALL FOCUS,The 28-year-old “Polar Bear” from Moldova drop...,[]
1,[Dana White's Contender Series],Week 4 Preview | Dana White's Contender Series...,Last week marked the first episode of the seas...,[card preview]
2,[Fight Coverage],Fight by Fight Preview | UFC Fight Night: Gane...,"Similar to last September, French heavyweight ...","[UFC Paris, Ciryl Gane, Serghei Spivac, Manon ..."
3,[Results],Road To UFC Final Results & Scorecards | Seaso...,Here's how it all went down Sunday morning. Wa...,"[Road To UFC, results, scorecards]"
4,[Road To UFC],Updates To Road To UFC Season 2,Haraguchi will now fight Jae Hyun Park in tomo...,[Road To UFC]


In [294]:
# ASCII punctuation characters; these are part of the removal pattern built below
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [295]:
# find other strange characters

# merge the per-article impurity sets into one
strangers = set()
for found in df['body'].apply(find_impurities):
  strangers.update(found)
strangers.discard(' ')  # plain spaces are expected, not impurities
strangers = list(strangers)
strangers

['💥',
 '️',
 '🇫',
 '“',
 '😳',
 '…',
 '’',
 '☠',
 '\u200d',
 '—',
 '‘',
 '🇷',
 '🏴',
 '–',
 '”']

In [296]:
# pattern for all punctuations to remove

# alternation of every escaped ASCII punctuation mark and every extra symbol found above
unwanted_chars = [*string.punctuation, *strangers]
punctuations = "|".join(map(re.escape, unwanted_chars))
punctuations

'!|"|\\#|\\$|%|\\&|\'|\\(|\\)|\\*|\\+|,|\\-|\\.|/|:|;|<|=|>|\\?|@|\\[|\\\\|\\]|\\^|_|`|\\{|\\||\\}|\\~|💥|️|🇫|“|😳|…|’|☠|\u200d|—|‘|🇷|🏴|–|”'

In [297]:
# replace every listed symbol with a space, then collapse the gaps left behind
symbol_re = re.compile(punctuations)
df['body'] = df['body'].apply(lambda text: shrink_space(symbol_re.sub(' ', text)))

In [298]:
# remove numbers

# digit runs become spaces, then the extra whitespace is collapsed
df['body'] = df['body'].apply(lambda text: shrink_space(remove_numbers(text)))

In [299]:
# lower case

df['body'] = df['body'].map(str.lower)

In [300]:
# remove stopwords

eng_stops = nltk.corpus.stopwords.words('english')
# drop the stop words, then collapse the whitespace
df['body'] = df['body'].apply(
  lambda text: shrink_space(remove_words(text, eng_stops))
)

In [312]:
# lemmatization

lemmatizer = nltk.stem.WordNetLemmatizer()
# lemmatize each space-separated token and stitch the text back together
df['body'] = df['body'].apply(
  lambda text: " ".join(map(lemmatizer.lemmatize, text.split(" ")))
)

## View clean text & save

In [313]:
# first 500 characters of the cleaned text at row 3, column position 2 (the 'body' column)
df.iloc[3, 2][:500]

'went sunday morning watch ufc fight pas watch road ufc country road ufc official result nyamjargal tumendemberel defeat peter danesoe submission rear naked choke round rei tsuruya defeat mark climaco unanimous decision yizha defeat sangwon kim unanimous decision jiniushiyue defeat seung guk choi unanimous decision kaiwen defeat koya kanda unanimous decision xiao long defeat shuya kamikubo majority decision rongzhu defeat sang uk kim unanimous decision changho lee defeat daermisi zhawupasi tko ro'

In [302]:
# distinct category labels across all articles
catlist = df.categories.to_list()
unq_cats = {category for per_article in catlist for category in per_article}
unq_cats

{'Athletes',
 "Dana White's Contender Series",
 'Fight Coverage',
 'Results',
 'Road To UFC',
 'UFC Fight Pass',
 'Weigh-in'}

In [303]:
# distinct article titles after cleaning
df.title.unique()

array(['SERGHEI SPIVAC IS ALL FOCUS',
       "Week 4 Preview | Dana White's Contender Series Season 7",
       'Fight by Fight Preview | UFC Fight Night: Gane vs Spivac',
       'Road To UFC Final Results & Scorecards | Season 2 Semi-Finals',
       'Updates To Road To UFC Season 2',
       'Weigh-In Results | Road To UFC Asia: Episodes 5 & 6',
       'Anthony Smith Enjoys Getting Out Of His Comfort Zone',
       'Fight By Fight Preview | UFC Fight Night: Holloway vs The Korean Zombie',
       'September Episodes Dana White’s Contender Series Fights Revealed',
       'Waldo Cortes Acosta: ‘I’m Ready To Put Him Away’',
       'Rinya Nakamura Believes His Rise Has Only Begun',
       'Victoria Anthony Looks To Capitalize On Rare Opportunity | UFC FIGHT PASS'],
      dtype=object)

In [304]:
# distinct tags across all articles
taglist = df.tags.to_list()
unq_tags = set().union(*taglist)
unq_tags

{'Ciryl Gane',
 "Dana White's Contender Series",
 'Fight By Fight',
 'Korean Zombie',
 'Manon Fiorot',
 'Max Holloway',
 'Official Weigh-In',
 'Road To UFC',
 'Rose Namajunas',
 'Serghei Spivac',
 'Singapore',
 'UFC Paris',
 'UFC Singapore',
 'card preview',
 'results',
 'scorecards'}

In [305]:
# Persist the cleaned frame without the index column.
# NOTE(review): 'categories' and 'tags' hold Python lists, so they presumably end up
# in the CSV as their string form — confirm before parsing the file back.
# NOTE(review): the relative path depends on the working directory and on
# ../../datasets already existing — confirm for the environment this runs in.
df.to_csv('../../datasets/ufc_trends.csv', index=False)