## News article scraping from eyefootball.com

In [2]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import urllib.request as rs
import os

In [2]:
def extract_history(year, month):
    url = "https://www.eyefootball.com/archive/?monthly=" + str(year) + "-" + str(month).zfill(2)
    print(url)
    fifa = rs.urlopen(url)
    text = fifa.read()
    soup = bs(text)
    news_url = []
    for t in soup.find_all('a'):
        if t.attrs['href'].find('/news/') > -1:
            news_url.append('https://www.eyefootball.com/' + t.attrs['href'])
    print('Total news articles to be extracted from', url)
    print(len(news_url))
    count = 0
    for i in news_url:
        news_article_uri = rs.urlopen(i)
        news_url_text = news_article_uri.read()
        news_soup = bs(news_url_text)
        article_text = ''
        for j in news_soup.find_all('p'):
            try:
                if j.attrs['style'].find('font-size') > -1:
                    article_text = j.get_text().replace('\n', '')
            except:
                None
        directory = 'D:\\Academics\\UMN-MSBA\\Term 3\\MSBA 6460 - Advanced AI\\Project 1\\articles\\' + \
                         str(year) + \
                         str(month).zfill(2) + \
                         '\\'
        if not os.path.exists(directory):
            os.makedirs(directory)
        text_file = open(directory +
                         str(count) + '.txt',
                         "w")
        text_file.write(article_text)
        text_file.close()
        count += 1

In [6]:
extraction_months = [(2020, 9),
                    (2020, 10),
                    (2020, 11),
                    (2020, 12),
                    (2021, 1),
                    (2021, 2),
                    (2021, 3),
                    (2021, 4)]

for i in extraction_months:
    extract_history(i[0], i[1])

https://www.eyefootball.com/archive/?monthly=2021-02
Total news articles to be extracted from https://www.eyefootball.com/archive/?monthly=2021-02
210
https://www.eyefootball.com/archive/?monthly=2021-03
Total news articles to be extracted from https://www.eyefootball.com/archive/?monthly=2021-03
231
https://www.eyefootball.com/archive/?monthly=2021-04
Total news articles to be extracted from https://www.eyefootball.com/archive/?monthly=2021-04
226


## Player Name Extraction

In [3]:
import nltk
from tqdm import tqdm
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ravih\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ravih\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\ravih\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ravih\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
print('Reference:: https://unbiased-coder.com/extract-names-python-nltk/')
from os import listdir
from os.path import isfile, join

directory = 'D:\\Academics\\UMN-MSBA\\Term 3\\MSBA 6460 - Advanced AI\\Project 1\\'
result = [os.path.join(dp, f) 
          for dp, dn, filenames in os.walk(directory) 
          for f in filenames if os.path.splitext(f)[1] == '.txt']
finalList = []
for i in tqdm(result):
    with open(i) as f:
        lines = f.readlines()
    nltk_results = ne_chunk(pos_tag(word_tokenize(lines[0])))
    # print(len(nltk_results))
    for nltk_result in nltk_results:
        if type(nltk_result) == Tree:
            name = ''
            for nltk_result_leaf in nltk_result.leaves():
                name += nltk_result_leaf[0] + ' '
            # print ('Type: ', nltk_result.label(), 'Name: ', name)
            if nltk_result.label() == 'PERSON':
                finalList.append([i, name.strip()])

Reference:: https://unbiased-coder.com/extract-names-python-nltk/


100%|██████████████████████████████████████████████████████████████████████████████| 1874/1874 [03:20<00:00,  9.35it/s]


In [5]:
dataDF = pd.DataFrame(finalList, columns = ['news_file', 'player_name'])
dataDF.to_csv('player_names_for_news.csv')

## Filtering out Non-Players

In [24]:
playerNames = pd.read_csv('playerNames.csv')
print(playerNames.columns)
playerNames['LastName'] = playerNames['LastName'].str.strip()
playerNames['FirstName'] = playerNames['FirstName'].str.strip()
playerNames

Index(['LastName', 'FirstName'], dtype='object')


Unnamed: 0,LastName,FirstName
0,Akinola,Timothy Olaoluwa
1,Alebiosu,Ryan
2,Alves Soares,Cedric Ricardo
3,Aubameyang,Pierre-Emerick
4,Awe,Zachariah Olumide
...,...,...
1413,Kandola,Kamran
1414,Lembikisa,Dexter Joeng Woo
1415,Pinnington,Dean Stanley
1416,Roberts,Tyler


In [18]:
dataDF.size

23396

In [32]:
final_data = dataDF[dataDF['player_name'].isin(playerNames['LastName']) | 
       dataDF['player_name'].isin(playerNames['LastName']) | 
       dataDF['player_name'].isin(playerNames['LastName'] + ' ' + playerNames['LastName'])]

In [35]:
print('total players in our data set', final_data['player_name'].nunique())
print('total news articles in our data set', final_data['player_name'].nunique())

143

## Labeling the data

In [50]:
transferedPlayers = pd.read_csv('Transfered_players.csv')
transferedPlayers

Unnamed: 0,FirstName,LastName
0,Nuno,Tavares
1,Albert,Sambi
2,Mika,Biereth
3,Ben,White
4,Martin,Odegaard
...,...,...
596,Austin,Samuels
597,Luke,Matheson
598,Connor,Ronan
599,Andreas,Sondergaard


In [56]:
final_data['Transfered'] = final_data['player_name'].isin(transferedPlayers['FirstName']) | \
       final_data['player_name'].isin(transferedPlayers['LastName']) |  \
       final_data['player_name'].isin(transferedPlayers['LastName'] + ' ' + transferedPlayers['LastName'])
final_data['Transfered'] = final_data['Transfered'].apply(lambda x: 1 if x else 0)
sum(final_data['Transfered'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['Transfered'] = final_data['player_name'].isin(transferedPlayers['FirstName']) | \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['Transfered'] = final_data['Transfered'].apply(lambda x: 1 if x else 0)


408

## Grouping the data

In [71]:
player_news_set = final_data.groupby('player_name')['news_file'].apply(list).reset_index()
player_transfer_set = final_data.groupby('player_name')['Transfered'].apply(sum).reset_index()
merged_data = player_news_set.merge(player_transfer_set, how = 'inner', on = 'player_name')
merged_data['Transfered'] = merged_data['Transfered'].apply(lambda x: 0 if x == 0 else 1)

In [73]:
merged_data.to_csv('player_news_set.csv')