In [34]:
import networkx as nx
from bs4 import BeautifulSoup
import re
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import requests
import os
import numpy as np

In [4]:
# `display` and `HTML` belong to the public IPython.display API; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook container to 95% of the browser window.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [5]:
# Two-digit years covered by the dataset, newest first ("18" down to "13").
dataYears = [str(twoDigitYear) for twoDigitYear in range(18, 12, -1)]
# Lower-case three-letter month abbreviations, January through December.
dataMonths = "jan feb mar apr may jun jul aug sep oct nov dec".split()
# Zero-padded month numbers "01".."12", in the same order as dataMonths.
dataMonths2Dig = ["{:02d}".format(monthNumber) for monthNumber in range(1, 13)]
# Root folder of the data; the code below reads and writes one sub-folder per month in it.
dataPath = Path("data")

In [None]:
# Convert the raw txt rating lists to csv, dropping the columns that are not needed. After this cell runs, each month's csv sits next to its txt source in the matching folder (dec13, jan14, etc.).

def _parseRatingLine(line, oldLayout, ratingLimit):
    """Extract one player from a fixed-width rating-list line.

    Args:
        line: one raw line of a ``standard_<month><yy>frl.txt`` file.
        oldLayout: True for the files of August 2016 and earlier, whose rating
            and activity columns sit at different offsets.
        ratingLimit: players rated at or below this value are dropped.

    Returns:
        ``(id, name, nat, rating_str)`` as strings, or ``None`` when the line
        is the file header, has an empty rating field, has an ``i`` in the
        activity column (presumably the inactive flag), or is not rated above
        ``ratingLimit``.

    Raises:
        ValueError: if the rating field is non-empty but not an integer.
    """
    playerId = line[:15].strip()
    if playerId == 'ID Number':  # file header row
        return None

    # The comma is the csv delimiter, so every comma in the name becomes a semicolon.
    name = line[15:76].strip().replace(',', ';')
    nat = line[76:80].strip()

    if oldLayout:
        rating_str = line[109:115].strip()
        activity = line[132:].strip()
    else:
        rating_str = line[113:118].strip()
        activity = line[129:].strip()

    if not rating_str:
        return None
    rating = int(rating_str)
    if 'i' in activity or rating <= ratingLimit:
        return None
    return (playerId, name, nat, rating_str)


def dataPreprocessing():
    """Convert every monthly rating list from fixed-width txt to a filtered csv.

    For each ``<month><yy>`` combination of ``dataMonths`` and ``dataYears`` this
    reads ``dataPath/<month><yy>/standard_<month><yy>frl.txt`` and writes
    ``dataPath/<month><yy>/<month><yy>.csv`` with the rows
    ``id,name,nationality,rating``, sorted by numeric id. Only players rated
    above 2000 without an ``i`` in the activity column are kept.

    The csv is opened only after the whole txt file has been parsed, so a
    parsing error cannot leave a truncated csv behind.
    """
    # Walk backwards in time: the file layout differs for August 2016 and
    # earlier, so the flag flips once and stays set for all older months.
    dataMonthsRev = list(reversed(dataMonths))
    oldLayout = False
    ratingLimit = 2000

    for year in dataYears:
        for month in dataMonthsRev:
            folderName = month + year
            if folderName == "aug16":
                oldLayout = True

            fileName = "standard_" + folderName + "frl.txt"
            with open(dataPath/folderName/fileName, 'r') as myfile:
                lines = myfile.readlines()

            people = []
            for line in tqdm(lines):
                entry = _parseRatingLine(line, oldLayout, ratingLimit)
                if entry is not None:
                    people.append(entry)

            people.sort(key=lambda tup: int(tup[0]))  # sort all people by their FIDE identification

            with open(dataPath/folderName/(folderName + ".csv"), 'w') as writeFile:
                for entry in people:
                    writeFile.write(",".join(entry) + '\n')

# Run the txt -> csv conversion for every month/year combination.
# Note: the ratings in each source txt file are end-of-month values, so a month's
# csv should be used when building the NEXT month's graph.
dataPreprocessing()

In [None]:
#legacy code for extracting using regex, obsolete

# fileName = "players_list_foa.txt"
# # fileName = "shorterlist.txt"
# dataPath = Path("data")

# df = pd.DataFrame()
# with open(dataPath/fileName, 'r') as myfile:
#     lines = myfile.readlines()
#     for line in lines:
#         rez = re.search("^(?P<id>\d+)\s*(?P<label>[ -~]*?)\s*?(?P<country>[A-Z]{3})\s([M|F])[^\d]{32}(?P<rating>\d\d\d\d)?(.*?)(\d\d\d\d)(?P<activity>[^\d]*)$", line)
#         #print("Rezultat:", rez)
#         if rez:
# #             print (rez.group('activity').strip())
#             if rez.group('rating') and int(rez.group('rating').strip()) > 1000 and rez.group('activity').find('i') == -1:
#                 df = df.append({"ID" : rez.group('id'), "label" : rez.group('label'), "country" : rez.group('country'), "rating" : rez.group('rating'), "activity" : rez.group('activity')}, ignore_index=True)
# #                 print("ID:",rez.group('id'))
# #             print("label:", rez.group('label'))
# #             print("country:", rez.group('country'))
# #             print("rating:", rez.group('rating'))
# #             print("activity:", rez.group('activity'))
            

# df.to_json('output.json')

In [None]:
# df = pd.read_json('output.json')
# df.iloc[1]

In [None]:
# df2 = pd.DataFrame()
# with open(dataPath/Path('jun17')/Path('standard_jun17frl.txt')) as myFile:
#     lines = myFile.readlines()
#     lines = lines[1:]
#     for line in lines:
#         id = int(line[0:line.find(' ')])
#         if df['label'].where(df['ID'] == id)[0] != 'nan':
#             rez = re.search("^(?P<id>\d+)\s*(?P<label>[ -~]*?)\s*?(?P<country>[A-Z]{3})\s([M|F])[^\d]{32}(?P<rating>\d\d\d\d)?(.*?)(\d\d\d\d)(?P<activity>[^\d]*)$", line)
#             if rez:
#                 df2 = df2.append({"ID" : rez.group('id'), "label" : rez.group('label'), "country" : rez.group('country'), "rating" : rez.group('rating'), "activity" : rez.group('activity')}, ignore_index=True)

In [6]:
# April 2018 rating list; the csv has no header row, so columns are addressed by position.
df = pd.read_csv(dataPath / 'apr18' / 'apr18.csv', header=None)

In [7]:
# Graph for May 2018: an empty undirected graph, filled in the next cell from
# the April 2018 rating list held in `df`.
G = nx.Graph()

In [8]:
# One node per listed player, keyed by the FIDE id (column 0) rendered as a string.
# Iterating the column directly avoids building a row Series per player via .iloc.
G.add_nodes_from(str(playerId) for playerId in df[0])

In [9]:
# Export the graph built above to GML (the file name suggests a smoke test of the export).
nx.write_gml(G, "test.gml")

In [10]:

#fetching matches of a particular player for the specified month

#id = FIDE id, date = string in yyyy-mm-01 format, returns adjacency list in form ("surname, name", "FIDE rating")
def fetchMatches(id, date, timeout=30):
    """Scrape the opponents listed on a player's rating-calculation page.

    Args:
        id: FIDE id of the player (anything ``str()`` can render).
        date: rating period as a ``yyyy-mm-01`` string.
        timeout: seconds to wait for the server. Without a timeout a stalled
            connection would block a long scraping run indefinitely.

    Returns:
        A list of ``(name, rating)`` string tuples in page order: one tuple per
        ``td.list4`` cell that has at least 11 child nodes. The name is the
        stripped first child and the rating is the stripped first content of
        the fourth child.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    targetLink = "https://ratings.fide.com/individual_calculations.phtml?idnumber="+str(id)+"&rating_period="+date
    page = requests.get(targetLink, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    adjacencyList = []

    for tag in soup.find_all("td", {"class": "list4"}):
        # Drop the image so it does not shift the child positions used below.
        if tag.img:
            tag.img.decompose()
        for position, child in enumerate(tag.children, start=1):
            if position == 1:
                name = child.strip()
            elif position == 4:
                rating = child.contents[0].strip()
            elif position == 11:
                # Reaching the 11th child marks the cell as a complete entry.
                adjacencyList.append((name, rating))
    return adjacencyList
    
#print(fetchMatches(2020009, "2018-04-01"))


In [11]:
#example: when fetching for month = June 2018, use the rating shown on the website for June 2018 and the dataFrame for May 2018; rating is a string, not an int
def getIdByNameAndRating(surnameNameString, rating, dataFrame):
    """Resolve a player's FIDE id from the name and rating shown on the website.

    Args:
        surnameNameString: name in ``"Surname, Name"`` form; commas are turned
            into semicolons to match how names are stored in column 1.
        rating: rating as a string (or int); compared to column 3 as an int.
        dataFrame: headerless rating table with id, name, nationality and
            rating in columns 0-3.

    Returns:
        The id from column 0 when exactly one row matches both name and
        rating, otherwise ``-1`` (no match, or an ambiguous one).

    Raises:
        ValueError: if a row with that name exists but ``rating`` is not an integer.
    """
    wantedName = surnameNameString.replace(',', ';')
    sameName = dataFrame.loc[dataFrame[1] == wantedName]
    if len(sameName) == 0:
        return -1
    # Build the rating mask from the already-filtered frame so mask and frame
    # share one index; a mask taken from the full frame only works through
    # implicit index alignment and breaks when the index is not unique.
    sameNameSameRating = sameName.loc[sameName[3] == int(rating)]
    if len(sameNameSameRating) == 1:
        return sameNameSameRating[0].iloc[0]
    return -1


def getNameById(id, dataFrame):
    """Look up a player's display name by FIDE id.

    Args:
        id: value compared against column 0 of ``dataFrame``.
        dataFrame: headerless rating table (id in column 0, name in column 1).

    Returns:
        The stored name with semicolons turned back into commas when exactly
        one row has that id, otherwise ``None``.
    """
    rowsForId = dataFrame[dataFrame[0] == id]
    if len(rowsForId) != 1:
        return None
    storedName = rowsForId.iloc[0][1]
    return storedName.replace(';', ',')

#print(getIdByNameAndRating('So, Wesley', 2786, df))
#print(getNameById(5202213, df))

In [12]:
#date: yyyy-mm-01
def getPrevMonth(date):
    """Return the data-folder name of the month preceding ``date``.

    Args:
        date: a ``yyyy-mm-01`` string (only year and month are used).

    Returns:
        Three-letter month abbreviation plus two-digit year, e.g.
        ``"dec13"`` for ``"2014-01-01"``. The year is always zero-padded to
        two digits (``"feb09"``, not ``"feb9"``).

    Raises:
        ValueError: if the month is not in 1..12 or the parts are not integers.
    """
    monthNames = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    year, month, _ = date.split("-")
    yearNumber, monthNumber = int(year), int(month)
    if not 1 <= monthNumber <= 12:
        raise ValueError("month out of range in date: " + date)
    # Compare numerically so "1" and "01" both count as January.
    if monthNumber == 1:
        yearNumber -= 1
    # monthNumber - 2 is the 0-based index of the previous month; for January
    # it is -1, which wraps around to "dec".
    return monthNames[monthNumber - 2] + "{:02d}".format(yearNumber % 100)
    
#print(getPrevMonth("2014-01-01"))
#print(getPrevMonth("2018-02-01"))
        

In [13]:
def createOrAddWeightEdge(G, u, v, w=1):
    """Add ``w`` to the weight of edge u-v, creating the edge if it is missing.

    Both endpoints are converted with ``str()`` before touching the graph.

    Args:
        G: graph offering ``get_edge_data`` and ``add_edge``.
        u, v: edge endpoints.
        w: weight for a new edge, or increment for an existing one.
    """
    source, target = str(u), str(v)
    edgeAttributes = G.get_edge_data(source, target)
    if edgeAttributes is not None:
        edgeAttributes['weight'] += w
        return
    G.add_edge(source, target, weight=w)
        
def addNamesToNodes(G, date):
    """Set a ``name`` attribute on every node of ``G``.

    Names come from the previous month's csv (relative to ``date``, a
    ``yyyy-mm-01`` string). Each node key is converted with ``int()`` and
    looked up as a FIDE id; ids that are missing or duplicated in that csv
    get ``None``.
    """
    prevMonth = getPrevMonth(date)
    ratingsTable = pd.read_csv(dataPath / prevMonth / (prevMonth + '.csv'), header=None)

    for nodeId, attributes in G.nodes(data=True):
        attributes['name'] = getNameById(int(nodeId), ratingsTable)
        

#id: FIDE id, date: yyyy-mm-01
def buildEgoNetwork(egoId, date):
    """Build the ego network of one player for one rating period.

    Args:
        egoId: FIDE id of the central player.
        date: rating period as a ``yyyy-mm-01`` string. Opponents are resolved
            against the csv of the previous month.

    Returns:
        An ``nx.Graph`` whose node keys are id strings. Ego-opponent edges
        carry one unit of ``weight`` per listed game. Opponent-opponent edges
        are added for games between two of the ego's opponents, counted from
        the lower-id side only so that a game is not counted twice.

    Opponents that cannot be resolved to exactly one id are left out, matching
    how buildNetworkForMonth skips unresolved opponents. Each distinct
    opponent's page is fetched once, even when the ego played them repeatedly.
    """
    prevMonth = getPrevMonth(date)
    dataFrame = pd.read_csv(dataPath / prevMonth / (prevMonth + '.csv'), header=None)
    adjacencyList = fetchMatches(egoId, date)
    G = nx.Graph()
    # put ego
    G.add_node(str(egoId))

    # Resolve every distinct (name, rating) pair once; dict order keeps page order.
    neighbourIds = {}
    for opponent in adjacencyList:
        if opponent not in neighbourIds:
            neighbourIds[opponent] = getIdByNameAndRating(opponent[0], opponent[1], dataFrame)

    # build main links: one weight unit per game listed on the ego's page
    for opponent in adjacencyList:
        nId = neighbourIds[opponent]
        if nId == -1:  # unresolved opponent: no node, no edge
            continue
        createOrAddWeightEdge(G, egoId, nId)

    # add side links: fetch each neighbour's neighbours exactly once
    for nId in neighbourIds.values():
        if nId == -1:
            continue
        for nnOpponent in fetchMatches(int(nId), date):
            # Only opponents that are also neighbours of the ego qualify; the
            # id comparison keeps each pair from being counted from both sides.
            nnId = neighbourIds.get(nnOpponent, -1)
            if nnId > nId:
                createOrAddWeightEdge(G, nId, nnId)

    return G

#test 1: Wesley So, September 2018
# testGraph = buildEgoNetwork(5202213, "2018-09-01")
# addNamesToNodes(testGraph, "2018-09-01")
# nx.write_gml(testGraph, "testWesleySoEgo.gml")

#test 2: Fabiano Caruana, April 2018
# testGraph = buildEgoNetwork(2020009, "2018-04-01")
# nx.write_gml(testGraph, "testFabianoCaruana.gml")

# Active test: build the ego network of player 13401319 for December 2016,
# attach player names to the nodes, and export the result to GML.
testGraph = buildEgoNetwork(13401319, "2016-12-01")
addNamesToNodes(testGraph, "2016-12-01")
nx.write_gml(testGraph, "testNekiRandom.gml")

In [14]:

#class ProcessingOutput:
#    def __init__(self, dateString):
#        self.dateString = dateString
#        dataFrame = pd.read_csv(Path(dataPath/ getPrevMonth(date)/ (getPrevMonth(date)+'.csv')), header=None)
#        self.G1 = nx.Graph()
#        self.G2 = nx.Graph()
    
    

#build the whole network graph using the csv file for the previous month and websites/links for the specified month
# date: yyyy-mm-01
def buildNetworkForMonth(date, cutoff):
    """Build the game network of all players rated above ``cutoff`` for one period.

    Args:
        date: rating period as a ``yyyy-mm-01`` string. Players and ratings
            come from the csv of the previous month; games come from the
            website pages for ``date``.
        cutoff: only players with a rating strictly above this become nodes.

    Returns:
        An ``nx.Graph`` with one node per qualifying player (key: id string;
        attributes ``name``, ``ratingOld``, ``nationality``) and weighted
        edges between players who met, one weight unit per listed game. Each
        game is counted from the lower-id player's page only.

    Opponents are matched by ``(rating, name)``. A pair shared by more than
    one player is treated as unresolvable and its games are skipped, like
    getIdByNameAndRating does, instead of silently crediting the last such
    player read from the csv.
    """
    prevMonth = getPrevMonth(date)
    dataFrame = pd.read_csv(dataPath / prevMonth / (prevMonth + '.csv'), header=None)

    G = nx.Graph()

    nameRatingMap = {}
    ambiguousKeys = set()

    for _, data in dataFrame.iterrows():
        if data[3] > cutoff:
            G.add_node(str(data[0]), name=data[1], ratingOld=data[3], nationality=data[2])
            key = (data[3], data[1])
            if key in nameRatingMap:
                ambiguousKeys.add(key)
            nameRatingMap[key] = data[0]

    # Forget every key that more than one player claimed.
    for key in ambiguousKeys:
        del nameRatingMap[key]

    for nodeId in tqdm(G.nodes()):
        nodeId = int(nodeId)
        adjList = fetchMatches(nodeId, date)
        # Open problem noted by the author: the asterisk shown for rating
        # differences of 400+ is not handled here.

        for nName, nRating in adjList:
            try:
                nId = nameRatingMap[(int(nRating), nName.replace(',', ';'))]
            except KeyError:
                # Opponent at or below the cutoff, unknown, or ambiguous.
                continue
            if nId > nodeId:
                createOrAddWeightEdge(G, nodeId, nId)
    # TODO: add ratings as a label for all nodes

    return G



In [15]:
def buildNonIsolateNetwork(G):
    """Return a copy of ``G`` without its isolated (degree-zero) nodes.

    ``G`` itself is left unchanged; the copy keeps all remaining nodes and all
    edges between them.
    """
    # nx.isolates yields lazily; materialize it so the list can be inspected
    # or reused without silently exhausting the iterator.
    isolateList = list(nx.isolates(G))
    GNonZeroDeg = G.copy()
    GNonZeroDeg.remove_nodes_from(isolateList)
    return GNonZeroDeg


In [16]:

def outputGraphFilesForMonth(date, cutoff):
    """Build one month's network and write it to GML files.

    Args:
        date: rating period as a ``yyyy-mm-01`` string.
        cutoff: rating cutoff passed on to buildNetworkForMonth.

    Writes ``all.gml`` (every node) and ``nonIsolate.gml`` (isolated nodes
    removed) into the folder ``otp_<date>_over_<cutoff>``, which is created
    when missing.
    """
    G = buildNetworkForMonth(date, cutoff)
    GNonZeroDeg = buildNonIsolateNetwork(G)
    folderName = "otp_"+date+"_over_"+str(cutoff)
    # exist_ok replaces the separate exists() check and its check-then-create gap.
    os.makedirs(folderName, exist_ok=True)
    # store the created graphs in the destination folder as GML files for Gephi
    nx.write_gml(G, os.path.join(folderName, "all.gml"))
    nx.write_gml(GNonZeroDeg, os.path.join(folderName,"nonIsolate.gml"))
    

In [None]:
#outputGraphFilesForMonth("2018-04-01", 2400)

# Export the monthly graphs (rating cutoff 2400) for 2018 back to 2013,
# January to December within each year.
# NOTE(review): "2013-01-01" reads data/dec12/dec12.csv, which dataPreprocessing()
# does not produce for the configured dataYears - confirm that file exists.
for year in ("2018", "2017", "2016", "2015", "2014", "2013"):
    for i in dataMonths2Dig:
        outputGraphFilesForMonth(year+"-"+i+"-01", 2400)


HBox(children=(IntProgress(value=0, max=2214), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2221), HTML(value='')))

In [17]:
def rank(name, rating, dataFrame=None):
    """Return the FIDE id used to order a player, or -1 when it cannot be resolved.

    Args:
        name: player name in ``"Surname, Name"`` form.
        rating: rating as shown on the website (string or int).
        dataFrame: rating table to search. When omitted, the notebook-level
            table ``df`` is used; the original body read a global named
            ``dataFrame`` that no cell of this notebook defines, so it raised
            NameError on a fresh kernel.
    """
    if dataFrame is None:
        dataFrame = df
    return getIdByNameAndRating(name, rating, dataFrame)


#def :
#    for n in neigbours(node):
#        if rank(n)>rank(node):
#            G.add_edge(str(nodeId), str(nId))

In [54]:
#args: String, int
def _annotateYearlyRatings(aggregate, monthlyGraphs):
    """Write per-node yearly rating statistics onto ``aggregate``.

    For every node the ``ratingOld`` values of all ``monthlyGraphs`` that
    contain it are collected, then these attributes are set:
    ``ratingOld`` (mean), ``cnt`` (number of monthly graphs containing the
    node), ``abs`` (largest absolute deviation from the mean) and
    ``ratingLabel`` (``"<mean>+-<abs>"`` with one decimal each).
    """
    for node, attributes in aggregate.nodes(data=True):
        ratingsList = [graph.nodes[node]['ratingOld'] for graph in monthlyGraphs if node in graph]
        rating = np.mean(ratingsList)
        absolute = np.max(np.absolute(np.asarray(ratingsList) - rating))
        attributes['ratingOld'] = rating
        # A plain count; np.shape() would store the tuple (n,) instead.
        attributes['cnt'] = len(ratingsList)
        attributes['abs'] = absolute
        attributes['ratingLabel'] = "{:.1f}".format(rating)+"+-"+"{:.1f}".format(absolute)


#args: String, int
def makeYearGraph(year, cutoff):
    """Aggregate the twelve monthly networks of ``year`` and export them to GML.

    Args:
        year: four-digit year as a string, e.g. ``"2018"``.
        cutoff: rating cutoff passed on to buildNetworkForMonth.

    Writes ``all.gml`` (composition of the twelve monthly graphs) and
    ``nonIsolate.gml`` (composition of the monthly graphs with isolated nodes
    removed) into ``year_<year>_over_<cutoff>``. In both files the node
    statistics are computed over the full monthly graphs.
    """
    # TODO: ratingOld is still of limited meaning; consider averaging monthly
    # ratings differently (do they change if a player is inactive during some months?)
    # NOTE(review): per the networkx docs, compose/compose_all let attributes of
    # later graphs take precedence, so an edge 'weight' here is presumably the
    # value from the last month containing that edge, not a yearly sum - confirm
    # this is intended.
    folderName = "year_"+year+"_over_"+str(cutoff)
    os.makedirs(folderName, exist_ok=True)

    graphs = []
    graphsNZD = []
    for monthCode in dataMonths2Dig:
        dateString = year+"-"+monthCode+"-01"
        graphs.append(buildNetworkForMonth(dateString, cutoff))
        graphsNZD.append(buildNonIsolateNetwork(graphs[-1]))

    aggregate = nx.compose_all(graphs)
    _annotateYearlyRatings(aggregate, graphs)
    nx.write_gml(aggregate, os.path.join(folderName, "all.gml"))

    aggregate = nx.compose_all(graphsNZD)
    _annotateYearlyRatings(aggregate, graphs)
    nx.write_gml(aggregate, os.path.join(folderName,"nonIsolate.gml"))
        
    

In [55]:
# Build and export the aggregated 2018 graph for players rated above 2750.
makeYearGraph("2018", 2750)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




HBox(children=(IntProgress(value=0, max=16), HTML(value='')))




HBox(children=(IntProgress(value=0, max=16), HTML(value='')))




HBox(children=(IntProgress(value=0, max=16), HTML(value='')))




HBox(children=(IntProgress(value=0, max=16), HTML(value='')))




HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [42]:
# Scratch check: "\xB1" prints as the plus-minus sign.
print("\xB1")

±
