In [None]:
import math
import pandas as pd
import urllib
import urllib.request
import warnings
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

# Suppress every warning category for the rest of the session; this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Method to download player stats
def getPlayerStats(player_id, max_retries=3, timeout=30):
    """Download and parse the Scoresway handball page of one player.

    Parameters
    ----------
    player_id : int or str
        Player id; it is converted with ``str`` and appended to the page URL.
    max_retries : int, optional
        Number of extra download attempts made after a network failure
        before the player is given up on (default 3).
    timeout : float, optional
        Socket timeout in seconds handed to ``urlopen`` (default 30).

    Returns
    -------
    tuple
        ``(attributes, performance)`` where ``attributes`` maps the text of
        each ``<dt>`` element to the text of the ``<dd>`` paired with it, and
        ``performance`` is a dict with the keys ``"season"`` (list of str),
        ``"appearences"`` (list of int) and ``"goals"`` (list of int).
        ``(None, None)`` is returned when the page could not be downloaded,
        carries the placeholder title of a missing player, has no goal
        cells, or holds a non-numeric appearance/goal cell.
    """
    # Local import: IncompleteRead and friends are not OSError subclasses.
    from http.client import HTTPException

    url = "http://www.scoresway.com/?sport=handball&page=player&id=" + str(player_id)

    # Only the download is retried, and only a bounded number of times.
    # Retrying parse errors (as a blanket `except` would) can never succeed,
    # because the same page produces the same error again.
    text = None
    for _attempt in range(max_retries + 1):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                text = response.read()
            break
        except (OSError, HTTPException):
            # URLError, HTTPError, timeouts and connection resets all derive
            # from OSError; try again until the attempts are used up.
            continue

    if text is None:
        return None, None

    soup = BeautifulSoup(text, "html.parser")

    # Skip if the player doesn't exist (the page then has only the generic
    # title) or the document has no <title> at all.
    title = soup.title
    if title is None or title.text == " - Handball - Scoresway - Results, fixtures, tables and statistics":
        return None, None

    # Get all details from the definition list: <dt> is the label, <dd> the value.
    v = {}
    for label, value in zip(soup.find_all("dt"), soup.find_all("dd")):
        v[label.text] = value.text

    # Get all season, appearance and goal cells.
    season_cells = soup.find_all("td", {"class": "season"})
    appearance_cells = soup.find_all("td", {"class": "appearances"})
    goal_cells = soup.find_all("td", {"class": "goals_in"})

    # Skip if goals are empty.
    if len(goal_cells) == 0:
        return None, None

    # Add them to the dictionary; zip stops at the shortest of the three lists.
    m = {"season": [], "appearences": [], "goals": []}
    try:
        for season_cell, appearance_cell, goal_cell in zip(season_cells, appearance_cells, goal_cells):
            m["season"].append(season_cell.text.replace("\n", ""))
            m["appearences"].append(int(appearance_cell.text))
            m["goals"].append(int(goal_cell.text))
    except ValueError:
        # A cell that is not an integer makes the whole record unusable.
        return None, None

    return v, m

In [None]:
# method to filter attributes dictionary
def filter_attr_dict(attr_dict):
    """Normalise a player's attribute dictionary, or reject it.

    The value of every key containing ``'Height'`` is cut down to the part
    before its first space (removing the unit), and a ``'name'`` entry is
    added as ``'First name' + ' ' + 'Last name'``.

    Parameters
    ----------
    attr_dict : dict
        Attribute label -> text value. It is modified in place when accepted.

    Returns
    -------
    dict or None
        The same ``attr_dict`` object after normalisation, or ``None`` when
        it has no height entry or lacks ``'First name'`` / ``'Last name'``.
        A rejected dictionary is left without a ``'name'`` entry.
    """
    # Validate before touching anything, so an unusable record is skipped
    # instead of raising KeyError in the middle of a scrape.
    if 'First name' not in attr_dict or 'Last name' not in attr_dict:
        return None

    height_keys = [key for key in attr_dict if 'Height' in key]
    if not height_keys:
        return None

    for key in height_keys:
        # remove cm from height
        attr_dict[key] = attr_dict[key].split(' ')[0]

    attr_dict['name'] = attr_dict['First name'] + ' ' + attr_dict['Last name']
    return attr_dict

# method to get info from performance dictionary
def get_info_from_dict(perf_dict, current_season='2018/2019'):
    """Extract the season, appearance and goal lists in reversed order.

    Parameters
    ----------
    perf_dict : dict
        Must hold the lists ``'season'``, ``'appearences'`` and ``'goals'``.
        It is not modified, so the function can be called repeatedly on the
        same dictionary with the same result.
    current_season : str, optional
        If this text occurs in the first season entry, that entry is dropped
        from all three lists (default ``'2018/2019'``).

    Returns
    -------
    tuple of list
        ``(seasons, appearences, goals)`` as new lists, each reversed
        relative to the input. Empty input yields three empty lists.
    """
    # Work on copies: deleting/reversing the caller's lists in place would
    # make a second call return them in the opposite order.
    seasons = list(perf_dict['season'])
    appearences = list(perf_dict['appearences'])
    goals = list(perf_dict['goals'])

    # NOTE(review): only the first entry is checked; a second row for the
    # same current season would be kept. Confirm whether that is intended.
    if seasons and current_season in seasons[0]:
        seasons = seasons[1:]
        appearences = appearences[1:]
        goals = goals[1:]

    seasons.reverse()
    appearences.reverse()
    goals.reverse()

    return seasons, appearences, goals

# helper: trailing year number of a season label
def _season_end_year(season):
    """Return the run of digits that ends ``season`` (e.g. 2019 for '2018/2019').

    Surrounding whitespace is ignored. Raises ``ValueError`` when the label
    does not end with a digit.
    """
    text = season.strip()
    start = len(text)
    while start > 0 and text[start - 1].isdigit():
        start -= 1
    return int(text[start:])


# method to check if previous 2 seasons exist
def check_previous_seasons_exist(seasons_list, prediction_season_index):
    """Tell whether a season and the two entries before it are consecutive.

    Parameters
    ----------
    seasons_list : list of str
        Season labels, expected in ascending order and ending with the year
        number (presumably of the form 'YYYY/YYYY' -- verify against the
        scraped data).
    prediction_season_index : int
        Index of the season to check; the entries at ``index - 2`` and
        ``index - 1`` are its candidate predecessors.

    Returns
    -------
    bool
        ``True`` only if each of the three labels ends exactly one year after
        the previous one. ``False`` when ``prediction_season_index`` is
        smaller than 2, because there are then no two preceding entries.
    """
    # Guard against negative indices silently wrapping to the end of the list.
    if prediction_season_index < 2:
        return False

    year1 = _season_end_year(seasons_list[prediction_season_index - 2])
    year2 = _season_end_year(seasons_list[prediction_season_index - 1])
    year3 = _season_end_year(seasons_list[prediction_season_index])

    # Compare whole years: looking at the last digit alone rejects decade
    # boundaries (9 -> 0) and accepts seasons that lie 11 years apart.
    return year2 - year1 == 1 and year3 - year2 == 1

# method to merge duplicate season info
def check_for_duplicate_seasons(seasons_list, appearences_list, goals_list):
    """Collapse repeated seasons into one entry with summed statistics.

    Parameters
    ----------
    seasons_list : list
        Season labels; equal labels are treated as the same season.
    appearences_list : list
        Appearance counts, aligned with ``seasons_list``.
    goals_list : list
        Goal counts, aligned with ``seasons_list``.

    Returns
    -------
    tuple of list
        ``(new_seasons, new_matches, new_goals)``: each distinct season once,
        in order of first occurrence, with the appearances and goals of all
        its occurrences added together. The input lists are left unchanged.
    """
    slot_of_season = {}  # season label -> its index in the output lists
    new_seasons = []
    new_matches = []
    new_goals = []

    # One pass; totals are accumulated in the output lists rather than in the
    # caller's lists, so repeated calls give the same answer.
    for season, matches, goals in zip(seasons_list, appearences_list, goals_list):
        if season in slot_of_season:
            slot = slot_of_season[season]
            new_matches[slot] += matches
            new_goals[slot] += goals
        else:
            slot_of_season[season] = len(new_seasons)
            new_seasons.append(season)
            new_matches.append(matches)
            new_goals.append(goals)

    return new_seasons, new_matches, new_goals

In [None]:
def construct_raw_df(raw_df, player_id):
    """Append the three-season training rows of one player to ``raw_df``.

    The player is downloaded with ``getPlayerStats``, cleaned with
    ``filter_attr_dict``, ``get_info_from_dict`` and
    ``check_for_duplicate_seasons``, and one row is produced for every season
    index (from 2 upwards) that ``check_previous_seasons_exist`` accepts.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame collected so far; it is not modified.
    player_id : int or str
        Id passed on to ``getPlayerStats``.

    Returns
    -------
    pandas.DataFrame
        ``raw_df`` itself when the player contributes no rows; otherwise a
        new frame with the player's rows added at the end and the index
        renumbered.
    """
    attr_dict, perf_dict = getPlayerStats(player_id)

    if attr_dict is None or perf_dict is None:
        return raw_df

    attr_dict = filter_attr_dict(attr_dict)

    if attr_dict is None:
        return raw_df

    seasons_list, appearences_list, goals_list = get_info_from_dict(perf_dict)

    seasons_list, appearences_list, goals_list = check_for_duplicate_seasons(
        seasons_list, appearences_list, goals_list)

    # At least two earlier seasons plus the prediction season are required.
    if len(seasons_list) < 3:
        return raw_df

    # Collect the rows first and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x, and per-row appends copy the whole frame each time.
    new_rows = []
    for prediction_season_index in range(2, len(seasons_list)):
        if not check_previous_seasons_exist(seasons_list, prediction_season_index):
            continue

        new_rows.append({
            'name': attr_dict['name'],
            'position': attr_dict['Position'],
            'height': attr_dict['Height'],
            'matches_year1': int(appearences_list[prediction_season_index - 2]),
            'goals_year1': int(goals_list[prediction_season_index - 2]),
            'matches_year2': int(appearences_list[prediction_season_index - 1]),
            'goals_year2': int(goals_list[prediction_season_index - 1]),
            'matches_year3': int(appearences_list[prediction_season_index]),
            'goals_year3': int(goals_list[prediction_season_index]),
            'prediction_season': seasons_list[prediction_season_index],
        })

    if not new_rows:
        return raw_df

    return pd.concat([raw_df, pd.DataFrame(new_rows)], ignore_index=True)

In [None]:
# Start from an empty frame that already carries the output columns, then let
# construct_raw_df add the rows of player ids 0..9 one id at a time.
raw_df = pd.DataFrame(columns=[
    'name', 'position', 'height',
    'matches_year1', 'goals_year1',
    'matches_year2', 'goals_year2',
    'matches_year3', 'goals_year3',
    'prediction_season',
])

for index in tqdm(range(10)):
    raw_df = construct_raw_df(raw_df, index)

# Last expression of the cell: display the collected frame.
raw_df

In [None]:
# Write the collected rows as comma-separated text without the index column,
# then show how many rows were written.
raw_df.to_csv(
    '../inputs/raw_data_0-10.csv',
    index=False,
    sep=',',
)
raw_df.shape[0]