In [None]:
# Parse country and other profile data that is not included in the Meta Kaggle dataset
import requests
from bs4 import BeautifulSoup
import time
import re
import json

In [None]:
import pandas as pd
from pathlib import Path

# Directory holding the Meta Kaggle CSV files; every table below is read from here.
# The path follows the Kaggle-style /kaggle/input layout -- adjust it when running elsewhere.
INPUT_FOLDER = Path('/kaggle/input/meta-kaggle/')

In [None]:
REQUEST_DELAY = 0.5  # seconds passed to time.sleep before each profile request

In [None]:
# Load the four Meta Kaggle tables used in this notebook. The generator reads
# the files one after another, in the same order as the names on the left.
competitions, teams, users, team_memberships = (
    pd.read_csv(INPUT_FOLDER / f'{table_name}.csv')
    for table_name in ('Competitions', 'Teams', 'Users', 'TeamMemberships')
)

It looks like the dataset does not contain information about a user's country, so scraping is needed. Let's follow the approach used in the following notebook: https://www.kaggle.com/sahidvelji/meet-the-kaggle-team

## What competition are you looking at?

Set the competition to analyse. Use the competition's value from the `Slug` column.

In [None]:
# Show a few "Featured" competitions so a Slug value can be picked from the table.
display(competitions.loc[competitions['HostSegmentTitle'].eq('Featured')].head(3))

In [None]:
# Must equal a value from the Slug column of the competitions table;
# it is used to look up the competition's Id.
COMPETITION_SLUG = '3d-object-detection-for-autonomous-vehicles'

In [None]:
# Resolve the chosen slug to its competition Id. Fail with an explicit message
# when the slug is mistyped or missing, instead of an opaque IndexError from
# indexing an empty selection.
matching_competition_ids = competitions.loc[competitions['Slug'] == COMPETITION_SLUG, 'Id']
if matching_competition_ids.empty:
    raise ValueError(
        f'No competition with Slug {COMPETITION_SLUG!r} was found in the competitions table'
    )
# If several rows share the slug, the first one wins (same as the previous behaviour).
competition_id = matching_competition_ids.iloc[0]
competition_id

## Get all users who participated in the selected competition

In [None]:
# Keep only the selected competition's teams that have a public leaderboard rank.
competition_team = teams.loc[
    teams['CompetitionId'].eq(competition_id)
    & teams['PublicLeaderboardRank'].notna()
]

Number of teams:

In [None]:
# Row count of the filtered teams frame (teams of this competition with a public rank).
len(competition_team)

In [None]:
# Preview the first rows to sanity-check the competition/rank filter.
competition_team.head()

In [None]:
# Expand each ranked team into its members and attach their usernames.
# Both joins are left joins, so a team without a membership row (or a member
# without a matching user row) is kept, with a missing UserName.
competition_participants = (
    competition_team
    .merge(team_memberships, how='left', left_on='Id', right_on='TeamId')
    .merge(users, how='left', left_on='UserId', right_on='Id')
    .loc[:, ['TeamName', 'PublicLeaderboardRank', 'PrivateLeaderboardRank', 'Medal', 'UserName']]
    .sort_values('PublicLeaderboardRank')
)

In [None]:
# Preview the ten best-ranked rows (the frame is sorted by PublicLeaderboardRank).
competition_participants.head(10)

## Parse country from the users' profiles

In [None]:
# Prefix for profile pages: each profile URL is built as KAGGLE_BASE_URL + username.
KAGGLE_BASE_URL = "https://kaggle.com/"

In [None]:
# The left merges that build competition_participants can leave UserName missing;
# drop those rows so that only real usernames are turned into profile URLs.
usernames = competition_participants['UserName'].dropna()

In [None]:
# Profile attributes to keep from each scraped user record.
subset_fields = {'city', 'region', 'country', 'occupation', 'organization'}
MAX_PROFILES = 50      # only the first N usernames are scraped, to keep the run short
REQUEST_TIMEOUT = 10   # seconds before an unresponsive request is abandoned

# Finds the opening brace of the JSON object passed to Kaggle.State.push(...).
# A raw string with escaped dots avoids the invalid "\(" escape and matches
# the literal call only.
STATE_PUSH_PATTERN = re.compile(r'Kaggle\.State\.push\(\{')


def extract_profile_state(page_html):
    """Return the JSON object embedded in a profile page as a dict.

    The page is expected to contain a <script> inside <div id="site-body">
    that calls ``Kaggle.State.push({...})``.
    NOTE(review): this layout is assumed from the original scraper -- confirm
    that profile pages are still served this way.

    Raises:
        ValueError: if the div, the script, or the push call is missing, or
            if the embedded JSON is malformed (json.JSONDecodeError is a
            ValueError subclass).
    """
    site_body = BeautifulSoup(page_html, 'html.parser').find('div', id='site-body')
    script_tag = site_body.find('script') if site_body is not None else None
    if script_tag is None:
        raise ValueError('no <script> found inside <div id="site-body">')

    script_text = str(script_tag)
    match = STATE_PUSH_PATTERN.search(script_text)
    if match is None:
        raise ValueError('no Kaggle.State.push({...}) call found in the page script')

    # raw_decode parses exactly one JSON value starting at the opening brace and
    # ignores whatever follows it, so trailing JavaScript (or a later "}") can no
    # longer corrupt the payload the way a greedy "{.*}" capture could.
    state, _end = json.JSONDecoder().raw_decode(script_text, match.end() - 1)
    return state


users_data = []
failed_usernames = []
# A single session reuses the underlying connection across profile requests.
with requests.Session() as session:
    for username in usernames.head(MAX_PROFILES):
        time.sleep(REQUEST_DELAY)  # throttle: pause before every request
        profile_url = f'{KAGGLE_BASE_URL}{username}'

        try:
            response = session.get(profile_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            user_dict = extract_profile_state(response.text)
        except (requests.RequestException, ValueError) as error:
            # One unreachable or unparsable profile must not abort the whole
            # scrape; report it and carry on with the next user.
            failed_usernames.append(username)
            print(f'Skipping {username}: {error}')
            continue

        user_subset = {k: v for k, v in user_dict.items() if k in subset_fields}
        user_subset['username'] = username
        users_data.append(user_subset)

if failed_usernames:
    print(f'{len(failed_usernames)} profile(s) could not be scraped.')

In [None]:
# One row per scraped profile; a field that a profile did not provide shows up as a missing value.
users_data_df = pd.DataFrame(users_data)
users_data_df.head(10)

In [None]:
# Tally scraped profiles per country; dropna=False keeps a bucket for
# profiles that have no country value.
country_counts = users_data_df['country'].value_counts(dropna=False)
country_counts

In [None]:
# Bar chart of the country tally, with a title and axis labels so the figure
# can be understood on its own when the notebook is skimmed.
ax = country_counts.plot.bar()
ax.set(
    title=f'Participant countries: {COMPETITION_SLUG}',
    xlabel='Country',
    ylabel='Number of scraped profiles',
);