In [37]:
import pandas as pd
import numpy as np
# Load the results CSV into the frame that the following cells read their columns from.
df = pd.read_csv(filepath_or_buffer='data/all_results.csv')

In [38]:
# Raw team names stats: rows that carry a team value vs. distinct spellings among them
team_names = df.loc[:, 'team'].dropna()
unique_team_names = team_names.unique()
dict(total_items=len(team_names), unique_items=len(unique_team_names))

{'total_items': 7090, 'unique_items': 3367}

In [39]:
# Same counts as for the raw names, taken from the 'teamNormalized' column:
# non-null rows vs. distinct values among them
normalized_team_names = df.loc[:, 'teamNormalized'].dropna()
unique_normalized_team_names = normalized_team_names.unique()
dict(
    total_items=len(normalized_team_names),
    unique_items=len(unique_normalized_team_names),
)

{'total_items': 6846, 'unique_items': 3042}

In [41]:
# Frequency of each raw team spelling, most common first (25 largest groups only)
(
    team_names
    .groupby(team_names.values)
    .count()
    .sort_values(ascending=False)
    .head(25)
)

I love running          96
нет                     74
Парсек                  72
IRC                     69
I LOVE RUNNING          66
I Love Running          65
42TRIP                  62
лично                   53
Trilife                 50
БИМ                     49
21runners               48
Nike+                   45
RRUNS                   42
КЛБ                     37
University Lyon 1       37
0                       34
Сенеж                   32
Трилайф                 32
Urbani tekaci           32
Тихоходы                31
-                       31
adidas Boost team       30
Iloverunning            29
Moskva River Runners    28
Факел                   27
dtype: int64

In [42]:
# Team filters debugging: show every raw spelling whose lowercased form contains a fragment,
# to see which variants a new filter would have to cover
filtered = team_names.loc[lambda names: names.str.lower().str.contains('московский')]

print(filtered.shape[0])
filtered.unique()

23


array(['Московский Институт Физической Культуры и Спорта',
       'Московский Беговой Клуб', 'Московский беговой клуб',
       'Московский Беговой клуб', 'московский беговой клуб'], dtype=object)

In [43]:
# Team filters
# Each entry pairs a canonical team label with a boolean mask over the raw names.
# Masks are evaluated on a lowercased copy with spaces removed, so spelling variants
# such as "I love running" / "I LOVE RUNNING" collapse to the same key.
team_name_filter = team_names.str.replace(' ', '').str.lower()


def _has(fragment):
    """Return a boolean mask of names containing `fragment` as a literal substring.

    Matching is done with regex=False so characters such as '.' and '+' are taken
    literally instead of being interpreted as regular-expression syntax.
    """
    return team_name_filter.str.contains(fragment, regex=False)


def _starts(prefix):
    """Return a boolean mask of names that begin with `prefix`."""
    return team_name_filter.str.startswith(prefix)


# Order matters: the filters are applied top to bottom, so when a name matches
# several entries the last matching entry decides its final label.
team_filters_list = [
    ('I love running', _starts('ilover') | _starts('ilr')),
    ('Adidas', _has('adidas') | _has('адидас')),
    ('Трилайф', _has('trilife') | _has('трилайф')),
    # 'самгу' contains 'мгу' but is excluded explicitly
    ('МГУ', _has('мгу') & (team_name_filter != 'самгу')),
    ('World class', _has('world') & _has('class')),
    ('Orange Polska', _has('orange') & _has('polska')),
    ('Gorky park runners', _has('gorky') & _has('park')),
    ('Run studio', _has('runstudio')),
    ('Running expert', _has('expert') & _has('run')),
    ('Гепард', _has('gepard') | _has('гепард')),
    ('Moskvariverrunners', _has('river') & _has('run')),
    ('21runners', _has('21') & _has('runners')),
    ('Парсек', _has('parsek') | _has('парсек')),
    ('Girl&Sole', _has('girl') & _has('sole')),
    ('42Trip', _has('42trip')),
    # literal match: the '.' must be a dot, not "any character"
    ('42km.ru', _has('42км.ru') | _has('42km.ru')),
    # literal '+' (previously spelled as the regex '\+' in a non-raw string)
    ('Nike+', _has('nike') & (_has('+') | _has('plus'))),
    ('Nike+', _has('найк')),
    ('Лыжный клуб Измайлово', _has('измайлово') & (_has('лыжный') | _has('лк'))),
    ('Московский беговой клуб', _has('московскийбеговойклуб')),
    ('EY', _starts('ey')),
    ('IRC', _starts('irc')),
    ('БИМ', _starts('бим')),
    ('Факел', _starts('факел')),
    ('Энергия', _starts('энергия')),
    ('Сенеж', _starts('сенеж')),

    # Placeholder values: these are mapped to NaN instead of a team label
    (np.nan, (team_name_filter == '') | (team_name_filter == 'лично') | (team_name_filter == 'нет')),
    (np.nan, (team_name_filter == '-') | (team_name_filter == '0')),
]

# names_copy: raw names with every matched row replaced by its canonical label (or NaN).
# non_filtered_names: raw names with every matched row blanked to NaN, i.e. only the
# names that no filter caught remain.
non_filtered_names = team_names.copy()
names_copy = team_names.copy()
for team_name, team_filter in team_filters_list:
    names_copy.loc[team_filter] = team_name
    non_filtered_names.loc[team_filter] = np.nan

# NOTE(review): this rebinds `df`, replacing the results frame loaded from the CSV with a
# one-column frame of labels. Cells that read df['team'] / df['teamNormalized'] cannot be
# re-run after this cell without reloading the CSV. The name is kept here in case cells
# outside this view depend on it - consider a dedicated name.
df = pd.DataFrame({'name': names_copy})

# Ten largest teams after normalization. rename('count') names the resulting Series;
# a mapping argument would relabel index entries (team names) instead.
df.groupby(['name']).size().rename('count').sort_values(ascending=False).head(10)

name
I love running    383
Adidas            202
Трилайф           148
Nike+             116
Парсек             93
21runners          80
БИМ                78
IRC                72
World class        71
42Trip             69
dtype: int64

In [47]:
# Names that no filter matched, ranked by how often they occur.
# The frequent ones are candidates for new entries in the filter list.
non_filtered_name_counts = (
    non_filtered_names
    .groupby(non_filtered_names.values)
    .count()
    .sort_values(ascending=False)
)
non_filtered_name_counts.loc[non_filtered_name_counts.ge(10)]

RRUNS                42
University Lyon 1    37
КЛБ                  37
Urbani tekaci        32
Тихоходы             31
Динамо               22
Строительный двор    22
МФТИ                 20
Новотор              19
СК                   17
Velo36               16
КЛЭБ                 16
RunLepraRun          16
Айвика               15
Мещера               15
No Tag Runners       15
Сокол                15
Wake&Run             15
Аэробия              14
Piranha              14
СК Ромашково         14
Мир                  13
Меркурий             13
Galaxy               12
МАИ                  11
Крылатый батальон    10
Сормович             10
Seibukai             10
MOEXRUN              10
НэО                  10
Стимул               10
dtype: int64