In [1]:
import tqdm
import json
import requests
import pandas as pd
import datetime as dt
import dateutil as du
import itertools as its

from dateutil.relativedelta import relativedelta

In [2]:
# Switch the LC_TIME category to Russian so that locale-aware date parsing
# (the `%b` month abbreviation consumed by strptime in `parse_date`) follows
# Russian conventions. Requires the `ru_RU.UTF-8` locale to be installed on
# the host; `setlocale` raises `locale.Error` otherwise.
import locale

locale.setlocale(category=locale.LC_TIME, locale='ru_RU.UTF-8')

'ru_RU.UTF-8'

In [3]:
# Search-engine name -> numeric id that `fetch_raw` substitutes into the
# `id` query parameter of the statistics URL.
SE2CODE = dict(
    yandex=1839501,
    search=1839500,
)
# Search-engine names in insertion order; used to drive the download loop.
SES = [*SE2CODE]
SES

['yandex', 'search']

Парсим до `2017-09-28`, потому что такая правая граница была в готовом датасете, а следующий код основывается на нём.

In [4]:
# Inclusive bounds of the period kept in the final dataset (midnight
# datetimes). Built with the stdlib constructor instead of `du.parser.parse`:
# `import dateutil as du` does not itself import the `dateutil.parser`
# submodule, so the attribute lookup only worked when some other package had
# already imported it.
FIRST_DAY, LAST_DAY = dt.datetime(2014, 1, 1), dt.datetime(2017, 9, 28)
'[{}, {}]'.format(FIRST_DAY, LAST_DAY)

'[2014-01-01 00:00:00, 2017-09-28 00:00:00]'

In [5]:
# `date` values to request, as 'YYYY-MM-DD' strings: a grid with a 30-day
# step that starts 31 days before FIRST_DAY and stops no later than 31 days
# after LAST_DAY. (Presumably the padding guarantees that the 30-day windows
# returned by the API cover the whole [FIRST_DAY, LAST_DAY] range; the
# row-count assert after the download loop checks this.)
END_DATES = (
    pd.date_range(
        start=FIRST_DAY - relativedelta(days=31),
        end=LAST_DAY + relativedelta(days=31),
        freq='30d',
    )
    .strftime('%Y-%m-%d')
    .tolist()
)
END_DATES[:4]

['2013-12-01', '2013-12-31', '2014-01-30', '2014-03-01']

In [6]:
def fetch_raw(se, end_date, timeout=60):
    """Download the raw browser-dynamics response for one search engine.

    Parameters
    ----------
    se : str
        Key of ``SE2CODE``; its value is sent as the ``id`` query parameter.
    end_date : str
        Value of the ``date`` query parameter (the notebook passes
        ``'YYYY-MM-DD'`` strings).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the whole download loop indefinitely.

    Returns
    -------
    str
        Response body as text (parsed as JSON by ``parse_raw``).

    Raises
    ------
    KeyError
        If ``se`` is not a key of ``SE2CODE``.
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        never handed to the JSON parser.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    url = (
        'https://top.mail.ru/json/browsersdynamics.hc?id={}&period=0&date={}&aggregation=sum&sids=firefox,opera,msie,chrome,chrome/chrome-yab&ytype=hits&gtype=line'
        .format(SE2CODE[se], end_date)
    )
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def parse_date(raw_date):
    """Convert a ``'<day> <month abbreviation> <year>'`` label to a datetime.

    The month abbreviation (``%b``) is interpreted according to the current
    ``LC_TIME`` locale. Raises ``ValueError`` when the label does not match
    the expected format.
    """
    label_format = '%d %b %Y'
    parsed = dt.datetime.strptime(raw_date, label_format)
    return parsed

In [7]:
def parse_raw(raw, se, end_date):
    """Turn one raw JSON response into a tidy per-day, per-browser frame.

    Parameters
    ----------
    raw : str
        JSON text with an ``xAxis.categories`` list of 30 date labels and a
        ``series`` list whose items carry ``sid`` and ``data``.
    se : str
        Search-engine name written into the ``se`` column.
    end_date : str
        Not used by the parser; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        150 rows (30 days x 5 browsers) with columns
        ``['date', 'se', 'browser', 'hits']``, sorted by those columns and
        with a fresh 0..149 index. The ``chrome`` rows hold the chrome series
        minus the ``yabrowser`` series.

    Raises
    ------
    AssertionError
        If the response does not describe exactly 30 days and 5 series.
    """
    payload = json.loads(raw)

    date = pd.Series([parse_date(label) for label in payload['xAxis']['categories']])
    assert len(date) == 30, 'expected 30 date labels, got {}'.format(len(date))

    series = payload['series']
    if len(series) == 5:
        pairs = [(entry['data'], entry['sid']) for entry in series]
    else:
        # Fallback: the first series is reused for every browser.
        # NOTE(review): presumably this covers responses without a per-browser
        # breakdown (e.g. no data) -- confirm against the API.
        pairs = list(zip(its.repeat(series[0]['data']),
                         ['chrome', 'firefox', 'msie', 'opera', 'yabrowser']))
    assert len(pairs) == 5, 'expected 5 browser series, got {}'.format(len(pairs))

    parts = []
    for data, browser in pairs:
        # `data` is either a list of {'y': value} records or a list of plain
        # numbers; in both cases the value column is normalised to 'hits'.
        part = pd.DataFrame(data).rename(columns={'y': 'hits', 0: 'hits', '0': 'hits'})
        part['date'] = date
        part['se'] = se
        part['browser'] = browser if browser != 'chrome/chrome-yab' else 'yabrowser'
        part = part[['date', 'se', 'browser', 'hits']]
        assert len(part) == 30, 'expected 30 rows for {}, got {}'.format(browser, len(part))
        parts.append(part)

    df = pd.concat(parts)
    df = df.sort_values(df.columns.tolist()).reset_index(drop=True)

    # Subtract the yabrowser series from chrome, day by day. After the sort
    # both selections are in date order, so positional subtraction lines up.
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    # NOTE(review): presumably the chrome series includes the
    # 'chrome/chrome-yab' traffic -- confirm.
    is_chrome = df.browser == 'chrome'
    is_yabrowser = df.browser == 'yabrowser'
    df.loc[is_chrome, 'hits'] = (
        df.loc[is_chrome, 'hits'].values - df.loc[is_yabrowser, 'hits'].values
    )
    assert len(df) == 150, 'expected 150 rows, got {}'.format(len(df))
    return df

In [8]:
%%time
parts = []
for se, end_date in tqdm.tqdm_notebook(its.product(SES, END_DATES)):
    raw = fetch_raw(se, end_date)
    parts.append(parse_raw(raw, se, end_date))

df = pd.concat(parts)
df = df[(df.date >= FIRST_DAY) & (df.date <= LAST_DAY)]
df.sort_values(df.columns.tolist(), inplace=True)
df.reset_index(drop=True, inplace=True)
assert len(df) == ((LAST_DAY - FIRST_DAY).days + 1) * len(SES) * 5


CPU times: user 4.55 s, sys: 172 ms, total: 4.72 s
Wall time: 1min 53s


In [9]:
# Show the first rows of the assembled frame as a quick sanity check.
df.head()

Unnamed: 0,date,se,browser,hits
0,2014-01-01,search,chrome,44778823
1,2014-01-01,search,firefox,16417487
2,2014-01-01,search,msie,7056536
3,2014-01-01,search,opera,12246238
4,2014-01-01,search,yabrowser,8845385


In [10]:
# Show the last rows to confirm the frame ends at the LAST_DAY boundary.
df.tail()

Unnamed: 0,date,se,browser,hits
13665,2017-09-28,yandex,chrome,10982236
13666,2017-09-28,yandex,firefox,5137327
13667,2017-09-28,yandex,msie,2133180
13668,2017-09-28,yandex,opera,3448505
13669,2017-09-28,yandex,yabrowser,11082319


In [11]:
# Inspect the column dtypes before exporting.
df.dtypes

date       datetime64[ns]
se                 object
browser            object
hits                int64
dtype: object

In [12]:
# Write the frame to a CSV file in the working directory, without the index.
df.to_csv('my_raw_data.csv', index=False)