In [None]:
import pandas as pd
import numpy as np
from datetime import datetime


In [None]:
# Columns of the exported CSV that the loading cell drops right after reading.
K_PROJECT = "Project"
K_ACCOUNT = "Account"
K_PAYMENT_ACCOUNT = "Payment account"
K_MERCHANT = "Merchant"
K_ADDRESS = "Address"
K_NOTE = "Note"
K_TAGS = "Tags"
K_AUTHOR = "Author"
K_IMAGE1 = "Image 1"
K_IMAGE2 = "Image 2"
K_IMAGE3 = "Image 3"
K_CURRENCY = "Currency"

# Columns of the exported CSV that the analysis keeps and reads.
K_DATETIME = "Date time"
K_TYPE = "Type"
K_CATEGORY = "Category"
K_AMOUNT = "Amount"
K_CURRENCY_RATE = "Currency rate (Relative standard currency)"

# Values matched against the Type / Category columns when splitting the data.
# K_EXPENSE and K_INCOME double as column names of the merged summary table.
K_EXPENSE = "Expense"
K_INCOME = "Income"
K_TRANSFER = "Transfer"
K_INVESTING = "Investing"

# Names used for the aggregated tables built by the notebook.
K_INDEX = "index"
K_MONTH = "Month"
K_TOTAL = "Total"
K_EXPENSE_SHARE = "Expense share"


In [None]:
def parseDatetime(x: str):
    """Parse a date-time string with a Russian or numeric date into a datetime.

    The input is lower-cased, Russian month abbreviations are replaced by
    their English three-letter equivalents, the year marker ``"г. "`` and all
    dots are removed, and the result is parsed first as
    ``'%d %b %Y %H:%M:%S'`` (e.g. ``"5 февр. 2021 г. 12:30:45"``) and then as
    ``'%d%m%Y %H:%M:%S'`` (e.g. ``"05.02.2021 12:30:45"``, whose dots have
    been stripped).

    Args:
        x: The raw date-time text.

    Returns:
        A naive ``datetime``.

    Raises:
        ValueError: If the text matches neither supported format.
    """
    # Order matters: longer abbreviations ("февр", "сент", "нояб") must be
    # replaced before their shorter prefixes, and dots are stripped last so
    # that the "г. " marker can still be recognised.
    rus = {"янв": "jan",
           "февр": "feb", "фев": "feb",
           "мар": "mar",
           "апр": "apr",
           "май": "may", "мая": "may",
           "июн": "jun",
           "июл": "jul",
           "авг": "aug",
           "сент": "sep", "сен": "sep",
           "окт": "oct",
           "нояб": "nov", "ноя": "nov",
           "дек": "dec",
           "г. ": "",
           ".": ""}
    # Lower-case once; every replacement value is already lower-case.
    normalized = x.lower()
    for r, e in rus.items():
        normalized = normalized.replace(r, e)

    # NOTE: %b is matched against the current locale's month abbreviations;
    # the English names above assume the default (C/English) locale.
    for fmt in ('%d %b %Y %H:%M:%S', '%d%m%Y %H:%M:%S'):
        try:
            return datetime.strptime(normalized, fmt)
        except ValueError:
            # Only a format mismatch triggers the fallback; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            continue
    raise ValueError(f"Unsupported date-time format: {x!r}")


In [None]:
# Load the tab-separated export; malformed lines are reported as warnings
# instead of aborting the read.
df = pd.read_csv("data/test.csv", on_bad_lines="warn", sep="\t")

# Drop the columns the analysis never reads.
df = df.drop(columns=[K_PROJECT, K_ACCOUNT, K_PAYMENT_ACCOUNT, K_MERCHANT,
                      K_ADDRESS, K_NOTE, K_TAGS, K_AUTHOR,
                      K_IMAGE1, K_IMAGE2, K_IMAGE3, K_CURRENCY])

# Store the timestamp as POSIX seconds (float). datetime.timestamp() reads a
# naive datetime as local time, which makes it the inverse of
# datetime.fromtimestamp(); converting through a datetime64 -> int64 cast
# instead treats the value as UTC (so decoding with fromtimestamp() shifts
# every row by the machine's UTC offset, possibly into another month) and
# depends on the datetime64 resolution being nanoseconds.
df[K_DATETIME] = df[K_DATETIME].apply(lambda x: parseDatetime(x).timestamp())

# Amounts use a non-breaking space as group separator and a decimal comma.
# Going through astype(str) + .str keeps missing values and already-numeric
# columns from raising AttributeError.
df[K_AMOUNT] = (
    df[K_AMOUNT]
    .astype(str)
    .str.replace(u"\u00A0", '', regex=False)
    .str.replace(',', '.', regex=False)
    .astype(float)
)
df[K_CURRENCY_RATE] = (
    df[K_CURRENCY_RATE]
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Scale every amount by its row's currency rate.
df[K_AMOUNT] = df[K_AMOUNT]*df[K_CURRENCY_RATE]


In [None]:
# Earliest and latest transaction moments, decoded from the POSIX-seconds
# column back into datetime objects.
start_date, end_date = (
    datetime.fromtimestamp(boundary)
    for boundary in (df[K_DATETIME].min(), df[K_DATETIME].max())
)


In [None]:
# Split the transactions by their Type value; investments are the transfers
# whose category is the investing one.
expense_ = df.loc[df[K_TYPE].eq(K_EXPENSE)]
income_ = df.loc[df[K_TYPE].eq(K_INCOME)]
invest_ = df.loc[df[K_TYPE].eq(K_TRANSFER) & df[K_CATEGORY].eq(K_INVESTING)]



In [None]:
def transformByCat(data:pd.DataFrame, level:str):
    """Sum absolute amounts per category for every month or year.

    The rows of the result cover every period between the module-level
    ``start_date`` and ``end_date`` (inclusive), so tables built from
    different subsets of the data share the same index.

    Args:
        data: Transactions with the ``K_DATETIME`` (POSIX seconds),
            ``K_CATEGORY`` and ``K_AMOUNT`` columns.
        level: ``"M"`` for monthly rows labelled ``"%m-%Y"`` or ``"Y"`` for
            yearly rows labelled ``"%Y"``.

    Returns:
        A DataFrame indexed by period label (index name ``K_INDEX``) with one
        float column per category found in ``data`` plus a ``K_TOTAL`` column
        holding the row sums.

    Raises:
        ValueError: If ``level`` is neither ``"M"`` nor ``"Y"``.
    """
    if level == "M":
        date_mode = "%m-%Y"
    elif level == "Y":
        date_mode = "%Y"
    else:
        raise ValueError(f"level must be 'M' or 'Y', got {level!r}")

    unique_cats = data[K_CATEGORY].unique()

    periods = pd.period_range(start=start_date, end=end_date, freq=level)
    labels = pd.Index([el.strftime(date_mode) for el in periods], name=K_INDEX)

    # Allocate the zero-filled float table in one step. Zeroing an empty
    # object-dtype frame afterwards through `.values[:] = 0` fails when pandas
    # hands out read-only arrays (Copy-on-Write).
    data_by_cat = pd.DataFrame(0.0, index=labels, columns=unique_cats)

    # Plain column iteration with label-based `.at` access avoids building a
    # boolean mask over the whole index for every transaction.
    for stamp, cat, value in zip(data[K_DATETIME], data[K_CATEGORY], data[K_AMOUNT]):
        el = datetime.fromtimestamp(stamp).strftime(date_mode)
        # Rows outside the start_date..end_date range have no matching
        # period and are skipped.
        if el in data_by_cat.index:
            data_by_cat.at[el, cat] += abs(value)

    data_by_cat[K_TOTAL] = data_by_cat.sum(axis=1)

    return data_by_cat

def merge(expense:pd.DataFrame, income:pd.DataFrame):
    """Combine expense and income totals into one summary table.

    Args:
        expense: Table with a ``K_TOTAL`` column; its index becomes the index
            of the result.
        income: Table with a ``K_TOTAL`` column, aligned to ``expense`` by
            index label.

    Returns:
        A DataFrame with the ``K_EXPENSE`` and ``K_INCOME`` totals and a
        ``K_EXPENSE_SHARE`` column holding expenses as a percentage of
        income. The share is NaN wherever income is zero or missing.
    """
    result = pd.DataFrame(index=expense.index)
    result[K_EXPENSE] = expense[K_TOTAL]
    result[K_INCOME] = income[K_TOTAL]

    # A period without income has no meaningful share: mask zero income so
    # the division yields NaN instead of inf (or ZeroDivisionError when the
    # totals arrive as object-dtype columns).
    expense_total = pd.to_numeric(result[K_EXPENSE])
    income_total = pd.to_numeric(result[K_INCOME])
    income_total = income_total.where(income_total != 0)

    result[K_EXPENSE_SHARE] = expense_total / income_total * 100
    return result


In [None]:
# Per-month category tables and the combined month-level summary.
expenseM, incomeM = transformByCat(expense_, 'M'), transformByCat(income_, 'M')
totalM = merge(expenseM, incomeM)

# The same aggregation at year granularity.
expenseY, incomeY = transformByCat(expense_, 'Y'), transformByCat(income_, 'Y')
totalY = merge(expenseY, incomeY)


In [None]:
# Monthly income per category; the row-sum column is left out of the chart.
incomeM.loc[:, ~incomeM.columns.isin([K_TOTAL])].plot()


In [None]:
# Line chart of every column of the monthly summary table.
totalM.plot(kind="line")

In [None]:
# Yearly expense and income as bars on the left (currency) axis. The
# percentage column is excluded from the bars: it is a different unit and is
# drawn once, as the red line on the secondary axis limited to 0-100.
ax = totalY[[K_EXPENSE, K_INCOME]].plot(kind='bar')
totalY[K_EXPENSE_SHARE].plot(ax=ax, secondary_y=True, color='r', ylim=(0,100))


In [None]:
"{:,}".format(totalY[K_INCOME].sum() - totalY[K_EXPENSE].sum())
