In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
import seaborn as sns

In [None]:
# Widen the notebook cells (very nice on wide screens); set the percentage as you like it,
# e.g. width:100% to use the whole window.
# `display` and `HTML` are imported from the public `IPython.display` module:
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:

# Notebook-wide matplotlib defaults (font sizes, grid, tick styling, figure size).
conf = {
    'font.size': 14.0,
    'axes.grid': True,
    'axes.axisbelow': True,
    'axes.edgecolor': 'black',
#    'axes.facecolor': '#E5E5EF',
#    'axes.facecolor': '#E0E0EF',
    'axes.labelcolor': 'black',
    'axes.titlesize': 15.6,
    'axes.labelsize': 'large',
    # 'figure.figsize' used to be listed twice in this dict; only the last
    # entry ever took effect, so that single effective value is kept.
    'figure.figsize': (12., 8.),
    'figure.titlesize': 'x-large',
    'grid.linewidth': 1.3,
    'xtick.labelsize': 'large',
    'ytick.labelsize': 'large',
    'xtick.color': 'black',
    'ytick.color': 'black',
    'legend.fontsize': 'large',
}

plt.rcParams.update(conf)

### base loading and cleaning

In [None]:
# Load the 'Form Responses 1' sheet of the feedback workbook into df_form.
df_form = pd.read_excel('Feedback_Animefest_2019.xlsx', sheet_name='Form Responses 1')

In [None]:
# Make sure the 'Timestamp' column is a proper datetime column (needed for time-based resampling).
df_form['Timestamp'] = pd.to_datetime(df_form['Timestamp'])

In [None]:
# Load the 'Appky' sheet of the same workbook into df_app.
df_app = pd.read_excel('Feedback_Animefest_2019.xlsx', sheet_name='Appky')

In [None]:
# Convert the app sheet's feedback-time column to datetime as well.
df_app['ƒças feedbacku'] = pd.to_datetime(df_app['ƒças feedbacku'])

In [None]:
df_form = df_form.replace(to_replace={'üòêv pohodƒõ': 'v pohodƒõ', 'üòÉdobr√©': 'dobr√©', 'üòû≈°patn√©': '≈°patn√©', 'üòê v pohodƒõ': 'v pohodƒõ', 'üòÉ dobr√©': 'dobr√©', 'üòû ≈°patn√©': '≈°patn√©', 'üòÉAno': 'Ano', 'üòûNe': 'Ne'})

In [None]:
df_form.head()

##### several columns share the same name and are filled mutually exclusively; merging those

In [None]:
# Columns whose name ends in '.1' -- presumably the auto-renamed second copy of a
# column that appears twice in the sheet (TODO confirm against the workbook).
duplicit_columns = [name for name in df_form.columns if name.endswith('.1')]
duplicit_columns

In [None]:
# For every '<name>' / '<name>.1' pair: the largest number of non-null cells found in a
# single row (1 means the two columns are never filled at the same time).
[df_form[[dup[:-2], dup]].notnull().sum(axis=1).max() for dup in duplicit_columns]

In [None]:
# Fold every '<name>.1' column into its '<name>' twin, then drop the '.1' columns.
for dup in duplicit_columns:
    base = dup[:-2]  # column name without the trailing '.1'
    filled_per_row = df_form[[base, dup]].notnull().sum(axis=1)
    assert filled_per_row.max() == 1  # only one column is filled, the other is N/A
    dup_filled = df_form[dup].notnull()
    df_form.loc[dup_filled, base] = df_form.loc[dup_filled, dup]
df_form = df_form.drop(columns=duplicit_columns)

In [None]:
# Rating columns are recognised by these two suffixes appended to the programme name.
info_suffix = ' [Dozvƒõdƒõl(a) jsem se zaj√≠mav√© informace]'
fun_suffix = ' [Bavil(a) jsem se]'
columns_series = df_form.columns.to_series()
# plain substring match (regex=False): the suffixes contain brackets and parentheses
is_info_rating = columns_series.str.contains(info_suffix, regex=False)
is_fun_rating = columns_series.str.contains(fun_suffix, regex=False)
program_columns_ratings = columns_series[is_info_rating | is_fun_rating]
program_columns_ratings.head()

In [None]:
# Programme names: the rating column names with either suffix removed, de-duplicated.
names_without_info = program_columns_ratings.str.replace(info_suffix, '', regex=False)
names_without_suffix = names_without_info.str.replace(fun_suffix, '', regex=False)
program_columns = pd.Series(names_without_suffix.unique())
program_columns.head()

##### some columns that should have same content have nans somewhere, fixing this

In [None]:
# Example pair of rating columns (same programme, both suffixes) used to inspect the problem:
# show rows where the first column says 'Nedostal(a) jsem se', next to the second column.
weird_cols = ['2. svƒõtov√° v√°lka z pohledu Japonska a ≈æivot c√≠sa≈ôe Hirohita [Dozvƒõdƒõl(a) jsem se zaj√≠mav√© informace]', '2. svƒõtov√° v√°lka z pohledu Japonska a ≈æivot c√≠sa≈ôe Hirohita [Bavil(a) jsem se]']
df_form[df_form[weird_cols[0]] == 'Nedostal(a) jsem se'][weird_cols].head()

In [None]:
# When one rating column of a programme is empty and its sibling column holds one of the
# two "not attended" answers, copy that answer into the empty column.
not_attended_answers = ['Nedostal(a) jsem se', 'Nez√∫ƒçastnil(a) jsem se']
for program in program_columns:
    info_col = program + info_suffix
    fun_col = program + fun_suffix
    # both masks are computed before either column is modified
    info_cond = df_form[info_col].isna() & df_form[fun_col].isin(not_attended_answers)
    fun_cond = df_form[fun_col].isna() & df_form[info_col].isin(not_attended_answers)
    df_form.loc[info_cond, info_col] = df_form.loc[info_cond, fun_col]
    df_form.loc[fun_cond, fun_col] = df_form.loc[fun_cond, info_col]


In [None]:
# Display the same example rows again to check the effect of the fill-in loop.
df_form[df_form[weird_cols[0]] == 'Nedostal(a) jsem se'][weird_cols].head()

##### some columns contain "not attended" where only a rating or NaN is expected, fixing this

In [None]:
# Example pair of rating columns for one programme: show rows where the second column
# holds the rating 'v pohodƒõ', next to the first column.
weird_cols = ['Maul Cosplay Q&A [Dozvƒõdƒõl(a) jsem se zaj√≠mav√© informace]', 'Maul Cosplay Q&A [Bavil(a) jsem se]']
df_form[df_form[weird_cols[1]] == 'v pohodƒõ'][weird_cols].head()

In [None]:
# When one rating column of a programme holds a real rating and its sibling says
# "not attended", the contradicting "not attended" answer is blanked out (set to NaN).
rating_answers = ['dobr√©', 'v pohodƒõ', '≈°patn√©']
not_attended_answers = ['Nedostal(a) jsem se', 'Nez√∫ƒçastnil(a) jsem se']
for program in program_columns:
    info_col = program + info_suffix
    fun_col = program + fun_suffix
    # both masks are computed before either column is modified
    info_cond = df_form[fun_col].isin(rating_answers) & df_form[info_col].isin(not_attended_answers)
    fun_cond = df_form[info_col].isin(rating_answers) & df_form[fun_col].isin(not_attended_answers)
    df_form.loc[info_cond, info_col] = np.nan
    df_form.loc[fun_cond, fun_col] = np.nan


In [None]:
# Display the same example rows again to check the effect of the clean-up loop.
df_form[df_form[weird_cols[1]] == 'v pohodƒõ'][weird_cols].head()

##### some column pairs that should carry the same type of attendance answer carry different ones. Converting 'Nedostal(a) jsem se' to 'Nez√∫ƒçastnil(a) jsem se'

In [None]:
# Example pair of rating columns for one programme: show the last rows where the first
# column says 'Nedostal(a) jsem se', next to the second column.
weird_cols = ['12 ran cosplayerov√Ωch [Dozvƒõdƒõl(a) jsem se zaj√≠mav√© informace]', '12 ran cosplayerov√Ωch [Bavil(a) jsem se]']
df_form[df_form[weird_cols[0]] == 'Nedostal(a) jsem se'][weird_cols].tail()

In [None]:
# When the two rating columns of a programme disagree on the kind of "not attended"
# answer, the 'Nedostal(a) jsem se' side is overwritten with the sibling's answer.
for program in program_columns:
    info_col = program + info_suffix
    fun_col = program + fun_suffix
    # both masks are computed before either column is modified
    info_cond = (df_form[info_col] == 'Nedostal(a) jsem se') & (df_form[fun_col] == 'Nez√∫ƒçastnil(a) jsem se')
    fun_cond = (df_form[fun_col] == 'Nedostal(a) jsem se') & (df_form[info_col] == 'Nez√∫ƒçastnil(a) jsem se')
    df_form.loc[info_cond, info_col] = df_form.loc[info_cond, fun_col]
    df_form.loc[fun_cond, fun_col] = df_form.loc[fun_cond, info_col]


In [None]:
# Display the same example rows again to check the effect of the unification loop.
df_form[df_form[weird_cols[0]] == 'Nedostal(a) jsem se'][weird_cols].tail()

##### sanity checks

In [None]:
# Sanity check: 'Nedostal(a) jsem se' must occur in both rating columns of a programme
# on exactly the same rows (equal counts per column and for the conjunction).
for program in program_columns:
    info_flag = df_form[program + info_suffix] == 'Nedostal(a) jsem se'
    fun_flag = df_form[program + fun_suffix] == 'Nedostal(a) jsem se'
    info_sum = info_flag.sum()
    fun_sum = fun_flag.sum()
    both_sum = (info_flag & fun_flag).sum()
    if not (info_sum == fun_sum == both_sum):
        # print the offending programme before the assertion stops the cell
        print(program, info_sum, fun_sum, both_sum)

    assert info_sum == fun_sum == both_sum

In [None]:
# Sanity check: 'Nez√∫ƒçastnil(a) jsem se' must occur in both rating columns of a programme
# on exactly the same rows (equal counts per column and for the conjunction).
for program in program_columns:
    info_flag = df_form[program + info_suffix] == 'Nez√∫ƒçastnil(a) jsem se'
    fun_flag = df_form[program + fun_suffix] == 'Nez√∫ƒçastnil(a) jsem se'
    info_sum = info_flag.sum()
    fun_sum = fun_flag.sum()
    both_sum = (info_flag & fun_flag).sum()
    if not (info_sum == fun_sum == both_sum):
        # print the offending programme before the assertion stops the cell
        print(program, info_sum, fun_sum, both_sum)

    assert info_sum == fun_sum == both_sum

In [None]:
# Sanity check: a cell holding a rating (or nothing at all) in one rating column of a
# programme must be matched by a rating/empty cell in the sibling column on the same row.
rated_or_empty = ['dobr√©', 'v pohodƒõ', '≈°patn√©', np.nan]
for program in program_columns:
    info_ok = df_form[program + info_suffix].isin(rated_or_empty)
    fun_ok = df_form[program + fun_suffix].isin(rated_or_empty)
    info_sum = info_ok.sum()
    fun_sum = fun_ok.sum()
    # Row-wise AND first, then count. The previous version AND-ed the two integer
    # sums bit by bit, which does not count the rows where both conditions hold.
    both_sum = (info_ok & fun_ok).sum()
    if info_sum != fun_sum or fun_sum != both_sum:
        # print the offending programme before the assertion stops the cell
        print(program, info_sum, fun_sum, both_sum)

    assert info_sum == fun_sum == both_sum

### base EDA

In [None]:
df_form.head()

In [None]:
df_form.describe()

In [None]:
# (column, number of distinct values) for every column with fewer than 10 distinct values.
low_cardinality = []
for col in df_form.columns:
    n_unique = len(df_form[col].unique())
    if n_unique < 10:
        low_cardinality.append((col, n_unique))
low_cardinality

In [None]:
# Bar chart: how many columns have a given number of distinct values.
unique_per_column = df_form.columns.map(lambda col: len(df_form[col].unique()))
unique_per_column.to_series().value_counts().sort_index().plot.bar(figsize=(25, 10))

In [None]:
# Bar chart: how many columns have a given number of filled-in (non-null) answers.
filled_per_column = df_form.columns.map(lambda col: sum(df_form[col].notnull()))
filled_per_column.to_series().value_counts().sort_index().plot.bar(figsize=(25, 10))

In [None]:
# Columns in which the answer 'Nez√∫ƒçastnil(a) jsem se' occurs at least once.
attend_columns = []
for col in df_form.columns:
    if 'Nez√∫ƒçastnil(a) jsem se' in df_form[col].values:
        attend_columns.append(col)

In [None]:
attend_columns[:10]

In [None]:
len(attend_columns)

In [None]:
# Number of columns with at most 8 distinct values.
sum(1 for col in df_form.columns if len(df_form[col].unique()) <= 8)

In [None]:
# Number of columns with fewer than 10 distinct values.
sum(1 for col in df_form.columns if len(df_form[col].unique()) < 10)

In [None]:
# Number of columns with fewer than 20 distinct values.
sum(1 for col in df_form.columns if len(df_form[col].unique()) < 20)

In [None]:
# Number of columns with at most 8 distinct values AND more than 100 non-null answers.
sum(
    1
    for col in df_form.columns
    if len(df_form[col].unique()) <= 8 and sum(df_form[col].notnull()) > 100
)

In [None]:
# All answers from the attendance-style columns as one long Series, then a pie chart of
# how often each (non-null) answer occurs.
all_attends = df_form[attend_columns].stack()
answered = all_attends[all_attends.notna()]
answered.value_counts().plot.pie()

In [None]:
# Same answer frequencies as a horizontal bar chart.
all_attends[all_attends.notna()].value_counts().plot.barh()

In [None]:
# Answer frequencies once more, leaving out 'Nez√∫ƒçastnil(a) jsem se'.
is_answered = all_attends.notna()
is_not_absent = all_attends != 'Nez√∫ƒçastnil(a) jsem se'
all_attends[is_answered & is_not_absent].value_counts().plot.barh()

In [None]:
df_app.head()

In [None]:
df_app.describe()

In [None]:
def abs_vals(pct, allvals):
    """Turn a pie-slice percentage back into an absolute count, formatted as text.

    Meant to be used as a matplotlib ``autopct`` callback via
    ``functools.partial(abs_vals, allvals=values)``.

    Parameters
    ----------
    pct : float
        Share of the slice in percent (0-100).
    allvals : array-like of numbers
        All values of the pie; their sum is the 100 % total.

    Returns
    -------
    str
        The slice's absolute value as an integer string.
    """
    # Round instead of truncating: the percentage arrives as a float, so
    # pct / 100 * total can land just below the true count (e.g. 0.9999999),
    # which int() alone would cut down by one.
    absolute = int(np.round(pct / 100. * np.sum(allvals)))
    return '{:d}'.format(absolute)


In [None]:
# Pie chart of the 'Bydli≈°tƒõ' answers, each slice labelled with its absolute count.
feedback_geo = df_form['Bydli≈°tƒõ'].value_counts()
feedback_geo.plot.pie(autopct=lambda pct: abs_vals(pct, allvals=feedback_geo))

In [None]:
# One pie chart (slices labelled with absolute counts) for every column that has at most
# 8 distinct values and more than 100 non-null answers.
well_filled_columns = [
    col for col in df_form.columns
    if len(df_form[col].unique()) <= 8 and sum(df_form[col].notnull()) > 100
]
for col in well_filled_columns:
    plt.figure()
    feedback_i = df_form[col].value_counts()
    feedback_i.plot.pie(autopct=partial(abs_vals, allvals=feedback_i))
    plt.show()

# todo: add filtering that drops the items with fewer than 20 attendees; also analyse who answered attended / did not attend and how

In [None]:
# Columns with at most 8 distinct values that are neither programme ratings nor comments.
excluded_markers = ('[Dozvƒõdƒõl(a) jsem se zaj√≠mav√© informace]', '[Bavil(a) jsem se]', 'Koment√°≈ô:')
[
    col for col in df_form.columns
    if len(df_form[col].unique()) <= 8 and not any(marker in col for marker in excluded_markers)
]

In [None]:
# Grid of pie charts (3 per row) for every column with at most 8 distinct values that is
# neither a programme rating nor a comment column.
excluded_markers = ('[Dozvƒõdƒõl(a) jsem se zaj√≠mav√© informace]', '[Bavil(a) jsem se]', 'Koment√°≈ô:')
cols_to_plot = [
    col for col in df_form.columns
    if len(df_form[col].unique()) <= 8 and not any(marker in col for marker in excluded_markers)
]
cols = 3
# at least one row, so an empty selection does not ask matplotlib for a 0-row grid
rows = max(1, int(np.ceil(len(cols_to_plot) / cols)))
# squeeze=False keeps `axes` two-dimensional even when everything fits on a single row,
# so the [row, column] indexing below always works
fig, axes = plt.subplots(rows, cols, figsize=(35, 120), squeeze=False)

for i, col in enumerate(cols_to_plot):
    ax = axes[i // cols, i % cols]
    feedback_i = df_form[col].value_counts()
    feedback_i.plot.pie(autopct=partial(abs_vals, allvals=feedback_i), ax=ax, y=None)
# hide the grid cells left over in the last row instead of showing empty axes
for unused_ax in axes.flat[len(cols_to_plot):]:
    unused_ax.set_visible(False)
plt.show()

# todo: add filtering that drops the items with fewer than 20 attendees; also analyse who answered attended / did not attend and how

In [None]:
# Number of submitted forms per calendar day.
# - `resample(..., how='count')` no longer exists in current pandas; the aggregation is
#   now a method call on the resampler.
# - Summing the per-timestamp counts (instead of counting distinct timestamps) also
#   counts submissions that share the exact same timestamp.
feedback_times = df_form['Timestamp'].value_counts(sort=False).sort_index().resample('1D').sum()
ax = feedback_times.plot.bar()
ax.set_title('Poƒçet vyplnƒõn√≠ za den, v≈°e ƒçerven')
# Day of month without zero padding. Built from `dt.day` because the strftime flags
# for that ('%#d' / '%-d') are platform specific.
ax.set_xticklabels(['{}.'.format(dt.day) for dt in feedback_times.index], rotation=0)
plt.show()

In [None]:
# Answer frequencies per rating column: rows = answers, columns = rating columns.
program_ratings = df_form[program_columns_ratings].apply(pd.Series.value_counts)

In [None]:
# Attendance per rating column = number of answers that are an actual rating
# (missing counts are treated as 0).
attended_col = (
    program_ratings.loc[['dobr√©', 'v pohodƒõ', '≈°patn√©']]
    .fillna(0)
    .sum(axis=0)
    .rename('Z√∫ƒçastnil(a) jsem se')
)
# side by side with the number of 'Nedostal(a) jsem se' answers
program_attends = pd.concat([attended_col, program_ratings.loc['Nedostal(a) jsem se']], axis=1)
program_attends.head()

In [None]:
# Remove the 'Nez√∫ƒçastnil(a) jsem se' row from the frequency table.
# errors='ignore' keeps the cell re-runnable: without it a second execution raises
# KeyError because the row is already gone.
program_ratings = program_ratings.drop('Nez√∫ƒçastnil(a) jsem se', errors='ignore')

In [None]:
# Shape of the rating-column slice next to the number of rating columns.
# (The name used here before, `program_rating_columns`, is never defined in the
# notebook; the Series of rating column names is `program_columns_ratings`.)
df_form[program_columns_ratings].shape, len(program_columns_ratings)

In [None]:
program_ratings

In [None]:
def remove_suffix(string):
    """Return `string` with both rating-question suffixes removed.

    Every occurrence of either suffix (including its leading space) is deleted;
    strings that contain neither suffix are returned unchanged.
    """
    for suffix in (' [Dozvƒõdƒõl(a) jsem se zaj√≠mav√© informace]', ' [Bavil(a) jsem se]'):
        string = string.replace(suffix, '')
    return string
# Per programme (both rating columns merged under the suffix-free name): the larger of
# the two counts for every answer. Grouping is done on the transposed frame because
# `groupby(..., axis=1)` is deprecated in current pandas.
program_ratings.T.groupby(program_ratings.columns.map(remove_suffix)).max().T

In [None]:
# Bar colour (matplotlib colour name) for each programme category.
type_to_color = {'p≈ôedn√°≈°ky': 'g', 'workshopy': 'purple', 'soutƒõ≈æe': 'yellow', 'dopl≈àkov√Ω program': 'b', 'prom√≠t√°n√≠': 'r', 'divadlo': 'pink'}
def patch_label_coords(p):
    """Return the (x, y) data coordinates at which to anchor a bar's value label.

    `p` is a bar patch offering `get_x()`, `get_y()` and `get_width()`. The x
    coordinate is the end of the bar (shifted by a further -0.03 for bars of
    negative width); the y coordinate is the bar's y position minus 0.25.
    """
    bar_width = p.get_width()
    offset = -0.03 if bar_width < 0 else 0
    label_x = p.get_x() + bar_width + offset
    label_y = p.get_y() - 0.25
    return label_x, label_y

In [None]:
# Stacked horizontal bars per programme: attended vs. 'Nedostal(a) jsem se', sorted by
# attendance. The two rating columns of a programme are merged by taking the larger
# count. Rows are grouped directly by their suffix-free name, which replaces the
# deprecated transpose + `groupby(..., axis=1)` + transpose round trip.
program_attends.groupby(program_attends.index.map(remove_suffix)).max().sort_values('Z√∫ƒçastnil(a) jsem se', ascending=True).plot.barh(stacked=True, figsize=(30, 50))

#### splitting program to categories

In [None]:
categories = {
    'soutƒõ≈æe': [ 
        'Animekv√≠z', 'Cosplay debut', 'Cosplay soutƒõ≈æ', 'Cosplay video', 'Festdance', 'Soutƒõ≈æn√≠ AMV', 'Vyhl√°≈°en√≠ v√Ωsledk≈Ø', 
    ], 
    'p≈ôedn√°≈°ky': [
        '12 ran cosplayerov√Ωch', '2. svƒõtov√° v√°lka z pohledu Japonska a ≈æivot c√≠sa≈ôe Hirohita', '3D tisk v cosplayi: Od modelov√°n√≠ po barven√≠', 'Alternativn√≠ m√≥da: Od pono≈æky po klobouk', 'An Introduction to Leather Crafting', 
        'Anime svƒõty, ve kter√Ωch (ne)chcete ≈æ√≠t', 'BJD od Ludv√≠ka XIV. a≈æ po BTS', 'Bojuj podle sv√©ho charakteru', 'Cosplay and Otaku History in Japan', 'Cosplay armor: From reference to wearable costume', 
        'Crossdressing - the hell is dat?', 'Designing Male Characters Fanservice: The Struggle', 'Do Japonska za idoly', 'Doll Photography', 'Fake Is Sad/Bootleg Panel', 'Filmov√≠ skladatel√© Japonska ƒç. 2', 
        'Gaƒça hry aneb Lootboxy na asijsk√Ω zp≈Øsob', 'Gej≈°a: Tajemstv√≠ za stƒõnami ƒçajovny', 'Ghibli zn√°m√© nezn√°m√©', 'Godzilla, just a rubber monster, right?', 'Gothic ‚Äì hudba, m√≥da, subkultura', 
        'Historick√° p≈ôesnost zbroj√≠ v anime na vybran√Ωch p≈ô√≠kladech', 'Indon√©sie ‚Äì zemƒõ ohnƒõ', 'Jak na anime make-up', 'Jak na psan√≠ ‚Äì Od teorie k praxi', 'Jak se p≈ôipravit na cosplay focen√≠', 'Jak sehnat (nejen) jaoi v totalitn√≠m st√°tu', 
        'Jak si plnit sny pomoc√≠ anime', 'Japanacorps, WTF?!', 'Japonsk√Ω rok v emod≈æi', 'Letem jin√Ωm svƒõtem', 'Lolitou ka≈æd√Ω den', 'L√≠ƒçen√≠ pro trdla od trdla', 'Madoka Magica a ti druz√≠, co selhali', 'Manga novinky u n√°s!', 
        'Manga pro dosp√≠vaj√≠c√≠ d√≠vky: Co na n√≠ milujeme a nesn√°≈°√≠me', 'Maul Cosplay Q&A', 'My Hero Academy', 'M√≥dn√≠ p≈ôehl√≠dka 101', 'Nebojte se zahraniƒçn√≠ch soutƒõ≈æ√≠!', 'Nov√Ω kresl√≠≈ô na sc√©nƒõ: umƒõl√° inteligence!', 
        'Oƒçek√°v√°n√≠ vs. realita: ƒålovƒõk 2.0, mecha a vesm√≠rn√© lodƒõ', 'Pen & Paper role-playing hry', 'Proƒç (ne)jet do Ji≈æn√≠ Koreje', 'Recept√°≈ô tetiƒçky Mitsu nejen pro zaƒç√≠naj√≠c√≠ lolity', 'Remaky anime', 'Reputace & renesance D&D', 
        'Resident Evil ‚Äì Od zrozen√≠ po souƒçasnost', 'R√°men po ƒçesku', 'Sekiro, Nioh a ti dal≈°√≠‚Ä¶', 'Slasti a strasti japonsk√©ho randƒõn√≠', 'Slavnostn√≠ zah√°jen√≠ + k≈ôest Vƒõj√≠≈ôe', 'Stereotyp ‚Äì N√ÅMITKA!', 'Svƒõt japonsk√Ωch idol≈Ø', 
        'Takov√° norm√°ln√≠ japonsk√° rodinka', 'Viktori√°nsk√© √∫pravy vlas≈Ø', 'World cosplay panel', 'Zakonƒçen√≠ conu', 'Z√°ke≈ôn√© vody videohern√≠ho pir√°tstv√≠', 
    ], 
    'workshopy': [
        'Aikido Ikigai Dojo Brno ', 'Aikikai Aikido Brno ', 'Boj d√Ωkou a no≈æem', 'Boj tes√°kem', 'Choker', 'Cosplay Act Workshop', 'Yoshinkan aikido', 'Kaligrafie', 'Kendo', 'Kensei Dojo Brno ', 'Kimono workshop', 'Sraz AMV', 
        'H√°ƒçkov√°n√≠ ply≈°ov√Ωch zv√≠≈ô√°tek', 'I p√°d je posun vp≈ôed', 'Jak se p≈ôi cosplayi neztrapnit se zbran√≠', 'Jo, bokken, tanto aneb Obrana proti japonsk√Ωm zbran√≠m', 'Kvƒõtiny Ikebana', 'M√°k a jeho klacek aneb I monk si s hol√≠ leccos dovol√≠', 
        'N√°u≈°nice ze sklenƒõn√Ωch kapek', 'Od z√°klad≈Ø po pokroƒçilej≈°√≠ retu≈°e nejen cosplay fotografie', 'Omalov√°nky', 'Origami Workshop', 'Rozcviƒçka pro dru≈æinku aneb Draƒç√≠ doupƒõ po r√°nu', 'Ryt√≠≈ô, meƒç a ≈°t√≠t aneb L√≠n√Ω ≈°erm', 
        'R≈Ø≈æe ze sat√©nov√Ωch stu≈æek', 'Sebeobrana nejen pro d√°my', 'Sebeobrana po japonsku', 'Taneƒçn√≠ k-pop workshop', 'Vy≈°√≠v√°n√≠', 'V√°leƒçn√≠k mazlivƒõ obouruƒç√°kem tƒõ mlaskne', 'V√Ωroba vlastn√≠ho di√°≈ôe/skic√°ku', 
        'Workshop animace v Clip Studio Paint', 'Z√°klady sv√≠cen√≠ s profesion√°ln√≠mi z√°blesky a jejich modifik√°tory'
    ], 
    'prom√≠t√°n√≠': [
        '3D holka', 'A to je ≈°patnƒõ, kdy≈æ se chod√≠m seznamovat do kobek? Orion≈Øv ≈°√≠p', 'B≈Øh ≈æehnej tomuto n√°dhern√©mu svƒõtu', 'Fashion Mix', 'Gamers!', 'J√° mal√Ω ƒçarodƒõjka', 'Kvƒõtina zasl√≠ben√° rann√≠mu louƒçen√≠', 'Lolita Music Mix: Novinky', 
        'Mirai, d√≠vka z budoucnosti', 'M√≥dn√≠ kolekce 2019', 'Noc je kr√°tk√°, tak kr√°ƒçej d√°l, dƒõvƒçe', 'Penguin Highway', 'Pohod√°≈ôky', 'ReLIFE', 'Symfonie z jin√©ho svƒõta', 'Tipy pro ≈æivot mimo realitu', 'Vzestup hrdiny ≈°t√≠tu', 
        'Zombie Land Saga', 
    ],
    'dopl≈àkov√Ω program': [
        'AMV Bud√≠ƒçek', 'AMV Mortal Combat', 'AMV Noƒçn√≠k', 'AMV Veƒçern√≠ƒçek', 'BTS Army sraz', 'Vƒõj√≠≈ô ‚Äì autogrami√°da a beseda', 'Anime novinky oƒçima prot≈ôel√Ωch fanou≈°k≈Ø', 'Budoucnost ƒçesk√Ωch anime con≈Ø', 
        'K-pop ‚Äì random dance CZHW', 'Sƒç√≠t√°n√≠ ≈æije!', 'Turnaj v ≈°ermu Ar√©na Alerie', 'Draƒç√≠ doupƒõ', 
    ],
    'divadlo': ['Festovn√≠ koncert', 'Kamui ‚Äì The Samurai Sword Artists', 'Lolita Fashion Show', 'Mal√© divadlo kj√≥genu', ],
    }

In [None]:
# Programme name -> category. It is built here from `categories` because this cell is
# the first one to need it; without this line the cell fails with NameError on a fresh
# kernel run from top to bottom.
col_to_category = {col: category for category, cols in categories.items() for col in cols}

# Attendance per programme: the larger count of its two rating columns. Rows are grouped
# by their suffix-free name (replaces the deprecated `groupby(..., axis=1)`).
program_attendance = program_attends.groupby(program_attends.index.map(remove_suffix)).max()['Z√∫ƒçastnil(a) jsem se'].to_frame()
program_attendance['type'] = program_attendance.index.to_series().map(col_to_category)

# Sort once and take bars and colours from the same frame so they stay aligned.
sorted_attendance = program_attendance.sort_values(by='Z√∫ƒçastnil(a) jsem se', ascending=True)
ax = sorted_attendance['Z√∫ƒçastnil(a) jsem se'].plot.barh(figsize=(30, 50), color=sorted_attendance['type'].map(type_to_color))
# write the attendance number next to every bar
for p in ax.patches:
    ax.annotate("%.0f" % p.get_width(), patch_label_coords(p), xytext=(5, 10), textcoords='offset points')

In [None]:
# Invert `categories` (category -> programme names) into programme name -> category.
col_to_category = {}
for category_name, members in categories.items():
    col_to_category.update(dict.fromkeys(members, category_name))
col_to_category

In [None]:
# Attach the category of every programme (NaN when the name is not in the mapping).
program_attendance = program_attendance.assign(
    type=lambda attendance: attendance.index.to_series().map(col_to_category)
)
program_attendance.head()

In [None]:
# Total attendance per programme category.
attendance_by_type = program_attendance.groupby('type')['Z√∫ƒçastnil(a) jsem se']
attendance_by_type.sum().sort_values().plot.barh()

In [None]:
# Average attendance of a programme per category.
attendance_by_type = program_attendance.groupby('type')['Z√∫ƒçastnil(a) jsem se']
attendance_by_type.mean().sort_values().plot.barh()

In [None]:
# Number of programmes per category.
attendance_by_type = program_attendance.groupby('type')['Z√∫ƒçastnil(a) jsem se']
attendance_by_type.count().sort_values().plot.barh()

In [None]:
# Distribution of per-programme attendance within each category.
ax = sns.violinplot(x='type', y='Z√∫ƒçastnil(a) jsem se', data=program_attendance, inner='stick', bw=.3)
ax.set_title('n√°v≈°tƒõvnost')
plt.xticks(rotation=15)

In [None]:
# Numeric encodings of the categorical answers so they can be put on numeric axes.
rating_to_num = {'≈°patn√©': -1, 'v pohodƒõ': 0, 'dobr√©': 1}
fill_to_num = {'chci hodnotit struƒçnƒõ': -1, 'chci hodnotit podrobnƒõ (je toho opravdu hodnƒõ, ƒç√≠m v√≠c vypln√≠te, t√≠m vdƒõƒçnƒõj≈°√≠ budeme)': 1}
app_to_num = {'Ne': -1, 'Ne, nevƒõdel sem o n√≠': -1, 'Ne, z jin√©ho d≈Øvodu': -1, 'Ne, nem√°m podporovan√© za≈ô√≠zen√≠': -1, 'Ne, nedok√°zal jsem ji spustit': -1, 'Ne, nezaj√≠m√° mƒõ': -1, np.nan: 0, 'Ano': 1}
bool_to_num = {'Ne': -1, np.nan: 0, 'Ano': 1}
age_to_num = {'13-15': 1, '16-18': 2, '19-25': 3, '26 a v√≠ce': 4}

#sns.pairplot(df_form[cols_to_plot], hue='Pohlav√≠')
# No plt.figure() call here: pairplot creates its own figure, so a figure opened
# beforehand would only be rendered as an extra, empty one.
g = sns.pairplot(df_form[['Vƒõk', 'Pohlav√≠', 'Jak podrobnƒõ chcete hodnotit?', 'P≈ôipad√° v√°m AF t√©ma pro z√°≈æitek na conu d≈Øle≈æit√©?', 'Pou≈æ√≠vali jste Animefest aplikaci?']].replace({
    'Vƒõk': age_to_num, 'Jak podrobnƒõ chcete hodnotit?': fill_to_num, 'P≈ôipad√° v√°m AF t√©ma pro z√°≈æitek na conu d≈Øle≈æit√©?': bool_to_num, 'Pou≈æ√≠vali jste Animefest aplikaci?': app_to_num
}), hue='Pohlav√≠')
# Tilt the (long) axis labels so neighbouring panels do not overlap. Plain loops are
# used because the calls are made for their side effect only.
for ax in g.axes.flat:
    plt.setp(ax.xaxis.get_label(), rotation=15)
    plt.setp(ax.yaxis.get_label(), rotation=75)


In [None]:
# Rating counts of the "fun" question: one row per rating column, one column per rating;
# missing counts become 0.
fun_rating_columns = program_columns + fun_suffix
fun_counts = program_ratings[fun_rating_columns].loc[['dobr√©', '≈°patn√©', 'v pohodƒõ']]
program_ratings_fun = fun_counts.T.fillna(0)
program_ratings_fun.head()

In [None]:
# Rating counts of the "information" question: one row per rating column, one column per
# rating; missing counts become 0.
info_rating_columns = program_columns + info_suffix
info_counts = program_ratings[info_rating_columns].loc[['dobr√©', '≈°patn√©', 'v pohodƒõ']]
program_ratings_info = info_counts.T.fillna(0)
program_ratings_info.head()

In [None]:
# "Fun" score per programme: (good - bad) / number of ratings, i.e. the mean of the
# ratings with good = +1, fine = 0 and bad = -1.
fun_balance = program_ratings_fun['dobr√©'] - program_ratings_fun['≈°patn√©']
program_fun_score = fun_balance / program_ratings_fun.sum(axis=1)
# index by the bare programme name
program_fun_score.index = program_fun_score.index.str.replace(fun_suffix, '', regex=False)
program_fun_score.head()

In [None]:
# "Information" score per programme: (good - bad) / number of ratings, i.e. the mean of
# the ratings with good = +1, fine = 0 and bad = -1.
info_balance = program_ratings_info['dobr√©'] - program_ratings_info['≈°patn√©']
program_info_score = info_balance / program_ratings_info.sum(axis=1)
# index by the bare programme name
program_info_score.index = program_info_score.index.str.replace(info_suffix, '', regex=False)
program_info_score.head()


In [None]:
# Turn the score Series into a frame with a 'score' column and add the programme category.
# NOTE: this rebinds `program_fun_score` from a Series to a DataFrame.
program_fun_score = (
    program_fun_score
    .to_frame('score')
    .assign(type=lambda scores: scores.index.to_series().map(col_to_category))
)
program_fun_score.head()

In [None]:
# Turn the score Series into a frame with a 'score' column and add the programme category.
# NOTE: this rebinds `program_info_score` from a Series to a DataFrame.
program_info_score = (
    program_info_score
    .to_frame('score')
    .assign(type=lambda scores: scores.index.to_series().map(col_to_category))
)
program_info_score.head()

In [None]:
# Programmes ranked by "fun" score, bars coloured by category, score written next to each bar.
sorted_score = program_fun_score.sort_values(by='score', ascending=True)
bar_colors = sorted_score['type'].map(type_to_color)
ax = sorted_score['score'].plot.barh(figsize=(30, 50), color=bar_colors)
for bar in ax.patches:
    ax.annotate('{:.2f}'.format(bar.get_width()), patch_label_coords(bar), xytext=(5, 10), textcoords='offset points')

In [None]:
# Programmes ranked by "information" score, bars coloured by category, score written next to each bar.
sorted_score = program_info_score.sort_values(by='score', ascending=True)
bar_colors = sorted_score['type'].map(type_to_color)
ax = sorted_score['score'].plot.barh(figsize=(30, 50), color=bar_colors)
for bar in ax.patches:
    ax.annotate('{:.2f}'.format(bar.get_width()), patch_label_coords(bar), xytext=(5, 10), textcoords='offset points')

In [None]:
# Distribution of the "fun" scores within each programme category.
ax = sns.violinplot(x='type', y='score', data=program_fun_score, inner='stick', bw=.3)
ax.set_title('z√°bavnost')
plt.xticks(rotation=15)

In [None]:
# Distribution of the "information" scores within each programme category.
ax = sns.violinplot(x='type', y='score', data=program_info_score, inner='stick', bw=.3)
ax.set_title('informovanost')
plt.xticks(rotation=15)

In [None]:
### can't see number of them, need parallel sets

In [None]:
# Distinct answers of every low-cardinality column selected for plotting.
{col: df_form[col].unique() for col in cols_to_plot}

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Parallel-categories demo on plotly's public Titanic sample data (downloaded on every
# run). It does not use the feedback data.
titanic_df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/titanic.csv")

# One dimension (vertical axis of the diagram) per categorical variable
class_dim = go.parcats.Dimension(values=titanic_df.Pclass, categoryorder='category ascending', label="Class")
gender_dim = go.parcats.Dimension(values=titanic_df.Sex, label="Gender")
survival_dim = go.parcats.Dimension(
    values=titanic_df.Survived,
    label="Outcome",
    categoryarray=[0, 1],
    ticktext=['perished', 'survived'],
)

# Ribbons are coloured by outcome: 0 -> lightsteelblue, 1 -> mediumseagreen
color = titanic_df.Survived
colorscale = [[0, 'lightsteelblue'], [1, 'mediumseagreen']]

parcats_trace = go.Parcats(
    dimensions=[class_dim, gender_dim, survival_dim],
    line={'color': color, 'colorscale': colorscale},
    hoveron='color',
    hoverinfo='count+probability',
    labelfont={'size': 18, 'family': 'Times'},
    tickfont={'size': 16, 'family': 'Times'},
    arrangement='freeform',
)
fig = go.Figure(data=[parcats_trace])

fig.show()

### analysis of open answers

In [None]:
from urllib import request

# Czech stop words from the stopwords-iso project, one word per line. The response is
# read inside a `with` block so the connection is closed once the list is built.
with request.urlopen('https://raw.githubusercontent.com/stopwords-iso/stopwords-cs/master/stopwords-cs.txt') as data:
    all_stopwords = [line.decode('utf-8').rstrip('\r\n') for line in data]
# todo: split them correctly and for vocalulary calculation use only emotes filtering
# The regex-escaped entries are kept with their original values, now written as raw
# strings because '\?', '\^' and '\+' are invalid escape sequences in normal literals.
# Stop words are matched by exact comparison, not as regexes, so the plain forms
# ('?', '^^', '+') are added as well; otherwise those tokens are never filtered.
emotes_stopwords = [':d', ':', r'\?', ';', ':-d', r'\^\^', ':dd', 'xd', ':3', '=d', ':p',  ':/', ':>', r'\+', 'o_o', 'd:', '>', ':\'d', ':"d', '\n', ':dd', ':ddd', ':dddd', 'x', 'xp', '?', '^^', '+']
my_stopwords = ['no', '', 'jo', 'sa', 'the', '=d', 'fakt', 'celkem', 'sem', 'ja', 'som', 'ako', 'of', 'nekdo', 'bych', 'sou', '2', 'jj', 'is', 'mam', '3', '\n']
all_stopwords += my_stopwords
all_stopwords += emotes_stopwords

def messages_to_words_hist(mess_series):
    """Count word frequencies over a Series of free-text messages.

    Missing messages are treated as empty strings. Messages are split on single
    spaces, words are lower-cased and stripped of the surrounding punctuation
    characters listed below, and every word found in the module-level
    `all_stopwords` list is removed from the result.

    Returns a Series indexed by word holding the number of occurrences, in
    `value_counts` order (most frequent first).
    """
    messages = mess_series.fillna('')
    # one row per word; the word keeps the index label of the message it came from
    words = messages.str.split(' ', expand=True).stack().reset_index(drop=True, level=1)
    normalized = words.str.lower().str.strip('.,?-‚Äì()')
    histogram = normalized.value_counts()
    is_stopword = histogram.index.isin(all_stopwords)
    return histogram[~is_stopword]


In [None]:
# 30 most frequent (non-stop) words in the answers to this open question.
question = 'S ƒç√≠m jste byli letos nejv√≠ce spokojeni / co se v√°m nejv√≠ce l√≠bilo?'
best_mess = messages_to_words_hist(df_form[question])
best_mess.iloc[:30].plot.bar(figsize=(30, 10), title=question)
plt.xticks(rotation=30)

In [None]:
# Full text of the answers to this question that mention 'p≈ôedn√°≈°ky'.
question = 'S ƒç√≠m jste byli letos nejv√≠ce spokojeni / co se v√°m nejv√≠ce l√≠bilo?'
with pd.option_context('display.max_colwidth', 150):
    # na=False: unanswered rows would otherwise put NaN into the mask, and a mask
    # containing NaN cannot be used for boolean indexing
    mentions = df_form[question].str.contains('p≈ôedn√°≈°ky', regex=False, na=False)
    display(df_form.loc[mentions, question])

In [None]:
# 30 most frequent (non-stop) words in the answers to this open question.
# NOTE: the result is stored under the name `best_mess`, as it was before.
question = 'Co v√°s letos nejv√≠c zklamalo / nel√≠bilo se v√°m?'
best_mess = messages_to_words_hist(df_form[question])
best_mess.iloc[:30].plot.bar(figsize=(30, 10), title=question)
plt.xticks(rotation=30)

In [None]:
# Full text of the answers to this question that mention 'p≈ôedn√°≈°ky'.
question = 'Co v√°s letos nejv√≠c zklamalo / nel√≠bilo se v√°m?'
with pd.option_context('display.max_colwidth', 150):
    # na=False: unanswered rows would otherwise put NaN into the mask, and a mask
    # containing NaN cannot be used for boolean indexing
    mentions = df_form[question].str.contains('p≈ôedn√°≈°ky', regex=False, na=False)
    display(df_form.loc[mentions, question])

In [None]:
# 30 most frequent (non-stop) words in the answers to this open question.
# NOTE: the result is stored under the name `best_mess`, as it was before.
question = 'Co bychom podle v√°s mƒõli do p≈ô√≠≈°t√≠ho roku zlep≈°it?'
best_mess = messages_to_words_hist(df_form[question])
best_mess.iloc[:30].plot.bar(figsize=(30, 10), title=question)
plt.xticks(rotation=30)

In [None]:
# Same question, restricted to the 7 most frequent (non-stop) words.
question = 'Co bychom podle v√°s mƒõli do p≈ô√≠≈°t√≠ho roku zlep≈°it?'
best_mess = messages_to_words_hist(df_form[question])
best_mess.iloc[:7].plot.bar(figsize=(30, 10), title=question)
plt.xticks(rotation=30)