In [125]:
import numpy as np
import pandas as pd
import math
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import locale
locale.setlocale(locale.LC_ALL, '')
%matplotlib inline

# Bokeh is cool.
from bokeh.io import show,curdoc, output_notebook, export_png, export_svgs #output_file
from bokeh.models import Quad, FactorRange, LabelSet
from bokeh.layouts import row, layout,widgetbox
from bokeh.models.widgets import Select,MultiSelect
from bokeh.plotting import ColumnDataSource,figure,reset_output,gridplot
from bokeh.palettes import Spectral6

# Clear any Bokeh output mode left over from a previous run, then direct
# subsequent show() calls to render inline in the notebook.
reset_output()
output_notebook()


import pickle

# Matplotlib defaults: keep text editable (not converted to outlines) in
# exported PDF/PS/SVG figures, and let figures lay themselves out automatically.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'svg.fonttype': 'none',
    'figure.autolayout': True,
})

# Load the preprocessed responses plus the column metadata pickled alongside them.
df = pd.read_csv('./export_csv/processed_output_without_column_names.csv', encoding='utf8')
# NOTE: pickle.load can execute arbitrary code; only open a SavedVariables.pckl you trust.
with open('SavedVariables.pckl', 'rb') as f:
    (continous_cols, cate_cols, cate_cols_v2,
     pipe_col, col_dict, pipe_col_dict) = pickle.load(f)


In [92]:
# Column of `df` behind the 'AIGA' option of question V_32
# (presumably a 0/1 membership flag -- verify against the preprocessing step).
AIGA_col = pipe_col_dict['V_32']['AIGA']

# Summing a boolean mask counts the rows where the flag equals the given value.
print('The total number of AIGA responses: ' + str((df[AIGA_col] == 1).sum()))
print('The total number of Non-AIGA responses: ' + str((df[AIGA_col] == 0).sum()))

The total number of AIGA responses: 4638
The total number of Non-AIGA responses: 7839


In [3]:
def summary_cate_col(col, AllorAIGA = 'All'): #default is all records
    """Summarise one categorical column as a table of options, counts and shares.

    Relies on the notebook-level objects ``df``, ``continous_cols``,
    ``pipe_col`` and ``pipe_col_dict``.

    Parameters
    ----------
    col : str
        Name of the column to summarise.
    AllorAIGA : str, default 'All'
        Which rows to include: ``'All'`` keeps every row, ``'NotAIGA'`` keeps
        rows whose AIGA flag equals 0, and any other value keeps rows whose
        AIGA flag equals 1.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``Options``, ``Count`` and ``Percent`` (``Count`` divided by
        the number of rows kept), sorted by ``Percent`` in descending order.
        ``None`` when ``col`` is listed in ``continous_cols``.
    """
    AIGA_col = pipe_col_dict['V_32']['AIGA']
    if AllorAIGA == 'All':
        df_filtered = df
    elif AllorAIGA == "NotAIGA":
        df_filtered = df[df[AIGA_col] == 0]
    else:
        df_filtered = df[df[AIGA_col] == 1]

    # Continuous columns have no option table.
    if col in continous_cols:
        return None

    if col in pipe_col:
        # pipe_col_dict[col] maps each option label to the name of a column in
        # df; that column is summed to get the option's count (presumably a
        # 0/1 indicator -- verify against the preprocessing step).
        # The frame is built once from a list of records: DataFrame.append was
        # removed in pandas 2.0 and copied the whole frame on every call.
        records = [
            {'Options': option, 'Count': df_filtered[indicator_col].sum()}
            for option, indicator_col in pipe_col_dict[col].items()
        ]
        dff = pd.DataFrame(records, columns=['Options', 'Count'])
    else:
        # Renaming by position works regardless of how this pandas version
        # labels the value_counts() output.
        dff = df_filtered[col].value_counts().reset_index()
        dff.columns = ['Options', 'Count']

    dff['Percent'] = dff['Count'] / len(df_filtered)
    return dff.sort_values(by='Percent', ascending=False)


In [None]:
# Vertical bar plot

def plotVBar(dff, AllorAIGA, column=None):
    """Show a vertical Bokeh bar chart of respondent counts per option.

    Parameters
    ----------
    dff : pandas.DataFrame
        Summary table with ``Options`` and ``Count`` columns.
    AllorAIGA : str
        Label of the respondent subset; used as the title prefix.
    column : str, optional
        Key into ``col_dict`` used for the rest of the title. When omitted,
        the notebook-level variable ``col`` is used, as in the original
        implementation, so existing calls keep working.
    """
    if column is None:
        # Backward-compatible fallback: read the loop variable that the
        # driver cell leaves in the notebook namespace.
        column = col

    source = ColumnDataSource(dict(x=dff['Options'], y=dff['Count']))

    x_label = "Categories"
    y_label = "Respondents"
    title = AllorAIGA + ': ' + col_dict[column]

    plot = figure(plot_width=700, plot_height=500,
            x_axis_label = x_label,
            y_axis_label = y_label,
            title=title,
            x_range=FactorRange(factors=list(dff['Options']))
            )

    plot.vbar(source=source, x='x', top='y', bottom=0, width=0.3)
    show(plot)


# Horizontal bar graph
def plotHBar(dff, t, column=None):
    """Export a horizontal Bokeh bar chart of respondent counts as an SVG file.

    Each bar is annotated with its count and its percentage (two decimals).

    Parameters
    ----------
    dff : pandas.DataFrame
        Summary table with ``Options``, ``Count`` and ``Percent`` columns,
        where ``Percent`` is a fraction (it is multiplied by 100 for display).
        The frame is not modified.
    t : str
        Title prefix; also the first part of the output file name.
    column : str, optional
        Key into ``col_dict`` used for the title and file name. When omitted,
        the notebook-level variable ``col`` is used, as in the original
        implementation, so existing calls keep working.

    Returns
    -------
    None
        Nothing is exported when ``dff`` is ``None`` or has no rows.
    """
    # Nothing to draw (also avoids max() failing on an empty column).
    if dff is None or dff.empty:
        return

    if column is None:
        # Backward-compatible fallback: read the loop variable that the
        # driver cell leaves in the notebook namespace.
        column = col

    # Percentage labels are kept in a local list so the caller's numeric
    # 'Percent' column is not overwritten with strings.
    pct_text = ["{0:.2f}%".format(val * 100) for val in dff['Percent']]
    source = ColumnDataSource(dict(x=dff['Count'], y=dff['Options'], pct=pct_text))
    x_label = "Respondents"
    y_label = "Categories"
    title = t + ': ' + col_dict[column]

    # Constant width; height grows with the number of options on the Y axis.
    h = 100 * dff['Options'].shape[0]

    # NOTE(review): the original referenced an undefined global `max_count`
    # in the file name; it is assumed to be the largest count -- confirm.
    max_count = max(dff['Count'])
    # Leave head-room to the right of the longest bar for the text labels.
    maxX = 50 + max_count
    maxX += .2*maxX

    plot = figure(plot_width=1400, plot_height=h,
            x_axis_label = x_label,
            y_axis_label = y_label,
            title=title, x_range=(0, maxX),
            y_range=FactorRange(factors=list(dff['Options']))
            )
    plot.min_border_right = 50

    pctLabels = LabelSet(x='x', y='y', text='pct', text_font_size='8pt', level='glyph',
            x_offset=10, y_offset=2, source=source, render_mode='canvas')

    countLabels = LabelSet(x='x', y='y', text='x', text_font_size='8pt', level='glyph',
            x_offset=10, y_offset=-12, source=source, render_mode='canvas')

    plot.hbar(source=source, y='y', right='x', left=0, height=0.75)
    plot.add_layout(pctLabels)
    plot.add_layout(countLabels)
    plot.output_backend = "svg"

    export_svgs(plot, filename = t + '-' + col_dict[column] + str(max_count) + '.svg')


In [129]:
# Export one horizontal bar chart per categorical column, over all respondents.
# `col` has to remain a notebook-level name: plotHBar reads it to build the
# chart title and the output file name.
for col in cate_cols:
    # Skip any column whose name contains one of these question ids
    # (substring match, so 'V_8' also matches longer names containing it).
    if any(tag in col for tag in ('V_8', 'V_35', 'V_36')):
        continue

    dff = summary_cate_col(col, AllorAIGA='All')
    plotHBar(dff, 'All')

    # To chart a subset instead, pass AllorAIGA='NotAIGA' (rows with AIGA flag 0)
    # or any other value such as 'AIGA-Only' (rows with AIGA flag 1) to
    # summary_cate_col, together with a matching title prefix.