# Python for Data Science Introduction

This tutorial was presented at the Jackson (Mississippi) FreeCodeCamp meeting in June 2019.

Data was retrieved from the [open data portal](https://open.jacksonms.gov/dataset/events-in-the-city1) for the Jackson, MS city government.

In [42]:
import json
import urllib.request

import pandas as pd

data_url = "https://open.jacksonms.gov/datastore/dump/2fd76604-4378-4e19-9f0f-1bf220825c97?format=json"

# Download and parse the JSON dump. The context manager closes the HTTP
# response even if reading or parsing raises.
with urllib.request.urlopen(data_url) as fileobj:
    response_dict = json.load(fileobj)

# Column names come from the "id" of each entry in the payload's "fields" list
columns = [f["id"] for f in response_dict["fields"]]

# Build the DataFrame from the "records" list, using the column order above
df = pd.DataFrame(response_dict['records'], columns=columns)

# Display the first ten rows of the DataFrame
print(df.head(10))

   _id  Date                       Leasee  \
0    1  2009         Gemini Entertainment   
1    2  2009              W. Kessler, LTD   
2    3  2009         Mississippi Symphony   
3    4  2009  Mississippi Music Educators   
4    5  2009              W. Kessler, LTD   
5    6  2009         The Production Group   
6    7  2010           Ballet Mississippi   
7    8  2010        Finesse Entertainment   
8    9  2010            Ballet Magnificat   
9   10  2010             Belhaven College   

                                       Name of Event        Place  \
0  R&B Concert featuring Tre' Williams, Calvin Ri...  Thalia Mara   
1                                            Rave On  Thalia Mara   
2                                Beethoven, The Hero  Thalia Mara   
3                                  Mississippi Sings  Thalia Mara   
4                                            Camelot  Thalia Mara   
5                                Arnez J Comedy Show  Thalia Mara   
6                    

## Clean the Data
Remove any rows that contain `NA` values.

In [40]:
from pathlib import Path

# Remove rows with any NaN values; .copy() gives an independent frame that
# later cells can modify without touching `df`
clean_df = df.dropna().copy()

print(clean_df.head())

# Write the cleaned data frame to csv. Create the output directory first so
# the export also works when `data/` does not exist yet.
clean_csv_path = Path('data/clean_data.csv')
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
clean_df.to_csv(clean_csv_path, index=False)

   _id  Date                       Leasee  \
0    1  2009         Gemini Entertainment   
1    2  2009              W. Kessler, LTD   
2    3  2009         Mississippi Symphony   
3    4  2009  Mississippi Music Educators   
4    5  2009              W. Kessler, LTD   

                                       Name of Event        Place      Type  \
0  R&B Concert featuring Tre' Williams, Calvin Ri...  Thalia Mara   Concert   
1                                            Rave On  Thalia Mara   Concert   
2                                Beethoven, The Hero  Thalia Mara  Symphony   
3                                  Mississippi Sings  Thalia Mara    Chorus   
4                                            Camelot  Thalia Mara   Musical   

     Revenue Attendance  
0  2550.0000       1000  
1  5100.0000       3800  
2  1150.0000       1300  
3   900.0000       4000  
4  5100.0000       4200  


In [17]:
# Coerce 'Attendance' to a numeric dtype; values that cannot be parsed become NaN
attendance_numeric = pd.to_numeric(clean_df['Attendance'], errors='coerce')
clean_df['Attendance'] = attendance_numeric

# Median attendance (last expression, so it is shown as the cell output)
clean_df['Attendance'].median()

1600.0

## Load Bokeh to Visualize Data

In [48]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool

from bokeh.palettes import Set3_10
from bokeh.transform import factor_cmap
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

# Render Bokeh plots inline in the notebook.
# To save a plot as a standalone HTML file instead, add
# `from bokeh.plotting import output_file` and call
# output_file("revenue_barplot.html").
output_notebook()

# Revenue must be a float column before it can be summed
revenue_as_float = clean_df['Revenue'].astype(float)
clean_df['Revenue'] = revenue_as_float

# Total revenue for each value of 'Date'
grouped = clean_df['Revenue'].groupby(clean_df['Date']).sum()

print(grouped)

Date
2009     16050.00
2010     93638.49
2011     92237.32
2012     75251.00
2013     90547.43
2014     67016.00
2015    155918.31
2016    214761.20
2017    130920.18
2018    141346.25
Name: Revenue, dtype: float64


## Generate a Bar Plot Showing Jackson Event Revenue by Year

In [49]:
# Flatten the grouped Series into a two-column frame ('Date', 'Revenue').
revenue_by_year = grouped.reset_index()

# A categorical x_range takes string factors, so cast the years explicitly
# (the stacked bar plot cell does the same before building its x_range).
revenue_by_year['Date'] = revenue_by_year['Date'].astype(str)

source = ColumnDataSource(revenue_by_year)
dates = revenue_by_year['Date'].tolist()
p = figure(x_range=dates)

# One colour per year, looked up from the 'Date' column of the source
color_map = factor_cmap(field_name='Date', palette=Set3_10, factors=dates)

p.vbar(x='Date', top='Revenue', source=source, width=0.70, color=color_map)

p.title.text ='Jackson Event Revenue by Year'
p.xaxis.axis_label = 'Year'
p.yaxis.axis_label = 'Revenue'

show(p)

## Generate a Stacked Bar Plot

In [65]:
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Legend, LegendItem, Div
from bokeh.layouts import row, column
from colorsys import hsv_to_rgb

def generate_palette(n):
    """Return ``n`` hex colour strings with evenly spaced hues.

    Each colour is built in HSV space with hue ``i / n`` (for ``i`` in
    ``range(n)``), saturation 0.7 and value 0.9, converted to RGB, and
    formatted as ``'#rrggbb'``. Channel values are truncated, not rounded,
    when scaled to the 0-255 range. ``n == 0`` yields an empty list.
    """
    colors = []
    for step in range(n):
        red, green, blue = hsv_to_rgb(step / n, 0.7, 0.9)
        channels = (int(red * 255), int(green * 255), int(blue * 255))
        colors.append('#{:02x}{:02x}{:02x}'.format(*channels))
    return colors

# Revenue has to be a float column before it can be aggregated
clean_df['Revenue'] = clean_df['Revenue'].astype(float)

# One row per 'Date', one column per 'Type'; each cell holds the summed
# revenue, with 0 where a Date/Type combination has no rows
pivot = pd.pivot_table(
    clean_df,
    values='Revenue',
    index='Date',
    columns='Type',
    aggfunc='sum',
    fill_value=0,
).reset_index()
pivot['Date'] = pivot['Date'].astype(str)

# Plot source plus the category lists used by the figure:
# `years` are the x-axis factors, `types` are the stacked columns
# (every pivot column after the leading 'Date' column)
source = ColumnDataSource(pivot)
years = list(pivot['Date'])
types = list(pivot.columns[1:])

# One custom colour per event type
palette = generate_palette(len(types))
stack_colors = palette[:len(types)]

# Categorical figure with one x position per year
p = figure(x_range=years, title="Stacked Revenue by Event Type (Custom Palette)", height=400, width=900)

# Label both axes
p.yaxis.axis_label = "Revenue"
p.xaxis.axis_label = "Year"

# Stack one bar segment per event type at each year, coloured from the palette
renderers = p.vbar_stack(stackers=types, x='Date', width=0.8, color=stack_colors, source=source)

# Custom legend drawn in its own figure, shown beside the main plot.
# The y-range has one unit-high row per event type.
n_types = len(types)
legend_fig = figure(
    toolbar_location=None,
    height=400,
    width=250,
    title="Event Type Legend",
    outline_line_color=None,
    min_border=0,
    x_range=(0, 1),
    y_range=(0, n_types),
)

legend_fig.axis.visible = False
legend_fig.grid.visible = False

# Draw one coloured swatch and label per event type, first type at the top.
# rect() and text() are positioned by their centre, so each row is centred at
# k + 0.5 inside its [k, k + 1] band; centring on the integer k would push
# the bottom swatch half outside y_range=(0, n_types).
for i, (label, color) in enumerate(zip(types, stack_colors)):
    row_center = n_types - i - 0.5
    legend_fig.rect(
        x=0.1, y=row_center, width=0.1, height=0.8,
        color=color
    )
    legend_fig.text(
        x=0.25, y=row_center, text=[label],
        text_font_size="8pt", text_baseline="middle"
    )

# Layout: main plot on the left, legend on the right
layout = row(p, legend_fig)
show(layout)