In [58]:
# filter/clean up data frame and write into filtered.csv
import pandas as pd
df = pd.read_csv('innocence_project_stats.csv')
col_names = list(df)
col_names

['Last Name',
 'First Name',
 'Age',
 'Race',
 'ST',
 'County of Crime',
 'Tags',
 'OM Tags',
 'Crime',
 'Sentence',
 'Convicted',
 'Exonerated',
 'DNA',
 'MWID',
 'FC',
 'P/FA',
 'F/MFE',
 'OM',
 'ILD']

In [59]:
# reorder and drop columns
df = df[['Last Name', 'First Name', 'Age', 'Race', 'Crime', 'Sentence', 'Convicted', 'Exonerated', 'DNA', "ST"]]
print(list(df))

['Last Name', 'First Name', 'Age', 'Race', 'Crime', 'Sentence', 'Convicted', 'Exonerated', 'DNA', 'ST']


In [60]:
# make 1 column for Name and remove the 2 name columns

df['Name'] = df.insert(0, 'Name', None)
df["Name"] = df["First Name"] + " " + df["Last Name"]
df = df.drop(columns=["First Name", "Last Name"])
df

Unnamed: 0,Name,Age,Race,Crime,Sentence,Convicted,Exonerated,DNA,ST
0,Joseph Abbitt,31.0,Black,Child Sex Abuse,Life,1995,2009,DNA,NC
1,Cinque Abbott,19.0,Black,Drug Possession or Sale,Probation,2008,2022,,IL
2,Warith Habib Abdal,43.0,Black,Sexual Assault,20 to Life,1983,1999,DNA,NY
3,Christopher Abernathy,17.0,White,Murder,Life without parole,1987,2015,DNA,IL
4,Quentin Abney,32.0,Black,Robbery,20 to Life,2006,2012,,NY
...,...,...,...,...,...,...,...,...,...
3321,Richard Zawacki,38.0,White,Child Sex Abuse,4 years,2000,2001,,IN
3322,Walter Zimmer,40.0,White,Manslaughter,50 years,1998,2011,DNA,OH
3323,Evan Zimmerman,53.0,White,Murder,Life,2001,2005,DNA,WI
3324,Tyrone Zinkiewicz,38.0,White,Other Nonviolent Felony,5 to 15 years,1988,1992,,OH


In [61]:
# make Age an int value (why float?)
df["Age"] = df["Age"].astype("Int8")

In [62]:
# Flag those with sentences containing Life

# new column for Life sentences (True/False)
df['Life'] = df.insert(5, 'Life', None)
print(list(df))

['Name', 'Age', 'Race', 'Crime', 'Sentence', 'Life', 'Convicted', 'Exonerated', 'DNA', 'ST']


In [63]:
df[df["ST"] == "L"]

Unnamed: 0,Name,Age,Race,Crime,Sentence,Life,Convicted,Exonerated,DNA,ST


In [64]:

for i,row in df.iterrows(): 
    #print (i, list(row))  # DEBUG: row has values for the row with the index i
    s = row["Sentence"] # sentence
    if 'life' in s.lower():
        df.loc[i, "Life"] = True   
    else:
        df.loc[i, "Life"] = False

df

Unnamed: 0,Name,Age,Race,Crime,Sentence,Life,Convicted,Exonerated,DNA,ST
0,Joseph Abbitt,31,Black,Child Sex Abuse,Life,True,1995,2009,DNA,NC
1,Cinque Abbott,19,Black,Drug Possession or Sale,Probation,False,2008,2022,,IL
2,Warith Habib Abdal,43,Black,Sexual Assault,20 to Life,True,1983,1999,DNA,NY
3,Christopher Abernathy,17,White,Murder,Life without parole,True,1987,2015,DNA,IL
4,Quentin Abney,32,Black,Robbery,20 to Life,True,2006,2012,,NY
...,...,...,...,...,...,...,...,...,...,...
3321,Richard Zawacki,38,White,Child Sex Abuse,4 years,False,2000,2001,,IN
3322,Walter Zimmer,40,White,Manslaughter,50 years,False,1998,2011,DNA,OH
3323,Evan Zimmerman,53,White,Murder,Life,True,2001,2005,DNA,WI
3324,Tyrone Zinkiewicz,38,White,Other Nonviolent Felony,5 to 15 years,False,1988,1992,,OH


In [65]:
# drop those with Life == "False"
df = df.drop(df[df["Life"] == False].index)
df

Unnamed: 0,Name,Age,Race,Crime,Sentence,Life,Convicted,Exonerated,DNA,ST
0,Joseph Abbitt,31,Black,Child Sex Abuse,Life,True,1995,2009,DNA,NC
2,Warith Habib Abdal,43,Black,Sexual Assault,20 to Life,True,1983,1999,DNA,NY
3,Christopher Abernathy,17,White,Murder,Life without parole,True,1987,2015,DNA,IL
4,Quentin Abney,32,Black,Robbery,20 to Life,True,2006,2012,,NY
11,Don Ray Adams,32,Black,Murder,Life,True,1992,2011,,PA
...,...,...,...,...,...,...,...,...,...,...
3306,Anthony Yarbough,18,Black,Murder,75 to life,True,1994,2014,DNA,NY
3310,Kenneth York,38,White,Sexual Assault,Life without parole,True,1994,2010,DNA,MO
3313,Paul Young,17,Black,Murder,Life without parole,True,1987,2021,,MI
3314,"Dan Young, Jr.",30,Black,Murder,Life without parole,True,1994,2005,DNA,IL


In [66]:
# Now also drop Life column as it's no longer needed
df = df.drop("Life", axis=1)

In [76]:
# drop entries w/o state entries
df = df.drop(df[df["ST"] == ""].index)

# substitute all the F- states
for i,row in df.iterrows(): 
    #print (i, list(row))  # DEBUG: row has values for the row with the index i
    s = row["ST"] # 
    if s[0:2] == "F-":
        df.loc[i, "ST"] = s[4:] 

df["ST"].unique()


array(['NC', 'NY', 'IL', 'PA', 'LA', 'CT', 'CA', 'OH', 'MS', 'TX', 'WA',
       'WI', 'MO', 'VA', 'MI', 'RI', 'OK', 'MD', 'TN', 'FL', 'NJ', 'MA',
       'WV', 'SC', 'NV', 'KS', 'OR', 'AR', 'UT', 'GA', 'MT', 'DC', 'IN',
       'PR', 'KY', 'AL', 'DE', 'CO', 'WY', 'NM', 'IA', 'AZ', 'VT', 'NE',
       'HI', 'ID'], dtype=object)

In [77]:
# add a new column with duration (Exonerated - Convicted)
df['Duration'] = df.insert(5, 'Duration', 0)
df["Duration"] = df["Exonerated"] - df["Convicted"]
df

ValueError: cannot insert Duration, already exists

In [69]:
# For DNA use True and False, not DNA and NaN 
for i,row in df.iterrows(): 
    #print (i, list(row))  # DEBUG: row has values for the row with the index i
    if row["DNA"] == "DNA":
        df.loc[i, "DNA"] = True   # short for  blue
    else:
        df.loc[i, "DNA"] = False
df

Unnamed: 0,Name,Age,Race,Crime,Sentence,Duration,Convicted,Exonerated,DNA,ST
0,Joseph Abbitt,31,Black,Child Sex Abuse,Life,14,1995,2009,True,NC
2,Warith Habib Abdal,43,Black,Sexual Assault,20 to Life,16,1983,1999,True,NY
3,Christopher Abernathy,17,White,Murder,Life without parole,28,1987,2015,True,IL
4,Quentin Abney,32,Black,Robbery,20 to Life,6,2006,2012,False,NY
11,Don Ray Adams,32,Black,Murder,Life,19,1992,2011,False,PA
...,...,...,...,...,...,...,...,...,...,...
3306,Anthony Yarbough,18,Black,Murder,75 to life,20,1994,2014,True,NY
3310,Kenneth York,38,White,Sexual Assault,Life without parole,16,1994,2010,True,MO
3313,Paul Young,17,Black,Murder,Life without parole,34,1987,2021,False,MI
3314,"Dan Young, Jr.",30,Black,Murder,Life without parole,11,1994,2005,True,IL


In [78]:
# re-build a 0 to N continuous index
df = df.reset_index(drop=True)
df.index.name = "#"
df

Unnamed: 0_level_0,Name,Age,Race,Crime,Sentence,Duration,Convicted,Exonerated,DNA,ST
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Joseph Abbitt,31,Black,Child Sex Abuse,Life,14,1995,2009,True,NC
1,Warith Habib Abdal,43,Black,Sexual Assault,20 to Life,16,1983,1999,True,NY
2,Christopher Abernathy,17,White,Murder,Life without parole,28,1987,2015,True,IL
3,Quentin Abney,32,Black,Robbery,20 to Life,6,2006,2012,False,NY
4,Don Ray Adams,32,Black,Murder,Life,19,1992,2011,False,PA
...,...,...,...,...,...,...,...,...,...,...
1018,Anthony Yarbough,18,Black,Murder,75 to life,20,1994,2014,True,NY
1019,Kenneth York,38,White,Sexual Assault,Life without parole,16,1994,2010,True,MO
1020,Paul Young,17,Black,Murder,Life without parole,34,1987,2021,False,MI
1021,"Dan Young, Jr.",30,Black,Murder,Life without parole,11,1994,2005,True,IL


In [71]:
# Save to new file
df.to_csv("filtered.csv")

In [84]:
# Aggregate (group) by ST:  

# https://pandas.pydata.org/pandas-docs/stable/getting_started/intro_tutorials/06_calculate_statistics.html#aggregating-statistics-grouped-by-category

# note that ST will be the index
ST_count = df[["ST","Name"]].groupby("ST").count()
ST_count = ST_count.rename(columns={"Name": "Count"})

ST_durantion_mean = df[["ST","Duration"]].groupby("ST").mean().round(1)
# more

# make dataframe from above
ST_df = pd.concat([ST_count,ST_durantion_mean], axis=1)
ST_df

Unnamed: 0_level_0,Count,Duration
ST,Unnamed: 1_level_1,Unnamed: 2_level_1
AL,9,10.1
AR,5,15.4
AZ,1,9.0
CA,123,14.2
CO,6,8.5
CT,4,17.2
DC,13,29.3
DE,1,38.0
FL,33,17.6
GA,32,15.5


In [1]:
# Run the graphing app inside jupyter

# Import packages
from dash import Dash, html, dash_table, dcc, callback, Output, Input
import pandas as pd
import plotly.express as px

# Incorporate data
df = pd.read_csv('filtered.csv')
num_rows = len(df)

# Initialize the app
app = Dash(__name__)

# App layout
app.layout = html.Div([
    html.Div(children=f'Total of {num_rows} entries, select a Histogram ',  style={'color': 'Gold', 'font-size': 20}),
    html.Hr(),

    # CH: value must be an existing column, you had lifeExp which is not a valid column name
    dcc.RadioItems(options=['Race', 'Crime', 'ST', 'Age', 'Duration', 'DNA' ], value='Race', 
                   inline=True, 
                   id='controls-and-radio-item',
                   style={'color': 'Gold', 'font-size': 20}),
    dcc.Graph(figure={}, id='controls-and-graph')
])

# Add controls to build the interaction
@callback(
    Output(component_id='controls-and-graph', component_property='figure'),
    Input(component_id='controls-and-radio-item', component_property='value')
)


def update_graph(col_chosen):
    fig = px.histogram(df, x=col_chosen)
    return fig

def updateTable(n):
     pass

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)
