In [None]:
%matplotlib inline

States and Screeners
==================

Where a patient lives influences how likely they are to be screened 
for cervical cancer.

In [None]:
import numpy as np 
import pandas as pd 
import pylab as plt
import matplotlib as mpl
import sqlite3
from scipy import stats

In [None]:
# Pull the state and screener flag of every training patient from the database.
db = sqlite3.connect('../input/database.sqlite')
sdf = pd.read_sql_query("SELECT patient_state, is_screener FROM patients_train;",db)
train_by_state = sdf.groupby('patient_state')

# Fraction of each state's training patients that are screeners, highest first.
pdf = train_by_state.apply(
    lambda grp: grp.is_screener.sum()/grp.shape[0]
).sort_values(ascending = False)

# Number of training patients per state, smallest state first.
portion_df = train_by_state.size().sort_values()
# Rescale so the state with the fewest training patients counts as 1
# (the original author notes that this state is ND).
rel_portion_df = portion_df/portion_df.min()

# Same relative scale for the test set. Dividing by each set's own minimum is
# only comparable because, per the original note, ND is the smallest state in
# both the training and the test data.
tdf = pd.read_sql_query("SELECT patient_state FROM patients_test;",db)
test_portion_df = tdf.groupby('patient_state').size().sort_values()
rel_test_portion_df = test_portion_df/test_portion_df.min()



A simple look at the percentage of screeners by state shows that, 
if we can assume the selection criteria for patients included in 
the data are the same across states, the probability that a patient 
is screened differs significantly across many states.

In [None]:
# Horizontal bar chart of the screening percentage in each state (bars coloured
# by that percentage), with 95% binomial confidence intervals as error bars.
cm = plt.get_cmap('RdBu')
colors = [cm(x) for x in pdf.values]
pos = 1.5*np.arange(len(colors)) + 1.5
fig = plt.figure(figsize=(17,12))
# `axisbg` was removed from Matplotlib; `facecolor` is its replacement.
ax = fig.add_subplot(111, facecolor='w', frame_on=False)
fig.suptitle('Percentage of screeners by state', fontsize=20)

# 95% interval for the screener count in each state (binomial with the state's
# patient count and observed rate), rescaled to a proportion of patients.
n_patients = portion_df.loc[pdf.index].values
ci_low, ci_high = stats.binom.interval(.95, n_patients, pdf.values)
ci_low, ci_high = ci_low/n_patients, ci_high/n_patients
conf_ints = list(zip(ci_low, ci_high))

# xerr is measured outward from the end of each bar, so pass the distance from
# the estimate down to the lower bound and up to the upper bound as a (2, N)
# array. Passing the full interval width as a symmetric error would draw
# whiskers twice as wide as the interval. Clip guards against tiny negative
# values, which Matplotlib rejects.
errs = 100*np.clip(np.vstack([pdf.values - ci_low, ci_high - pdf.values]), 0, None)

ax.barh(pos, 100*pdf.values, height = 1.2, align = 'center', color = colors,
        tick_label = pdf.index, xerr = errs)
plt.show()

Which State Should We Choose?
--------------------------

If we could only choose one state in which to launch a campaign to 
attempt to increase the screening rate, which state should we choose? 
The differing lengths of the confidence intervals indicate that the number of 
patients sampled from each state varies widely. While the current screening rate 
should be a factor, we should also take into account the relative population sizes.

To illustrate, Hawaii with the lowest screening rate at 38.2% has 4185 
patients in the training set. A 10 percentage point improvement in Hawaii's screening
rate would result in 419 more screeners in the training set. New York with 
the highest screening rate at 74.2% has 69314 patients in the training set, 
meaning that a **1 percentage point improvement** in New York's rate would result in 693 more screeners
in the training set. 

Ideally we would want to choose a state with both a relatively large number of
patients and a relatively low screening rate. 


In [None]:
# Two-panel figure of the relative number of patients per state (smallest
# state = 1): training data on the left, coloured by each state's screening
# rate, and test data on the right in a single colour.

#Bar colors based on the screener rate
cm = plt.get_cmap('RdBu')
colors = [cm(pdf.loc[x]) for x in rel_portion_df.index]

#Plot the training data
fig = plt.figure(figsize=(17,12))
# `axisbg` was removed from Matplotlib; `facecolor` is its replacement.
ax = fig.add_subplot(121, facecolor='w', frame_on=False)
# Pass the axes explicitly rather than relying on the "current axes" state.
p1 = rel_portion_df.plot(kind='barh', color = colors, ax = ax)
t1 = ax.set_title('Training Data')
y1 = ax.set_ylabel('State')

#Adding in a color bar for the training data
num_colors = 20
pct_lo = np.floor(100*pdf.min())
pct_hi = np.ceil(100*pdf.max())
# num_colors bins need num_colors + 1 edges; using only num_colors edges
# leaves one colour without a bin and shifts every tick against its colour.
bounds = np.linspace(pct_lo, pct_hi, num_colors + 1)
# One swatch per bin, sampled from the bar colormap at the bin's midpoint.
blues = [cm(mid/100) for mid in 0.5*(bounds[:-1] + bounds[1:])]
cmap = mpl.colors.ListedColormap(blues)
# BoundaryNorm maps each [edge, next edge) range onto exactly one listed colour.
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
ax2 = fig.add_axes([0.39, 0.2, 0.03, 0.5]) #[left,bottom,width,height]
cb = mpl.colorbar.ColorbarBase(ax2, cmap=cmap, norm=norm, ticks=bounds,
                               format='%1i', label='screener percentage')
cb.ax.set_yticklabels([str(round(i, 2)) for i in bounds])

#Plot the testing data using the average screener rate for all the training data to set the color
ax3 = fig.add_subplot(122, facecolor='w', frame_on=False)
# Reindex by the training order so both panels list the states identically.
p2 = rel_test_portion_df.loc[portion_df.index].plot(kind = 'barh',color = cm(sdf.is_screener.sum()/sdf.shape[0]), ax = ax3)
t2 = ax3.set_title('Test Data')
y2 = ax3.set_ylabel('State')
t3 = fig.suptitle('Relative number of patients in each state, ND = 1',fontsize = 20)

Based on these criteria we could certainly argue for California, Washington, Florida, 
and Georgia, all of which have relatively low screening rates (listed below) and relatively 
large numbers of patients.

In [None]:
# Screening proportions of the four candidate states discussed above.
candidate_states = ['CA','WA','FL','GA']
print(pdf.loc[candidate_states])


Repeating the above plot for the training data using the actual number of patients 
instead of the relative number of patients, we can get an idea of the scope for 
improvement.

In [None]:

# Bar chart of the absolute number of training patients per state, with each
# bar coloured by that state's screening rate and a colorbar as the legend.
fig = plt.figure(figsize=(17,12))
# `axisbg` was removed from Matplotlib; `facecolor` is its replacement.
ax = fig.add_subplot(111, facecolor='w', frame_on=False)

cm = plt.get_cmap('RdBu')
colors = [cm(pdf.loc[x]) for x in portion_df.index]
# Draw on the axes created above; the figure is already sized, so no figsize here.
portion_df.plot(kind='barh', grid=False, color=colors, ax=ax)
ax.set_title('Actual number of patients in the training data by state',fontsize = 20)
ax.set_ylabel('State')

num_colors = 20
pct_lo = np.floor(100*pdf.min())
pct_hi = np.ceil(100*pdf.max())
# num_colors bins need num_colors + 1 edges; using only num_colors edges
# leaves one colour without a bin and shifts every tick against its colour.
bounds = np.linspace(pct_lo, pct_hi, num_colors + 1)
# One swatch per bin, sampled from the bar colormap at the bin's midpoint.
blues = [cm(mid/100) for mid in 0.5*(bounds[:-1] + bounds[1:])]
cmap = mpl.colors.ListedColormap(blues)
# BoundaryNorm maps each [edge, next edge) range onto exactly one listed colour.
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
ax2 = fig.add_axes([0.75, 0.2, 0.03, 0.5]) #[left,bottom,width,height]
cb = mpl.colorbar.ColorbarBase(ax2, cmap=cmap, norm=norm, ticks=bounds,
                               format='%1i', label='screener percentage')
cb.ax.set_yticklabels([str(round(i, 2)) for i in bounds])


plt.show()
