In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Input CSV files; file1/file2 are the group-level reports and file3/file4 the
# individual-level ones, as their names indicate.
file1 = 'Physician_Compare_2015_Group_Public_Reporting_-_Patient_Experience.csv'
file2 = 'Physician_Compare_2015_Group_Public_Reporting___Performance_Scores.csv'
file3 = 'Physician_Compare_2015_Individual_EP_Public_Reporting___Performance_Scores.csv'
file4 = 'Physician_Compare_National_Downloadable_File.csv'

# Load every file with pandas' default CSV settings; dfK holds the contents of fileK.
df1, df2, df3, df4 = [
    pd.read_csv(csv_path) for csv_path in (file1, file2, file3, file4)
]

#Collect the distinct NPI values found in file3 and file4 (the individual-level tables)
x1 = pd.unique(df3['NPI'])
x2 = pd.unique(df4['NPI'])

#Sorted union of the two arrays: an NPI present in both tables is kept once
x = np.union1d(x1, x2)

#Number of distinct NPIs across both tables
N = x.size
print('The number of clinicians with unique NPI=',N)

#Count the male and female clinicians in file4 and report the male/female ratio.
#NOTE(review): df4 is de-duplicated on NPI so that a clinician listed on several rows
#is counted once -- confirm that df4 can repeat an NPI.
#Figures copied by hand from an earlier run: Male_count = 1674452, female_count = 1280809
#(NOTE(review): probably per-row counts taken before de-duplication -- confirm).
gender_counts = df4.drop_duplicates(subset='NPI')['Gender'].value_counts()

#Print explicitly: a bare expression in the middle of a script/cell displays nothing.
print(gender_counts)

#NOTE(review): assumes Gender is coded 'M' / 'F' -- confirm against the CSV.
if 'M' in gender_counts.index and 'F' in gender_counts.index:
    print('The ratio of male to female clinicians =', gender_counts['M'] / gender_counts['F'])
else:
    #Unknown gender labels: fall back to reading the ratio off the printed counts.
    print('The ratio of male to female clinicians was calculated using the display manually')

#Calculate the highest ratio of female to male clinicians among all credentials.
print('In this case I excluded credential SCW which has no male clinicians')

#Number of clinicians for every (Credential, Gender) pair.
#NOTE(review): df4 is de-duplicated on NPI so that a clinician listed on several rows
#is counted once -- confirm that df4 can repeat an NPI.
xx = df4.drop_duplicates(subset='NPI').groupby(['Credential','Gender'])['NPI'].count()
#print(xx)

#One row per credential and one column per gender; a pair with no clinicians is NaN.
counts_by_gender = xx.unstack('Gender')

#Keep only credentials that have both female and male clinicians. This replaces the
#former deletion of hard-coded positions (which also read an undefined `xx_array`)
#and stays correct when the set of credentials changes.
#NOTE(review): assumes Gender is coded 'F' / 'M' -- confirm against the CSV.
both_genders = counts_by_gender[['F','M']].dropna()

#Female-to-male ratio per credential; `divide` keeps its original list form.
female_to_male = both_genders['F'] / both_genders['M']
divide = female_to_male.tolist()
print('Maximum ratio of female to male clinicians for a given cred.=',female_to_male.max())

#States with fewer than 10 health care facilities. The two group-level files are
#inspected separately to get an idea.
#NOTE(review): the state column is keyed ' State' (leading space) in df1 but 'State'
#in df2 -- confirm both headers against the CSV files.
df_1 = df1.groupby( [' State'] ).count()
df_2 = df2.groupby( ['State'] ).count()

#Count distinct 'Group PAC ID' values per state rather than rows, so a facility that
#appears on several rows is counted once (each distinct ID is treated as one facility).
facilities_1 = df1.groupby(' State')['Group PAC ID'].nunique()
facilities_2 = df2.groupby('State')['Group PAC ID'].nunique()
few_facilities_1 = facilities_1[facilities_1 < 10]
few_facilities_2 = facilities_2[facilities_2 < 10]

#Print explicitly: a bare expression in the middle of a script/cell displays nothing,
#so the message below would otherwise refer to a display that never appears.
print(few_facilities_1)
print(few_facilities_2)
print('Count the states with less than 10 healthcare facilities from the display above, this was done manually')
print('Number of states with fewer than 10 distinct Group PAC IDs: file1 =',
      len(few_facilities_1), ', file2 =', len(few_facilities_2))

#Standard deviation of the distribution of per-clinician mean performance rates.
#Keep only the rows of file3 whose measure performance rate is greater than 10.
df_3 = df3[df3['Measure Performance Rate']>10.0]

#Mean performance rate per NPI. Only the rate column is averaged: calling mean() on
#every column fails on recent pandas versions when the frame has non-numeric columns.
df_4 = df_3.groupby('NPI')[['Measure Performance Rate']].mean()

#Sample standard deviation (pandas default, ddof=1) of those per-NPI means.
#Printed explicitly because a bare expression mid-script displays nothing.
std_of_means = df_4['Measure Performance Rate'].std()
print('Standard deviation of the per-NPI mean performance rates =', std_of_means)

#Absolute difference in average performance rate between MDs and nurse practitioners.
#The filtered scores (df_3) are merged with the clinicians of each credential on NPI.

#One row per MD. drop_duplicates enforces the "unique NPI" the merge relies on;
#without it every score row is repeated once per df4 row of the same clinician.
#NOTE(review): confirm that df4 can repeat an NPI.
df_5 = df4[df4['Credential']=='MD'].drop_duplicates(subset='NPI')
#Right merge keeps every MD; MDs without a score in df_3 get NaN rates.
df_6 = pd.merge(df_3,df_5,on='NPI',how='right')

#Same for clinicians with credential 'NP'.
df_7 = df4[df4['Credential']=='NP'].drop_duplicates(subset='NPI')
df_8 = pd.merge(df_3,df_7,on='NPI',how='right')

#Every row of df_6 / df_8 already carries the right credential (the right-hand frame
#was filtered before merging), so no further filtering is needed.
df_9 = df_6
df_10 = df_8

print('Absolute difference in average performance rates between MD and NP is')
#Series.mean() skips the NaN rates of clinicians that have no score.
print(np.abs(df_9['Measure Performance Rate'].mean() - df_10['Measure Performance Rate'].mean()))

#Two-sample t-test comparing the MD and NP performance rates.
MD_sample = np.asarray(df_9['Measure Performance Rate'], dtype=float)
NP_sample = np.asarray(df_10['Measure Performance Rate'], dtype=float)

#Drop the NaN entries (clinicians without a score) with a boolean mask instead of
#turning them into zeros and trimming a sorted array.
MD_sample_clean = MD_sample[~np.isnan(MD_sample)]
NP_sample_clean = NP_sample[~np.isnan(NP_sample)]

#Sample sizes, for information only.
print(len(MD_sample_clean),len(NP_sample_clean))

#ttest_ind accepts samples of different sizes, so both full samples are used.
#Truncating the sorted MD sample to the NP length kept only the lowest MD scores
#and biased the test.
t_test = stats.ttest_ind(MD_sample_clean,NP_sample_clean,equal_var=True)
print('t_test=',t_test)

#Linear regression of mean performance rate against graduation year for MDs.

#MDs that graduated between 1973 and 2003 (both inclusive) according to file4.
#The year is converted once; between() is False for missing years.
#NOTE(review): de-duplicated on NPI so a clinician listed on several df4 rows does
#not have scores repeated by the merge -- confirm that df4 can repeat an NPI.
grad_year = pd.to_numeric(df4['Graduation year'])
df_11 = df4[(df4['Credential']=='MD') & grad_year.between(1973, 2003)].drop_duplicates(subset='NPI')

#Attach the scores; df_3 only holds performance rates greater than 10.
df_12 = pd.merge(df_3,df_11,on='NPI',how='right')

#Mean performance rate per graduation year. Only the rate column is averaged: calling
#mean() on every column fails on recent pandas versions for non-numeric columns.
df_13 = df_12.groupby('Graduation year')[['Measure Performance Rate']].mean()

#Use the years actually present in the data as the x-axis and skip years without any
#score, instead of assuming every year from 1973 to 2003 has a value.
yearly_rate = df_13['Measure Performance Rate'].dropna()
MD_performances = np.asarray(yearly_rate, dtype=float)
graduation_year = np.asarray(pd.to_numeric(yearly_rate.index), dtype=float)

linear_regression = stats.linregress(graduation_year,MD_performances)
print('linear regression=',linear_regression)