In [None]:
# Mount Google Drive into the Colab runtime so the project files can be read from disk.
from google.colab import drive

# Interactive step: prompts for an authorization code, then mounts at /content/drive.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# some basic imports
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy import stats
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import seaborn as sns

  import pandas.util.testing as tm


In [None]:
# Sanity check: list the project data folder on the mounted Drive
# (Drive files appear under "/content/drive/My Drive" once the mount cell has run).
!ls "/content/drive/My Drive/Jordan/Data"

all_data_takaful_and_commercial.dta  Matched_drugs5.csv
choice_drug_not_matched1.csv	     new_choice_drugs.csv
choice_drug_not_matched2.csv	     Not_matched_drugs2.csv
classes.dta			     Not_matched_drugs3.csv
combined_data.dta		     outcome.csv
drug_data.dta			     percentile_scores2.csv
Drug_search.xlsx		     s_stats.csv
final_data.csv			     transformed_data5.csv
Matched_drugs2.gsheet		     unique_drug_freq.csv
Matched_drugs4.csv


In [None]:
# Make the data folder the working directory so later cells can read files by bare name
%cd /content/drive/My\ Drive/Jordan/Data

/content/drive/My Drive/Jordan/Data


In [None]:
class DataShell:
  """Carries the insurance-claims data through the PRIDIT (RIDIT + PCA) pipeline.

  Each step stores its result on the instance and the next step reads it, so the
  methods must be called in order:
    read_data -> feature_selection -> get_cont_vars / get_cat_vars ->
    convert_to_RIDIT -> concat_df1 -> transform_comb_data / sep_*_data -> PCA_sub_data
  The *_reg methods are a parallel pipeline that keeps SettlemenPeriod as a label.

  All column selections are positional (iloc); they are only valid for an input file
  with the same column layout as the one this notebook loads.
  """

  def __init__(self):
    """Initializes the class with some information about the data and the model"""
    self.data_used = 'The data used is the transformed insurance data'
    self.model_used = 'The model used is PRIDIT to obtain scores for the claims and weights for the variables'

  def read_data(self,filename):
    """Reads the data from the current directory
      parameters: filename in the directory
      returns: Dataframe of the data
    """
    self.data = pd.read_csv(filename)
    return self.data

  def get_initial_data(self):
    """Returns the initial dataframe in its current form
       (feature_selection recodes its company_key column in place)"""
    return self.data

  def get_transformed_data(self):
    """Returns the dataframe produced by feature_selection (which must have been called)"""
    return self.data1

  @staticmethod
  def _apply_ridit(frame,columns):
    """Replaces, in place, every listed column of `frame` by its RIDIT score.

       For a category c the score is (share of rows in categories below c) minus
       (share of rows in categories above c), with categories ordered by value.
       Shares are counts divided by the total number of rows in `frame`.
       Only categories that actually occur are used, so the codes need not be
       contiguous integers. Missing values are left missing.
       returns: the same frame object"""
    n_rows = len(frame)
    for col in columns:
      shares = {k:v/n_rows for k,v in frame[col].value_counts().to_dict().items()}
      levels = sorted(shares)
      scores = {}
      for level in levels:
        score = 0
        # accumulate in ascending category order
        for other in levels:
          if other < level:
            score = score + shares[other]
          elif other > level:
            score = score - shares[other]
        scores[level] = score
      # map() substitutes all categories at once, so a score that happens to equal
      # another category code cannot be replaced a second time
      frame[col] = frame[col].map(scores)
    return frame

  def feature_selection(self):
    """Selects from initial dataframe the columns that will be used for the PCA model.
       The numbers represent the column number from the initial dataframe that are selected.
       company_key is recoded in place on the initial dataframe (0.0 -> 1, anything else -> 2).
       The recode is applied only once per loaded dataframe, so calling this method again
       does not turn every row into 2.
       returns: a copy of the selected columns"""
    if getattr(self,'_recoded_frame',None) is not self.data:
      self.data['company_key'] = np.where(self.data['company_key'] == 0.0,1,2)
      self._recoded_frame = self.data   # remember which frame object was already recoded
    # copy() so later column assignments do not write through a slice of self.data
    self.data1 = self.data.iloc[:,[1,3,4,7,8,9,12,13,14,15,16,17,18,20,21,22,23,25,26]].copy()
    return self.data1

  def get_cont_vars(self):
    """returns: a dataframe of the continuous variables (positional subset of the selected data)"""
    self.cont = self.data1.iloc[:,[10,11,12,15,16]].copy()
    return self.cont

  def get_cat_vars(self):
    """returns: a dataframe of the categorical variables (positional subset of the selected data)"""
    self.cat = self.data1.iloc[:,[0,1,2,3,4,5,6,7,8,9,14,17,18]].copy()
    self.cat_cols = list(self.cat.columns)
    return self.cat

  def convert_to_RIDIT(self):
    """Converts the categorical variables into RIDIT scores (in place on the categorical frame)
       returns: The RIDIT transformed dataframe"""
    return self._apply_ridit(self.cat,self.cat_cols)

  def _combined(self):
    """Returns the frame built by concat_df1, with a clear error if it was not built yet"""
    try:
      return self.comb_df
    except AttributeError:
      raise RuntimeError('concat_df1() must be called before the combined dataframe is used') from None

  def _combined_reg(self):
    """Returns the frame built by concat_df1_reg, with a clear error if it was not built yet"""
    try:
      return self.comb_df_reg
    except AttributeError:
      raise RuntimeError('concat_df1_reg() must be called before the combined dataframe is used') from None

  def transform_comb_data(self):
    """Standardizes the entire combined dataframe
       returns: the standardized dataframe (fresh 0..n-1 index, same column names)"""
    combined = self._combined()
    self.comb_temp = StandardScaler().fit_transform(combined)
    self.comb_temp = pd.DataFrame(self.comb_temp,columns = combined.columns)
    return self.comb_temp

  def PCA_sub_data(self,data,random_state=0):
    """Does PCA analysis for the sub-groups
       parameters: 
              data: the dataframe to undergo PCA (standardized here before fitting)
              random_state: seed passed to PCA so that a randomized SVD solver,
                            if scikit-learn selects one, gives repeatable results
       returns: weights of the variables, scores of the claim files from the analysis"""
    self.tr_data = StandardScaler().fit_transform(data)
    self.tr_data = pd.DataFrame(self.tr_data,columns = data.columns)
    self.pca = PCA(n_components=1,random_state=random_state)
    self.pcs = self.pca.fit_transform(self.tr_data)
    self.scores = pd.DataFrame(self.pcs,columns=['Scores'])
    self.weights = pd.DataFrame(self.pca.components_,columns=data.columns)
    return self.weights,self.scores

  def concat_df1(self):
    """Concatenates the categorical and continuous dataframes.
       The result is kept in self.comb_df rather than in an attribute named like this
       method, so the method stays callable when the cell is run again.
       returns: the concatenated dataframe"""
    self.comb_df = pd.concat([self.cat,self.cont],axis=1)
    return self.comb_df

  def sep_takaful_data(self):
    """Separates the commercial and takaful dataframes for sub-analysis.
       Rows are split on the sign of the RIDIT-coded company_key column.
       returns: the commercial dataframe, takaful dataframe"""
    combined = self._combined()
    self.commercial_data = combined[combined['company_key'] < 0]
    self.commercial_data = self.commercial_data.drop(columns=['company_key'])
    self.takaful_data = combined[combined['company_key'] > 0]
    self.takaful_data = self.takaful_data.drop(columns=['company_key'])
    return self.commercial_data,self.takaful_data

  def sep_religion_data(self):
    """Separates the muslim and non muslim dataframes for sub-analysis.
       Rows are split on the sign of the RIDIT-coded religion column.
       returns: the muslim, non-muslim dataframes"""
    combined = self._combined()
    self.muslim_data = combined[combined['religion'] <0]
    self.muslim_data = self.muslim_data.drop(columns=['religion'])
    self.non_muslim_data = combined[combined['religion'] >0]
    self.non_muslim_data = self.non_muslim_data.drop(columns=['religion'])
    return self.muslim_data,self.non_muslim_data
  
  def sep_visittype_data(self,inpat_range=(-0.86,-0.85),outpat_min=0.07):
    """Separates the inpatient and outpatient dataframes
       parameters:
              inpat_range: open interval (low, high) of RIDIT-coded visittype_fob values kept as inpatient
              outpat_min: rows with RIDIT-coded visittype_fob above this value are kept as outpatient
       The defaults are the values originally hard-coded here; they presumably match the
       RIDIT scores of the visit-type categories in this notebook's data - verify before
       reusing on another dataset.
       returns: the inpatient, outpatient dataframes"""
    combined = self._combined()
    low,high = inpat_range
    self.inpat_data = combined[(combined['visittype_fob']>low)&(combined['visittype_fob']<high)]
    self.inpat_data = self.inpat_data.drop(columns=['visittype_fob'])
    self.outpat_data = combined[combined['visittype_fob'] > outpat_min]
    self.outpat_data = self.outpat_data.drop(columns=['visittype_fob'])
    return self.inpat_data,self.outpat_data

  def sep_developement_data(self):
    """Separates the developed and undeveloped dataframes.
       Rows are split on the sign of the RIDIT-coded developement column.
       returns: the undeveloped dataframe, developed dataframe"""
    combined = self._combined()
    self.undev_data = combined[combined['developement']<0]
    self.undev_data = self.undev_data.drop(columns=['developement'])
    self.dev_data = combined[combined['developement']>0]
    self.dev_data = self.dev_data.drop(columns=['developement'])
    return self.undev_data,self.dev_data

  def plot_hist(self,data,category):
    """Plots the histogram for PRIDIT scores (standardized first, shown on [-3,3] with 20 bins)
       parameters:
              data: the scores to be plotted
              category: the category the scores belong to
    """
    self.Stan_scores = StandardScaler().fit_transform(data)
    plt.hist(self.Stan_scores,range=[-3,3],bins=20)
    plt.xlabel('PRIDIT scores')
    plt.ylabel('frequency')
    plt.title('Histogram for {} scores'.format(category))

  def summary_stats(self,data):
    """Creates summary stats for the scores
       parameters:
              data: the scores for which summary stats are produced
       returns: a dataframe of the summary stats (the output of describe())"""
    self.summary_df = pd.DataFrame(data.describe())
    return self.summary_df

  def sep_data_for_ttest(self,data,threshold):
    """Separates data for suspicious claims analysis
       parameters:
              data: The subgroup scores which need to be separated
              threshold: The threshold used for separation
       returns: the rows of data whose 'Scores' value is below the threshold"""
    return data[data['Scores']<threshold]

  def statistical_sig_test(self,data1,data2):
    """Standardizes the scores and performs a two-sample t-test.
       Both groups are scaled with ONE scaler fitted on the pooled data. Scaling each
       group on its own would centre both at zero and force the t statistic to ~0
       whatever the data, while a common scaling leaves the t statistic unchanged.
       parameters:
              data1: 1st subgroup to be compared
              data2: The subgroup to be compared to
       returns: the scipy ttest_ind result (also printed)"""
    first = np.asarray(data1)
    second = np.asarray(data2)
    scaler = StandardScaler().fit(np.vstack([first,second]))
    self.Stan_scores1 = scaler.transform(first)
    self.Stan_scores2 = scaler.transform(second)
    result = stats.ttest_ind(self.Stan_scores1,self.Stan_scores2)
    print(result)
    return result

  def classify_scores(self,data):
    """Classifies the scores based on thresholds.
       Adds a 'class' column to `data` IN PLACE: 1 for scores <= -2, 2 for (-2,-1],
       3 for (-1,0], 4 for scores above 0.
       parameters:
              data: the dataframe containing the scores to be classified
       returns: the same dataframe object"""
    bins = [-np.inf,-2,-1,0,np.inf]
    names = [1,2,3,4]
    data['class'] = pd.cut(data['Scores'],bins=bins,labels=names)
    return data

  def suspicious_claims_stats(self,data,category):
    """Builds a one-row table of size statistics for a group of scores
       parameters:
              data: dataframe with a 'Scores' column
              category: label used as the row index
       returns: dataframe with the number of rows, 0.1 * (number of scores below -1)
                and the number of scores below 0
       NOTE(review): the middle column is labelled 'Size of 10% data' but is computed from
       the count of scores below -1, not from len(data) - confirm which one is intended."""
    df = pd.DataFrame(np.array([[len(data),0.1*len(data[data['Scores']<-1]),len(data[data['Scores']<0])]]),columns=['Data size','Size of 10% data','Scores <0 size'],index=[category])
    return df

  def regression_feature_selection(self):
    """Selects (by position) the columns of the initial dataframe used for the validation regression
       returns: the selected dataframe"""
    self.data_reg = self.data.iloc[:,[3,4,7,8,9,12,13,14,15,16,17,18,20,21,22,23,24,25,26]]
    return self.data_reg

  def remove_na(self):
    """Drops the rows with a missing SettlemenPeriod and stores column 16 of the result
       as the regression label (self.label, index reset to 0..n-1)
       returns: the filtered regression dataframe"""
    self.data_reg = self.data_reg[self.data_reg['SettlemenPeriod'].notna()]
    self.label = self.data_reg.iloc[:,[16]].reset_index(drop=True)
    return self.data_reg

  def get_cont_vars_reg(self):
    """returns: a dataframe of the continuous variables of the regression data"""
    self.cont_reg = self.data_reg.iloc[:,[9,10,11,14,15]].copy()
    return self.cont_reg

  def get_cat_vars_reg(self):
    """returns: a dataframe of the categorical variables of the regression data"""
    self.cat_reg = self.data_reg.iloc[:,[0,1,2,3,4,5,6,7,8,13,17,18]].copy()
    self.cat_cols_reg = list(self.cat_reg.columns)
    return self.cat_reg

  def convert_to_RIDIT_reg(self):
    """Converts the categorical variables of the regression data into RIDIT scores
       returns: The RIDIT transformed dataframe"""
    return self._apply_ridit(self.cat_reg,self.cat_cols_reg)

  def concat_df1_reg(self):
    """Concatenates the categorical and continuous regression dataframes.
       The result is kept in self.comb_df_reg so this method stays callable on re-run.
       returns: the concatenated dataframe"""
    self.comb_df_reg = pd.concat([self.cat_reg,self.cont_reg],axis=1)
    return self.comb_df_reg

  def transform_comb_data_reg(self):
    """Standardizes the combined regression dataframe and appends the (unscaled) label column
       returns: the standardized dataframe with the label as last column"""
    combined = self._combined_reg()
    self.comb_temp_reg = StandardScaler().fit_transform(combined)
    self.comb_temp_reg = pd.DataFrame(self.comb_temp_reg,columns = combined.columns)
    # both frames carry a fresh 0..n-1 index, so the label lines up row by row
    self.comb_temp_reg = pd.concat([self.comb_temp_reg,self.label],axis=1)
    return self.comb_temp_reg

  def get_percentile(self,data,category):
    """Looks up the scores located 10% and 90% of the way through the sorted scores
       parameters:
              data: dataframe with a 'Scores' column
              category: label used as the row index
       returns: one-row dataframe with columns '0.1%' and '0.9%'
       NOTE(review): the column labels read as 0.1% / 0.9%, but the values are taken at
       positions int(0.1*n) and int(0.9*n), i.e. roughly the 10th and 90th percentiles."""
    k = data.sort_values(['Scores'])
    k.reset_index(inplace=True,drop=True)
    df_s = pd.DataFrame(k.loc[int(len(k)*0.1),'Scores'],index=[category],columns=['0.1%'])
    df_l = pd.DataFrame(k.loc[int(len(k)*0.9),'Scores'],index=[category],columns=['0.9%'])
    return pd.concat([df_s,df_l],axis=1)

  def plot_hist_groups(self,data1,data2,category1,category2):
    """Overlays the histograms of two groups of scores (range [-5,5], 20 bins, half transparent)
       parameters:
              data1, data2: the two groups of scores
              category1, category2: legend labels for the two groups"""
    plt.hist(np.array(data1),range=[-5,5],bins=20,alpha=0.5,label = category1)
    plt.hist(np.array(data2),bins=20,range=[-5,5],alpha=0.5,label = category2)
    plt.xlabel('PRIDIT scores')
    plt.ylabel('Frequency')
    plt.legend(loc='upper right')
    plt.title('Histogram for {} vs {}'.format(category1,category2))

  def plot_kde_groups(self,data1,data2,category1,category2):
    """Overlays the kernel density estimates of two groups of scores (clipped to [-5,5])
       parameters:
              data1, data2: dataframes with a 'Scores' column
              category1, category2: legend labels for the two groups
       NOTE(review): newer seaborn releases may expect fill= instead of shade= - confirm
       against the installed version."""
    sns.kdeplot(data1.Scores,shade=True,clip=(-5,5),label=category1)
    sns.kdeplot(data2.Scores,shade=True,clip=(-5,5),label=category2)
    plt.xlabel('PRIDIT scores')
    plt.ylabel('Density')
    plt.legend(loc='upper right')
    # this is a density plot, so it is no longer titled as a histogram
    plt.title('Density plot for {} vs {}'.format(category1,category2))

  



In [None]:
# Create the pipeline helper and load the transformed claims file from the working directory.
# `data` is the same DataFrame object the helper keeps as data_shell.data.
data_shell = DataShell()
data = data_shell.read_data('transformed_data5.csv')
#data.head()
print("The data has ",data.shape[0]," rows and ",data.shape[1]," atrributes")

The data has  633042  rows and  27  atrributes


  if self.run_code(code, result):


In [None]:
# Preview the first rows of the raw data.
# NOTE(review): the rendered preview includes the `beneficiary` name column (personal data);
# clear this cell's output before sharing the notebook.
data.head()

Unnamed: 0.1,Unnamed: 0,company_key,beneficiary,gender,maritalstatus,age,nationality,dependency,benef_status,networktype,reportingauthority,los,visittype_fob,chronic,claimtype,providertype,totalcimaiedam,totalpayable,ben_share_approved,icdchapter,college_degree,religion,policy_length,LStay,SettlemenPeriod,developement,age_group
0,0,1.0,Hatem Wahid Fawzy,1,1,54.0,Egypt,1,1,4,HAAD,,4,2,5,3,247.0,197.0,50.0,Diseases of the digestive system (ICD9CM 520-5...,yes,1,366,0.0,,1,3
1,1,1.0,Hatem Wahid Fawzy,1,1,54.0,Egypt,1,1,4,HAAD,,4,2,1,3,725.94,540.75,185.19,"Symptoms, signs and abnormal clinical and labo...",yes,1,366,0.0,10.0,1,3
2,2,1.0,Hatem Wahid Fawzy,1,1,54.0,Egypt,1,1,4,HAAD,,4,2,5,1,93.1,93.099998,0.0,Diseases of the genitourinary system (ICD9CM 5...,yes,1,366,0.0,81.0,1,3
3,3,1.0,Hatem Wahid Fawzy,1,1,54.0,Egypt,1,1,4,HAAD,,4,2,1,2,540.0,0.0,0.0,"Injury, poisoning and certain other consequenc...",yes,1,366,0.0,,1,3
4,4,1.0,Hatem Wahid Fawzy,1,1,54.0,Egypt,1,1,4,HAAD,,4,2,1,2,2178.0,0.0,0.0,"Injury, poisoning and certain other consequenc...",yes,1,366,0.0,,1,3


In [None]:
# Number of claims per company_key value (run before feature_selection recodes this column)
data.company_key.value_counts()

In [None]:
# Validation of results with length of stay and settlement period.
# Builds the regression frame; the calls must run in this order because each step
# reads attributes that the previous step stored on data_shell.
data_reg = data_shell.regression_feature_selection()   # positional column subset of the raw data
data_reg = data_shell.remove_na()                      # keep rows with a non-missing SettlemenPeriod
data_cat_reg = data_shell.get_cat_vars_reg()
data_cont_reg = data_shell.get_cont_vars_reg()
data_catrid_reg = data_shell.convert_to_RIDIT_reg()    # categorical columns -> RIDIT scores
comb_df_reg = data_shell.concat_df1_reg()              # RIDIT-coded + continuous columns
new_df_reg = data_shell.transform_comb_data_reg()      # standardized predictors + raw SettlemenPeriod
new_df_reg.head()

In [None]:
#Linear regression model for validation
# Predictors and response are picked by column label instead of by position, so the
# split cannot silently select the wrong column if the feature list changes.
X = new_df_reg.drop(columns=['SettlemenPeriod'])
Y = new_df_reg['SettlemenPeriod']

In [None]:
# statsmodels' OLS does not add an intercept on its own. The predictors are standardized
# while SettlemenPeriod stays in its raw units, so without a constant the fit is forced
# through the origin and the summary reports no-intercept (uncentered) statistics.
mod = sm.OLS(Y,sm.add_constant(X))
fii = mod.fit()
fii.summary()

In [None]:
# Correlation between settlement period and scores.
# PCA_sub_data standardizes comb_df_reg itself and returns first-component weights and scores;
# the scores carry a fresh 0..n-1 index, like new_df_reg, so corr() pairs the rows by position.
val_weights,val_scores = data_shell.PCA_sub_data(comb_df_reg)
val_scores['Scores'].corr(new_df_reg['SettlemenPeriod'])

In [None]:
# Two-column frame (Scores, SettlemenPeriod) used by the regression plot in the next cell
k = pd.concat([val_scores['Scores'],new_df_reg['SettlemenPeriod']],axis=1)
k

In [None]:
# Scatter plot with fitted regression line of SettlemenPeriod against the scores;
# the x axis is restricted to [-1, 1]
sns.regplot(x='Scores',y='SettlemenPeriod',data=k)
plt.xlim(-1,1)

In [None]:
# Feature selection done for the PCA model.
# company_key is recoded (0.0 -> 1, otherwise 2) and kept; the beneficiary name is not selected.
# SettlemenPeriod and icdchapter are not selected either. college_degree is still present in
# data1 but is left out later, when the categorical / continuous columns are picked.
data1 = data_shell.feature_selection()
data1.head()

Unnamed: 0,company_key,gender,maritalstatus,dependency,benef_status,networktype,visittype_fob,chronic,claimtype,providertype,totalcimaiedam,totalpayable,ben_share_approved,college_degree,religion,policy_length,LStay,developement,age_group
0,2,1,1,1,1,4,4,2,5,3,247.0,197.0,50.0,yes,1,366,0.0,1,3
1,2,1,1,1,1,4,4,2,1,3,725.94,540.75,185.19,yes,1,366,0.0,1,3
2,2,1,1,1,1,4,4,2,5,1,93.1,93.099998,0.0,yes,1,366,0.0,1,3
3,2,1,1,1,1,4,4,2,1,2,540.0,0.0,0.0,yes,1,366,0.0,1,3
4,2,1,1,1,1,4,4,2,1,2,2178.0,0.0,0.0,yes,1,366,0.0,1,3


In [None]:
# The categorical and continuous columns are separated (positional subsets of data1)
data_cont = data_shell.get_cont_vars()
data_cat = data_shell.get_cat_vars()

In [None]:
# The categorical data is converted to RIDIT scores (the helper's categorical frame is updated in place)
data_cat = data_shell.convert_to_RIDIT()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
comb_df = data_shell.concat_df1()   # RIDIT-coded categorical columns followed by the continuous columns
new_df = data_shell.transform_comb_data()   # the combined dataframe, standardized column by column

In [None]:
# Give the standardized columns a '1' suffix so they stay distinguishable from the raw columns
# when both are concatenated later. The list must follow comb_df's column order (categorical
# columns first, then continuous). Note that 'benefstatus1' and 'totalclaimedam1' are spelled
# differently from the raw 'benef_status' and 'totalcimaiedam'.
new_df.columns = ['company_key1','gender1','maritalstatus1','dependency1','benefstatus1','networktype1','visittype_fob1','chronic1','claimtype1','providertype1','religion1',
                  'developement1','age_group1','totalclaimedam1','totalpayable1','ben_share_approved1','policy_length1','LStay1']
new_df.head()

Unnamed: 0,company_key1,gender1,maritalstatus1,dependency1,benefstatus1,networktype1,visittype_fob1,chronic1,claimtype1,providertype1,religion1,developement1,age_group1,totalclaimedam1,totalpayable1,ben_share_approved1,policy_length1,LStay1
0,0.349213,-0.726302,-0.659193,-0.753145,-0.236537,0.5594,0.288772,0.423043,0.282281,1.33555,-0.575527,-0.307386,0.798605,-0.0844,-0.070666,0.082549,0.113339,-0.037852
1,0.349213,-0.726302,-0.659193,-0.753145,-0.236537,0.5594,0.288772,0.423043,-3.752478,1.33555,-0.575527,-0.307386,0.798605,0.138975,0.010456,0.544375,0.113339,-0.037852
2,0.349213,-0.726302,-0.659193,-0.753145,-0.236537,0.5594,0.288772,0.423043,0.282281,-1.20402,-0.575527,-0.307386,0.798605,-0.156178,-0.095185,-0.088257,0.113339,-0.037852
3,0.349213,-0.726302,-0.659193,-0.753145,-0.236537,0.5594,0.288772,0.423043,-3.752478,0.13153,-0.575527,-0.307386,0.798605,0.052254,-0.117156,-0.088257,0.113339,-0.037852
4,0.349213,-0.726302,-0.659193,-0.753145,-0.236537,0.5594,0.288772,0.423043,-3.752478,0.13153,-0.575527,-0.307386,0.798605,0.81621,-0.117156,-0.088257,0.113339,-0.037852


In [None]:
#PCA model is executed
# random_state pins the randomized SVD solver that scikit-learn's default svd_solver='auto'
# can select for a tall frame like this one, so the scores are reproducible across runs.
pca1 = PCA(n_components=1, random_state=0)
PCs1 = pca1.fit_transform(new_df)
Scores1 = pd.DataFrame(data = PCs1, columns = ['Scores'])   #Scores for the entire data is obtained

In [None]:
# One-row frame with the first principal component's loading for every (suffixed) variable
weights1 = pd.DataFrame(pca1.components_,columns=new_df.columns)  #weights for variables for the entire data is obtained
#weights1

In [None]:
# Raw data + standardized variables + overall scores, side by side
final_df = pd.concat([data,new_df,Scores1],axis=1)
#final_df.head()

In [None]:
# Raw data + RIDIT-coded (not standardized) variables + overall scores, side by side
final_df1 = pd.concat([data,comb_df,Scores1],axis=1)
#final_df.to_stata('combined_data_RIDITScores.dta')

In [None]:
#final_df1.to_csv('combined_data_RIDITScores.csv')
#final_df.to_stata('combined_data.dta')

In [None]:
# Standardized variables together with the overall scores
final_df_scores  = pd.concat([new_df,Scores1],axis=1)
#final_df_scores.to_stata('Standardized_variables.dta')

In [None]:
# classify_scores adds a 'class' column to the frame it is given and returns that same frame,
# so final_df_classes and final_df_scores are one object after this cell.
final_df_classes = data_shell.classify_scores(final_df_scores)
#final_df_classes

In [None]:
"""pos=0
neg=0
for i in range(len(non_muslim_Stan_scores)):
  if non_muslim_Stan_scores[i]>=0:
    pos += 1
  if non_muslim_Stan_scores[i]<0:
    neg += 1
pos_prop = pos/len(non_muslim_Stan_scores)*100
neg_prop = neg/len(non_muslim_Stan_scores)*100"""

In [None]:
#import seaborn as sns
#sns.kdeplot(Scores1.Scores,clip=(-3,3))

In [None]:
# Sub-analysis for muslims vs non-muslims: split the combined frame on the sign of the
# RIDIT-coded religion column, then run a separate one-component PCA on each part.
muslim_data,non_muslim_data = data_shell.sep_religion_data()
muslim_weights,muslim_scores = data_shell.PCA_sub_data(muslim_data)
non_muslim_weights,non_muslim_scores = data_shell.PCA_sub_data(non_muslim_data)

In [None]:
# Sub-analysis for takaful vs commercial: split on the sign of the RIDIT-coded company_key
# column, then run a separate one-component PCA on each part.
commercial,takaful = data_shell.sep_takaful_data()
commercial_weights,commercial_scores = data_shell.PCA_sub_data(commercial)
takaful_weights,takaful_scores = data_shell.PCA_sub_data(takaful)

In [None]:
# Sub-analysis for inpatient vs outpatient: split on value ranges of the RIDIT-coded
# visittype_fob column, then run a separate one-component PCA on each part.
inpat,outpat = data_shell.sep_visittype_data()
inpat_weights,inpat_scores = data_shell.PCA_sub_data(inpat)
outpat_weights,outpat_scores = data_shell.PCA_sub_data(outpat)

In [None]:
# Sub-analysis for developed vs undeveloped: split on the sign of the RIDIT-coded developement
# column (the helper returns the undeveloped part first), then run a separate PCA on each part.
undev,dev = data_shell.sep_developement_data()
undev_weights,undev_scores = data_shell.PCA_sub_data(undev)
dev_weights,dev_scores = data_shell.PCA_sub_data(dev)

In [None]:
#Cleaning up of the weights dataframe
# Every sub-group weight row gets a 'variable' label, plus a '-' placeholder in the column
# that was dropped to form that sub-group, so all rows share the same columns when stacked.
for _weights, _label, _split_col in [
    (outpat_weights, 'Outpatient weights', 'visittype_fob'),
    (inpat_weights, 'Inpatient weights', 'visittype_fob'),
    (muslim_weights, 'Muslim weights', 'religion'),
    (non_muslim_weights, 'Non Muslim weights', 'religion'),
    (takaful_weights, 'Takaful weights', 'company_key'),
    (commercial_weights, 'Commercial weights', 'company_key'),
    (dev_weights, 'Developed country weights', 'developement'),
    (undev_weights, 'Undeveloped country weights', 'developement'),
]:
  _weights['variable'] = _label
  _weights[_split_col] = '-'

In [None]:
# Label the all-data row and rename its columns back to the unsuffixed variable names so it
# lines up with the sub-group weight frames when they are stacked
weights1['variable'] = 'All data'
weights1.columns = ['company_key','gender','maritalstatus','dependency','benef_status','networktype','visittype_fob','chronic','claimtype','providertype','religion',
                  'developement','age_group','totalcimaiedam','totalpayable','ben_share_approved','policy_length','LStay','variable']

In [None]:
# Stack the all-data weights and the eight sub-group weights, one row per group
final_weights = pd.concat([weights1,muslim_weights,non_muslim_weights,takaful_weights,commercial_weights,
                           outpat_weights,inpat_weights,dev_weights,undev_weights],axis = 0)
#final_weights

In [None]:
# Index the rows by group label, then transpose so variables are rows and groups are columns.
# This cell is not re-runnable on its own: after the first run 'variable' is no longer a column.
final_weights = final_weights.set_index('variable')
final_weights1 = final_weights.T

In [None]:
#Summary Statistics
#data_shell.summary_stats(undev_scores)
# describe() output of every score frame, placed side by side in the order listed here
Summaries = pd.concat(
    [data_shell.summary_stats(score_frame)
     for score_frame in (muslim_scores, non_muslim_scores,
                         dev_scores, undev_scores,
                         commercial_scores, takaful_scores,
                         inpat_scores, outpat_scores,
                         Scores1)],
    axis=1)
#Summaries

In [None]:
# Column labels, in the same order in which the score frames were concatenated above
Summaries.columns = ['Muslim Scores','Non Muslim scores','Developed Scores','Undeveloped Scores',
                     'Commercial Scores','Takaful Scores','Inpatient Scores','Outpatient Scores','All data']
#Summaries

In [None]:
# t-test for the entire group (including suspicious and non-suspicious scores).
# Each call prints a scipy ttest_ind result for one pair of sub-groups.
data_shell.statistical_sig_test(muslim_scores,non_muslim_scores)
data_shell.statistical_sig_test(takaful_scores,commercial_scores)
data_shell.statistical_sig_test(dev_scores,undev_scores)
data_shell.statistical_sig_test(outpat_scores,inpat_scores)

In [None]:
# Suspicious claim analysis for muslim vs non muslims:
# keep only the scores below 0 (and below -1) in each group, then t-test the two groups.
muslim_zero_scores = data_shell.sep_data_for_ttest(muslim_scores,0)
muslim_one_scores = data_shell.sep_data_for_ttest(muslim_scores,-1)
#muslim_two_scores = data_shell.sep_data_for_ttest(muslim_scores,-2)
non_muslim_zero_scores = data_shell.sep_data_for_ttest(non_muslim_scores,0)
non_muslim_one_scores = data_shell.sep_data_for_ttest(non_muslim_scores,-1)
#non_muslim_two_scores = data_shell.sep_data_for_ttest(non_muslim_scores,-2)
data_shell.statistical_sig_test(muslim_zero_scores,non_muslim_zero_scores)
data_shell.statistical_sig_test(muslim_one_scores,non_muslim_one_scores)
#data_shell.statistical_sig_test(muslim_two_scores,non_muslim_two_scores)

In [None]:
# Suspicious claim analysis for takaful vs commercial:
# only the below-0 threshold is active here; the -1 and -2 variants are commented out.
takaful_zero_scores = data_shell.sep_data_for_ttest(takaful_scores,0)
#takaful_one_scores = data_shell.sep_data_for_ttest(takaful_scores,-1)
#takaful_two_scores = data_shell.sep_data_for_ttest(takaful_scores,-2)
commercial_zero_scores = data_shell.sep_data_for_ttest(commercial_scores,0)
#commercial_one_scores = data_shell.sep_data_for_ttest(commercial_scores,-1)
#commercial_two_scores = data_shell.sep_data_for_ttest(commercial_scores,-2)
data_shell.statistical_sig_test(takaful_zero_scores,commercial_zero_scores)
#data_shell.statistical_sig_test(takaful_one_scores,commercial_one_scores)
#data_shell.statistical_sig_test(takaful_two_scores,commercial_two_scores)

In [None]:
# Suspicious claim analysis for developed vs undeveloped:
# keep only the scores below 0 (and below -1) in each group, then t-test the two groups.
dev_zero_scores = data_shell.sep_data_for_ttest(dev_scores,0)
dev_one_scores = data_shell.sep_data_for_ttest(dev_scores,-1)
#dev_two_scores = data_shell.sep_data_for_ttest(dev_scores,-2)
undev_zero_scores = data_shell.sep_data_for_ttest(undev_scores,0)
undev_one_scores = data_shell.sep_data_for_ttest(undev_scores,-1)
#undev_two_scores = data_shell.sep_data_for_ttest(undev_scores,-2)
data_shell.statistical_sig_test(dev_zero_scores,undev_zero_scores)
data_shell.statistical_sig_test(dev_one_scores,undev_one_scores)
#data_shell.statistical_sig_test(dev_two_scores,undev_two_scores)

In [None]:

# Suspicious claim analysis for in-patient vs out-patient:
# keep only the scores below 0 (and below -1) in each group, then t-test the two groups.
inpat_zero_scores = data_shell.sep_data_for_ttest(inpat_scores,0)
inpat_one_scores = data_shell.sep_data_for_ttest(inpat_scores,-1)
#inpat_two_scores = data_shell.sep_data_for_ttest(inpat_scores,-2)
outpat_zero_scores = data_shell.sep_data_for_ttest(outpat_scores,0)
outpat_one_scores = data_shell.sep_data_for_ttest(outpat_scores,-1)
#outpat_two_scores = data_shell.sep_data_for_ttest(outpat_scores,-2)
data_shell.statistical_sig_test(inpat_zero_scores,outpat_zero_scores)
data_shell.statistical_sig_test(inpat_one_scores,outpat_one_scores)
#data_shell.statistical_sig_test(inpat_two_scores,outpat_two_scores)

In [None]:
# Creating summary stats for suspicious claims: one row of size statistics per group,
# stacked into a single table indexed by group label
mus_sus = data_shell.suspicious_claims_stats(muslim_scores,'muslims')
non_mus_sus = data_shell.suspicious_claims_stats(non_muslim_scores,'non-muslims')
takaful_sus = data_shell.suspicious_claims_stats(takaful_scores,'takaful')
commercial_sus = data_shell.suspicious_claims_stats(commercial_scores,'commercial')
dev_sus = data_shell.suspicious_claims_stats(dev_scores,'developed')
undev_sus = data_shell.suspicious_claims_stats(undev_scores,'undeveloped')
inpat_sus = data_shell.suspicious_claims_stats(inpat_scores,'inpatient')
outpat_sus = data_shell.suspicious_claims_stats(outpat_scores,'outpatient')
all_sus = data_shell.suspicious_claims_stats(Scores1,'All data')
df_sus = pd.concat([mus_sus,non_mus_sus,takaful_sus,commercial_sus,dev_sus,undev_sus,inpat_sus,outpat_sus,all_sus],axis=0)
#df_sus

In [None]:
# Lower / upper percentile scores per group (one row per group, stacked into one table).
# The all-data row uses the label 'All data', the same label the suspicious-claims table
# uses, so the two tables can be matched on their index.
mus_p = data_shell.get_percentile(muslim_scores,'muslims')
non_mus_p = data_shell.get_percentile(non_muslim_scores,'non-muslims')
takaful_p = data_shell.get_percentile(takaful_scores,'takaful')
commercial_p = data_shell.get_percentile(commercial_scores,'commercial')
dev_p = data_shell.get_percentile(dev_scores,'developed')
undev_p = data_shell.get_percentile(undev_scores,'undeveloped')
inpat_p = data_shell.get_percentile(inpat_scores,'inpatient')
outpat_p = data_shell.get_percentile(outpat_scores,'outpatient')
all_d = data_shell.get_percentile(Scores1,'All data')
df_percentile = pd.concat([mus_p,non_mus_p,takaful_p,commercial_p,dev_p,undev_p,inpat_p,outpat_p,all_d],axis=0)
#df_percentile

In [None]:
# Overlaid histograms of the undeveloped vs developed sub-group scores
data_shell.plot_hist_groups(undev_scores,dev_scores,'Undeveloped Scores','Developed Scores')

In [None]:
# Overlaid density estimates of the undeveloped vs developed sub-group scores
data_shell.plot_kde_groups(undev_scores,dev_scores,'Undeveloped Scores','Developed Scores')

In [None]:
# Overlaid histograms of the muslim vs non-muslim sub-group scores
data_shell.plot_hist_groups(muslim_scores,non_muslim_scores,'Muslim Scores','Non-Muslim Scores')

In [None]:
# Overlaid density estimates of the muslim vs non-muslim sub-group scores
data_shell.plot_kde_groups(muslim_scores,non_muslim_scores,'Muslim Scores','Non-Muslim Scores')

In [None]:
# Overlaid histograms of the takaful vs commercial sub-group scores
data_shell.plot_hist_groups(takaful_scores,commercial_scores,'Takaful Scores','Commercial Scores')

In [None]:
# Overlaid density estimates of the takaful vs commercial sub-group scores
data_shell.plot_kde_groups(takaful_scores,commercial_scores,'Takaful Scores','Commercial Scores')

In [None]:
# Overlaid histograms of the inpatient vs outpatient sub-group scores
data_shell.plot_hist_groups(inpat_scores,outpat_scores,'Inpatient Scores','Outpatient Scores')

In [None]:
# Overlaid density estimates of the inpatient vs outpatient sub-group scores
data_shell.plot_kde_groups(inpat_scores,outpat_scores,'Inpatient Scores','Outpatient Scores')