In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import warnings     # for supressing a warning when importing large files
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from collections import Counter
from sklearn.metrics import confusion_matrix,roc_curve,accuracy_score,roc_auc_score,classification_report
import pickle
from scipy import stats
import time
from sklearn.model_selection import GridSearchCV,KFold

from pylab import rcParams

%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load Train Dataset

Train=pd.read_csv("/kaggle/input/healthcare-provider-fraud-detection-analysis/Train-1542865627584.csv")
Train_Beneficiarydata=pd.read_csv("/kaggle/input/healthcare-provider-fraud-detection-analysis/Train_Beneficiarydata-1542865627584.csv")
Train_Inpatientdata=pd.read_csv("/kaggle/input/healthcare-provider-fraud-detection-analysis/Train_Inpatientdata-1542865627584.csv")
Train_Outpatientdata=pd.read_csv("/kaggle/input/healthcare-provider-fraud-detection-analysis/Train_Outpatientdata-1542865627584.csv")

# Load Test Dataset

Test=pd.read_csv("/kaggle/input/healthcare-provider-fraud-detection-analysis/Test-1542969243754.csv")
Test_Beneficiarydata=pd.read_csv("/kaggle/input/healthcare-provider-fraud-detection-analysis/Test_Beneficiarydata-1542969243754.csv")
Test_Inpatientdata=pd.read_csv("/kaggle/input/healthcare-provider-fraud-detection-analysis/Test_Inpatientdata-1542969243754.csv")
Test_Outpatientdata=pd.read_csv("/kaggle/input/healthcare-provider-fraud-detection-analysis/Test_Outpatientdata-1542969243754.csv")

In [None]:
## Lets Check Shape of datasets 

print('Shape of Train data :',Train.shape)
print('Shape of Train_Beneficiarydata data :',Train_Beneficiarydata.shape)
print('Shape of Train_Inpatientdata data :',Train_Inpatientdata.shape)
print('Shape of Train_Outpatientdata data :',Train_Outpatientdata.shape)

print('Shape of Test data :',Test.shape)
print('Shape of Test_Beneficiarydata data :',Test_Beneficiarydata.shape)
print('Shape of Test_Inpatientdata data :',Test_Inpatientdata.shape)
print('Shape of Test_Outpatientdata data :',Test_Outpatientdata.shape)

#### Train and Test Dataset understanding

In [None]:
print('\033[1m'"Train Dataset"+ "\033[0m","\n",Train.head(4),'\n')

print('\033[1m'+"Test Dataset"+ "\033[0m")

print(Test.head(4)) # We don't have Target Variable Fraud in the test dataset and this target variable we need to predict

In [None]:
#To Check the summary of the train dataset

Train.describe()

In [None]:
## Check whether provider IDs are unique in the train data and count missing values
## in the provider-level Train and Test tables.
# Top 5 provider frequencies: any count above 1 would indicate a duplicated provider row.
print(Train.Provider.value_counts(sort=True,ascending=False).head(5))

print('\n Total missing values in Train :',Train.isna().sum().sum())

# Fixed label: this line reports the Test table (it was previously labelled "Train").
print('\n Total missing values in Test :',Test.isna().sum().sum())

### Data Preprocessing on Beneficiary Dataset

In [None]:
print('\033[1m'+"Train Dataset"+ "\033[0m")

display(Train_Beneficiarydata.head(5))

print('\033[1m'+"Test Dataset"+ "\033[0m")

display(Test_Beneficiarydata.head(5))

In [None]:
# Count missing values per column in the Train and Test beneficiary tables.


print('\033[1m'+"Train Beneficiary Dataset"+ "\033[0m")

print(Train_Beneficiarydata.isna().sum())

print('\033[1m'+"Test Beneficiary Dataset"+ "\033[0m")

# Fixed: the Test heading previously printed the Train table a second time.
print(Test_Beneficiarydata.isna().sum())

In [None]:
# Lets check data types of each column in beneficiary data

Train_Beneficiarydata.dtypes


In [None]:
Train_Beneficiarydata.describe(include='all')

In [None]:
## Recode the chronic-condition flags so that "No" is 0 instead of 2 ("Yes" stays 1),
## and turn the RenalDiseaseIndicator value 'Y' into 1. The same recoding is applied
## to the Train and the Test beneficiary tables.

_chronic_flag_columns = [
    'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
    'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
    'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
    'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
]
# Per-column mapping "value to look for" -> here always 2; the replacement value is passed separately.
_value_two_per_column = {column: 2 for column in _chronic_flag_columns}
_renal_yes = {'RenalDiseaseIndicator': 'Y'}

Train_Beneficiarydata = Train_Beneficiarydata.replace(_value_two_per_column, 0)
Train_Beneficiarydata = Train_Beneficiarydata.replace(_renal_yes, 1)

Test_Beneficiarydata = Test_Beneficiarydata.replace(_value_two_per_column, 0)
Test_Beneficiarydata = Test_Beneficiarydata.replace(_renal_yes, 1)

#### Feature Engineering on Beneficiary Dataset

In [None]:
## Create an Age column (in whole years) in the Train and Test beneficiary tables.
## Age is measured from date of birth to date of death; rows without a DOD get NaN here
## and are filled in by a later cell.

for _bene in (Train_Beneficiarydata, Test_Beneficiarydata):
    _bene['DOB'] = pd.to_datetime(_bene['DOB'])
    # Missing DOD values parse to NaT. The deprecated errors='ignore' was dropped: it would
    # return the column unparsed on a bad value and make the subtraction below fail instead.
    _bene['DOD'] = pd.to_datetime(_bene['DOD'])
    # Approximate years as days / 365, rounded to the nearest whole year.
    _bene['Age'] = round(((_bene['DOD'] - _bene['DOB']).dt.days)/365)

In [None]:
Train_Beneficiarydata.head(10)

In [None]:
## Age is NaN wherever DOD is missing. For those beneficiaries, compute age as of a fixed
## reference date instead.
## NOTE(review): the reference date 2017-12-01 is the notebook author's choice (stated to be
## the last DOD value in the data) — confirm it against the actual maximum DOD.

_age_reference_date = pd.to_datetime('2017-12-01')

# Assign the filled Series back to the column. The previous `df.Age.fillna(..., inplace=True)`
# was a chained in-place update, which does not reliably modify the parent DataFrame.
Train_Beneficiarydata['Age'] = Train_Beneficiarydata['Age'].fillna(
    round(((_age_reference_date - Train_Beneficiarydata['DOB']).dt.days)/365))

Test_Beneficiarydata['Age'] = Test_Beneficiarydata['Age'].fillna(
    round(((_age_reference_date - Test_Beneficiarydata['DOB']).dt.days)/365))

In [None]:
Train_Beneficiarydata.head(5)


#### Add flag column 'WhetherDead' using DOD values to tell whether the beneficiary is dead or not

In [None]:
# Add the flag column 'WhetherDead': 1 when a date of death is recorded, 0 when DOD is missing.
# The same two assignments are applied to the Train table first and then to the Test table.

for _bene in (Train_Beneficiarydata, Test_Beneficiarydata):
    _bene.loc[_bene.DOD.isna(), 'WhetherDead'] = 0
    _bene.loc[_bene.DOD.notna(), 'WhetherDead'] = 1


In [None]:
# Preview the first 7 values of the new 'WhetherDead' flag in both beneficiary tables.
print('\033[1m'+"Train Dataset"+ "\033[0m")

print(Train_Beneficiarydata.loc[:,'WhetherDead'].head(7))

print('\033[1m'+"Test Dataset"+ "\033[0m")

# Fixed: the Test heading previously printed the Train column again.
print(Test_Beneficiarydata.loc[:,'WhetherDead'].head(7))

### Data Preprocessing on Inpatient Dataset

In [None]:
# Preview the first rows of the Train and Test inpatient claim tables.

print('\033[1m'+"Train Inpatient Dataset"+ "\033[0m")

display(Train_Inpatientdata.head(5))

print('\033[1m'+"Test Inpatient Dataset"+ "\033[0m")

# Fixed: the Test heading previously displayed the Train table again.
display(Test_Inpatientdata.head(5))

In [None]:
#Lets check missing values in each column in inpatient data

print('\033[1m'+"Train Inpatient Dataset"+ "\033[0m")

print(Train_Inpatientdata.isna().sum())

print('\033[1m'+"Test Inpatient Dataset"+ "\033[0m")

print(Test_Inpatientdata.isna().sum())

#### Feature Engineering on Inpatient Dataset

Create new column 'AdmitForDays' indicating number of days patient was admitted in hospital

In [None]:
## Derive 'AdmitForDays', the length of each inpatient stay in days.
## 1 is added to the discharge/admission difference so that a same-day stay counts as 1 day.

for _inpatient in (Train_Inpatientdata, Test_Inpatientdata):
    _inpatient['AdmissionDt'] = pd.to_datetime(_inpatient['AdmissionDt'])
    _inpatient['DischargeDt'] = pd.to_datetime(_inpatient['DischargeDt'])
    _stay = _inpatient['DischargeDt'] - _inpatient['AdmissionDt']
    # The absolute value is taken before adding 1, so a negative difference is counted as positive.
    _inpatient['AdmitForDays'] = (_stay.dt.days.abs())+1

In [None]:
Train_Inpatientdata.loc[:,['AdmissionDt','DischargeDt','AdmitForDays']]

In [None]:
## Lets check Min and Max values of AdmitforDays column in Train and Test.
print('Min AdmitForDays Train:- ',Train_Inpatientdata.AdmitForDays.min())
print('Max AdmitForDays Train:- ',Train_Inpatientdata.AdmitForDays.max())
print(Train_Inpatientdata.AdmitForDays.isnull().sum() )  #Check Null values.

print('Min AdmitForDays Test:- ',Test_Inpatientdata.AdmitForDays.min())
print('Max AdmitForDays Test:- ',Test_Inpatientdata.AdmitForDays.max())
print(Test_Inpatientdata.AdmitForDays.isnull().sum())   #Check Null values.

### Data Preprocessing on Outpatient Dataset

In [None]:
# Preview the first rows of the Train and Test outpatient claim tables.

print('\033[1m'+"Train Outpatient Dataset"+ "\033[0m")

display(Train_Outpatientdata.head(5))

print('\033[1m'+"Test Outpatient Dataset"+ "\033[0m")

# Fixed: the Test heading previously displayed the Train table again.
display(Test_Outpatientdata.head(5))

In [None]:
# Lets check the null values in each column of Outpatient Dataset

print('\033[1m'+"Train Outpatient Dataset"+ "\033[0m")

print(Train_Outpatientdata.isna().sum())

print('\033[1m'+"Test Outpatient Dataset"+ "\033[0m")

print(Test_Outpatientdata.isna().sum())

In [None]:
## Re-check the shape of every table after the new variables were added.

print('Shape of Train data :',Train.shape)
print('Shape of Train_Beneficiarydata data :',Train_Beneficiarydata.shape)
print('Shape of Train_Inpatientdata data :',Train_Inpatientdata.shape)
print('Shape of Train_Outpatientdata data :',Train_Outpatientdata.shape)

print('Shape of Test data :',Test.shape)
print('Shape of Test_Beneficiarydata data :',Test_Beneficiarydata.shape)
print('Shape of Test_Inpatientdata data :',Test_Inpatientdata.shape)
# Added: the Test outpatient table was missing here, unlike the earlier shape check.
print('Shape of Test_Outpatientdata data :',Test_Outpatientdata.shape)
### Merge Beneficiary, Inpatient and Outpatient Dataset into a single dataset 


#### Merging of Train Datasets 

In [None]:

# Columns present in both the outpatient and the inpatient tables; these are the join keys.
Train_patient_merge_id = [column for column in Train_Outpatientdata.columns
                          if column in Train_Inpatientdata.columns]

# Outer-join inpatient and outpatient claims on the shared columns so every claim is kept,
# then attach beneficiary attributes with an inner join on BeneID.
_train_claims = pd.merge(Train_Inpatientdata, Train_Outpatientdata,
                         on = Train_patient_merge_id,
                         how = 'outer')
Train_Patient_data = _train_claims.merge(Train_Beneficiarydata, on='BeneID', how='inner')

#### Merging of Test Dataset

In [None]:
# Columns present in both the outpatient and the inpatient tables; these are the join keys.
Test_patient_merge_id = [column for column in Test_Outpatientdata.columns
                         if column in Test_Inpatientdata.columns]

# Outer-join inpatient and outpatient claims on the shared columns so every claim is kept,
# then attach beneficiary attributes with an inner join on BeneID.
_test_claims = pd.merge(Test_Inpatientdata, Test_Outpatientdata,
                        on = Test_patient_merge_id,
                        how = 'outer')
Test_Patient_data = _test_claims.merge(Test_Beneficiarydata, on='BeneID', how='inner')

In [None]:
# Shape of Merging Dataset 

print("Train Dataset Shape after merge:",Train_Patient_data.shape)

print("Test Dataset Shape after merge:",Test_Patient_data.shape)


### Exploratory Data Analysis on Train_Patient_data dataset

In [None]:
Train_Patient_data.info()

#### Handling Missing values 

In [None]:
# To check the number of missing values in the Train_Pateint_data

Train_Patient_data.isnull().sum()

In [None]:
### There are missing values in AttendingPhysician, OperatingPhysician and OtherPhysician columns, so we need to handle these varaibles 

Train_Patient_data[['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']]



In [None]:
Train_Patient_data[['AttendingPhysician','OperatingPhysician', 'OtherPhysician']].describe()

In [None]:
## We are replacing these columns value with 0 and 1 where we have value we are replacing it with 1 and in place of null value we replace it with 0.


Train_Patient_data[['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']] = np.where(Train_Patient_data[['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']].isnull(), 0, 1)

In [None]:
Train_Patient_data[['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']]

In [None]:
### Add a new variable in which it tells us how many total types of physicians used for the particular claim or patient.


Train_Patient_data['N_Types_Physicians'] = Train_Patient_data['AttendingPhysician'] +  Train_Patient_data['OperatingPhysician'] + Train_Patient_data['OtherPhysician']

Train_Patient_data['N_Types_Physicians']

In [None]:
Train_Patient_data.isnull().sum() #We can see here new variable "N_Type_Physicians" is added

In [None]:
### Handling Missing values on"DiagnosisGroupCode"

Train_Patient_data['DiagnosisGroupCode'].describe()

In [None]:
# Here we are finding out each DignosisGroupCode Count

Count_DiagnosisGroupCode=Train_Patient_data['DiagnosisGroupCode'].value_counts()
Count_DiagnosisGroupCode=Count_DiagnosisGroupCode[:20] # To show only top 20 codes 
Count_DiagnosisGroupCode

In [None]:
### Bar chart of the top 20 DiagnosisGroupCode values (code on x, claim count on y).

fig=plt.figure(figsize=(20,8))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=Count_DiagnosisGroupCode.index, y=Count_DiagnosisGroupCode.values)
fig.tight_layout()

## Author's observation from the original run: DiagnosisGroupCode 882 has the maximum count (179).

In [None]:
## Since in this columns we have maximum values as null, so we are handling this by creating a new column
## so we are creating a new variable/column "IsDiagnosisCode" in which value will either "1" or "0" 
## if in a claim there is a groupDiagnosiscode  has null value then in "IsDiagnosisCode" column value is 0 otherwise 1

Train_Patient_data['IsDiagnosisCode'] = np.where(Train_Patient_data.DiagnosisGroupCode.notnull(), 1, 0)
Train_Patient_data = Train_Patient_data.drop(['DiagnosisGroupCode'], axis = 1) # We are droping the column "DiagnosisGroupCode"



In [None]:
Train_Patient_data['IsDiagnosisCode']

In [None]:
### Handling missing values for "DeductibleamtPaid" column

Train_Patient_data['DeductibleAmtPaid'].isnull().sum()  #Check number of missing values in this variable

In [None]:
# Describing this column by omiting the Nan, to check mean , variance , skewness etc

sc.stats.describe(Train_Patient_data['DeductibleAmtPaid'],nan_policy='omit')

In [None]:
## Count plot of "DeductibleAmtPaid". Author's observation: most values are 0.

fig=plt.figure(figsize=(15,10))
# Pass the Series as x explicitly; newer seaborn treats a lone positional argument as `data`.
sns.countplot(x=Train_Patient_data['DeductibleAmtPaid'])


In [None]:
## Box plot of "DeductibleAmtPaid". Author's observation: most values are 0.

fig=plt.figure(figsize=(8,6))
# Pass the Series as x explicitly; newer seaborn treats a lone positional argument as `data`.
sns.boxplot(x=Train_Patient_data['DeductibleAmtPaid'])
fig.tight_layout()

In [None]:
## Based on the analysis above, missing DeductibleAmtPaid values are replaced with 0.

# Assign the result back: a chained `df[col].fillna(..., inplace=True)` does not reliably
# modify the parent DataFrame.
Train_Patient_data['DeductibleAmtPaid'] = Train_Patient_data['DeductibleAmtPaid'].fillna(0)

In [None]:
### We are also creating one new variable "IsDeductibleAmtPaid" which tells us that particular claim has any DeductibleAmtPaid or not

Train_Patient_data['IsDeductibleAmtPaid']=np.where(Train_Patient_data['DeductibleAmtPaid']==0,0,1) 

In [None]:
# Count plot of the "IsDeductibleAmtPaid" flag, followed by the exact counts.
# Author's observation: most claims have no "DeductibleAmtPaid".

fig=plt.figure(figsize=(8,6))
# Pass the Series as x explicitly; newer seaborn treats a lone positional argument as `data`.
sns.countplot(x=Train_Patient_data['IsDeductibleAmtPaid'])

print(Train_Patient_data['IsDeductibleAmtPaid'].value_counts())

In [None]:
### Handling missing values for "AdmitForDays" column

Train_Patient_data['AdmitForDays'].isnull().sum() # Count of missing values in this column


In [None]:
# Missing AdmitForDays values are set to 0; the author treats these rows as patients who
# were not admitted to hospital.

# Assign the result back: a chained `df[col].fillna(..., inplace=True)` does not reliably
# modify the parent DataFrame.
Train_Patient_data['AdmitForDays'] = Train_Patient_data['AdmitForDays'].fillna(0)


In [None]:
Train_Patient_data['AdmitForDays'].isnull().sum()

In [None]:
#In this dataset now we have some Date columns in which missing values are there, which we do not need to handle and we can drop those columns also. 

Train_Patient_data.isnull().sum() 


# Now we need to handle missing values of ClmDiagnosisCodes and ClmProcedureCode columns 

In [None]:
## First we handle ClmProcedureCodes variables 

ClmProcedure_vars = ['ClmProcedureCode_{}'.format(x) for x in range(1,7)]
ClmProcedure_vars

In [None]:
Train_Patient_data[ClmProcedure_vars]

In [None]:
## To Check how many null values are in each Clmprocedurecodes
## By this we find out that in code_6 column all are Nan values 

Train_Patient_data[ClmProcedure_vars].isnull().sum()

In [None]:
Train_Patient_data[ClmProcedure_vars].describe()

In [None]:
# Count, for every row of a DataFrame, how many distinct non-null values it holds.
def N_unique_values(df):
    """Return a NumPy array with the number of distinct non-null values in each row of `df`.

    Null entries (as detected by `pd.isnull`) are ignored, so a row made up only of
    nulls yields 0. The result has one element per row, in row order.
    """
    distinct_per_row = []
    for row in df.values:
        non_null = row[pd.notnull(row)]
        distinct_per_row.append(len(set(non_null)))
    return np.array(distinct_per_row)

In [None]:
# We count the number of procedureCode for each claim and store these value in a new variable
Train_Patient_data['N_Procedure'] = N_unique_values(Train_Patient_data[ClmProcedure_vars])

## So from here we get to know that 534901 claims/records has 0 claim procedure codes, 17820 claims/records has 1 claimprocedurecodes and so on

Train_Patient_data['N_Procedure'].value_counts()

In [None]:
### Handling of 'ClmDiagnosisCode'

# We count the number of claims
ClmDiagnosisCode_vars =['ClmAdmitDiagnosisCode'] + ['ClmDiagnosisCode_{}'.format(x) for x in range(1, 11)]


ClmDiagnosisCode_vars

In [None]:
# We count the number of CLMDiagnosisCode for each claim and store these value in a new variable

Train_Patient_data['N_UniqueDiagnosis_Claims'] = N_unique_values(Train_Patient_data[ClmDiagnosisCode_vars])


Train_Patient_data['N_UniqueDiagnosis_Claims'].value_counts()

#### EDA on other remaining variables 

#### 1.Gender

In [None]:
Train_Patient_data.Gender.describe()  

In [None]:
Train_Patient_data.Gender.value_counts() # here we have only 1 and 2, so we can change it to binary as 0 or 1 

In [None]:
# Make Gender binary by replacing the code 2 with 0 (1 stays 1).
Train_Patient_data['Gender']=Train_Patient_data['Gender'].replace(2,0)



## Count plot of the Gender column. The notebook treats 0 as Female and 1 as Male
## (TODO confirm against the data dictionary).

fig=plt.figure(figsize=(8,6))
# Pass the Series as x explicitly; newer seaborn treats a lone positional argument as `data`.
sns.countplot(x=Train_Patient_data['Gender'])
fig.tight_layout()

#### 2.Race

In [None]:
Train_Patient_data['Race'].describe()

In [None]:
### Count plot of the Race variable.
### Author's observation: the majority of claims are from Race 1.
fig=plt.figure(figsize=(8,6))
# Pass the Series as x explicitly; newer seaborn treats a lone positional argument as `data`.
sns.countplot(x=Train_Patient_data['Race'])
fig.tight_layout()

In [None]:
### Now in Race column we do 'one hot encoding' so that ranking of values doesn't occur here 

from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
x = onehotencoder.fit_transform(Train_Patient_data.Race.values.reshape(-1, 1)).toarray()

In [None]:
# Wrap the dense one-hot matrix `x` in a DataFrame with one indicator column per Race category.
# NOTE(review): the labels are hard-coded as Race_1..Race_4 rather than read from
# `onehotencoder.categories_`. This presumes the encoder found exactly four categories and that
# they correspond to codes 1-4 in that order — confirm against the actual Race codes.
df_OneHot = pd.DataFrame(x, columns = ["Race_"+str(int(i)) for i in range(1,5)]) 
df_OneHot

In [None]:
df_OneHot.drop('Race_1',axis=1,inplace=True) ## Drop the first column "Race_1" this we need to drop when we do oneHotEncoding
df_OneHot

In [None]:
## Concatenation of dataframe "df_oneHot" that we created above in our main dataset

Train_Patient_data = pd.concat([Train_Patient_data, df_OneHot], axis=1)


Train_Patient_data.drop(['Race'], axis=1,inplace=True)  #So now we do not need this race column so we are droping this also 

#### 3. RenalDiseaseIndicator

In [None]:
Train_Patient_data['RenalDiseaseIndicator'].describe()

In [None]:
## Count plot of the "RenalDiseaseIndicator" variable.
## Author's observation: most records do not have renal disease.
fig=plt.figure(figsize=(8,6))
# Pass the Series as x explicitly; newer seaborn treats a lone positional argument as `data`.
sns.countplot(x=Train_Patient_data['RenalDiseaseIndicator'])
fig.tight_layout()

In [None]:
Train_Patient_data['RenalDiseaseIndicator']=Train_Patient_data.RenalDiseaseIndicator.astype(int) # Change of datatype from object to int


Train_Patient_data['RenalDiseaseIndicator'].describe()

#### 4. State and County

In [None]:
Train_Patient_data[['State','County']].describe()

In [None]:
#Find out which state has maximum count of claims

state_count=Train_Patient_data['State'].value_counts()
state_count=state_count[:20]
state_count

In [None]:
## Bar chart of the 20 states with the most claims, ordered by claim count.

## Author's observation: state code 5 has the maximum number of claims.

fig=plt.figure(figsize=(10,6))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=state_count.index, y=state_count.values, order=state_count.index)
fig.tight_layout()

In [None]:
#Find out which County has maximum count of claims
county_count=Train_Patient_data['County'].value_counts()
county_count=county_count[:20]
county_count

In [None]:
## Bar chart of the 20 counties with the most claims, ordered by claim count.

## Author's observation: county code 200 has the maximum number of claims.


fig=plt.figure(figsize=(12,6))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=county_count.index, y=county_count.values, order=county_count.index)
fig.tight_layout()

In [None]:
#### 5. Chronic_cond

In [None]:
## Visualization of the ChronicCond variables: one count plot per chronic-condition flag.

## Each panel shows how many claims have (1) or do not have (0) the condition. Author's
## observation from the original run: for ChronicCond_Alzheimer, more than 3 lakh claims do not
## have it and roughly 2 lakh do.

# Select the flag columns by name instead of by a hard-coded position (previously
# `iloc[:, 37+col]`), so the plot keeps working if columns are added or removed upstream.
_chronic_columns = [column for column in Train_Patient_data.columns
                    if column.startswith('ChronicCond_')]
# Two panels per row; round the row count up so an odd number of columns still fits.
_n_rows = (len(_chronic_columns) + 1) // 2

fig=plt.figure(figsize=(20,20))

for _position, _column in enumerate(_chronic_columns, start=1):
    plt.subplot(_n_rows, 2, _position)
    # Pass the Series as x explicitly; newer seaborn treats a lone positional argument as `data`.
    sns.countplot(x=Train_Patient_data[_column])

fig.tight_layout()

#### Boxplots of some numerical features to check the distribution of data 

In [None]:
## Box plot of "IPAnnualReimbursementAmt". Author's observation: the data is not normally
## distributed and is skewed (the original note says left-skewed — TODO confirm the direction).

fig=plt.figure(figsize=(8,6))
# Pass the Series as x explicitly; newer seaborn treats a lone positional argument as `data`.
sns.boxplot(x=Train_Patient_data['IPAnnualReimbursementAmt'])
fig.tight_layout()

In [None]:

## Box plot of "IPAnnualDeductibleAmt". Author's observation: the data is not normally
## distributed and is skewed (the original note says left-skewed — TODO confirm the direction).


fig=plt.figure(figsize=(8,6))
# Pass the Series as x explicitly; newer seaborn treats a lone positional argument as `data`.
sns.boxplot(x=Train_Patient_data['IPAnnualDeductibleAmt'])
fig.tight_layout()

### Handling  Missing values and add new features in Test_Patient_data

In [None]:
Test_Patient_data.isnull().sum()

In [None]:
## We are replacing these columns value with 0 and 1 where we have value we are replacing it with 1 and in place of null value we replace it with 0.


Test_Patient_data[['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']] = np.where(Test_Patient_data[['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']].isnull(), 0, 1)

Test_Patient_data['N_Types_Physicians'] = Test_Patient_data['AttendingPhysician'] +  Test_Patient_data['OperatingPhysician'] + Test_Patient_data['OtherPhysician']

In [None]:
Test_Patient_data['IsDiagnosisCode'] = np.where(Test_Patient_data.DiagnosisGroupCode.notnull(), 1, 0)
Test_Patient_data = Test_Patient_data.drop(['DiagnosisGroupCode'], axis = 1)

In [None]:
Test_Patient_data.isnull().sum()

In [None]:
Test_Patient_data['DeductibleAmtPaid'].describe()

In [None]:
# Same treatment as the Train table: missing DeductibleAmtPaid becomes 0.
# Assign the result back: a chained `df[col].fillna(..., inplace=True)` does not reliably
# modify the parent DataFrame.
Test_Patient_data['DeductibleAmtPaid'] = Test_Patient_data['DeductibleAmtPaid'].fillna(0)

# Flag whether the claim has any deductible amount paid (0 = none, 1 = some).
Test_Patient_data['IsDeductibleAmtPaid']=np.where(Test_Patient_data['DeductibleAmtPaid']==0,0,1) 


Test_Patient_data['IsDeductibleAmtPaid'].value_counts()

In [None]:
Test_Patient_data['AdmitForDays'].isnull().sum()

In [None]:
# Same treatment as the Train table: missing AdmitForDays becomes 0.
# Assign the result back: a chained `df[col].fillna(..., inplace=True)` does not reliably
# modify the parent DataFrame.
Test_Patient_data['AdmitForDays'] = Test_Patient_data['AdmitForDays'].fillna(0)



In [None]:
Test_Patient_data.Gender.describe()

In [None]:
Test_Patient_data['Gender']=Test_Patient_data['Gender'].replace(2,0)



In [None]:
Test_Patient_data['Race'].describe()

In [None]:
# One-hot encode the Race column of the Test table.
# NOTE(review): a new encoder is fitted on the Test data here rather than reusing the one fitted
# on Train, so the category-to-column mapping is only the same if both tables contain the same
# set of Race codes — confirm.
onehotencoder = OneHotEncoder()
x = onehotencoder.fit_transform(Test_Patient_data.Race.values.reshape(-1, 1)).toarray()

# NOTE(review): labels are hard-coded as Race_1..Race_4 rather than read from
# `onehotencoder.categories_`; this presumes exactly four categories in that order — confirm.
df_test_OneHot = pd.DataFrame(x, columns = ["Race_"+str(int(i)) for i in range(1,5)]) 
df_test_OneHot

In [None]:
df_test_OneHot.drop('Race_1',axis=1,inplace=True)


Test_Patient_data = pd.concat([Test_Patient_data, df_test_OneHot], axis=1)

#droping the country column 


In [None]:
Test_Patient_data.drop(['Race'], axis=1,inplace=True) 

In [None]:
Test_Patient_data.info()

In [None]:
Test_Patient_data['RenalDiseaseIndicator'].describe()

In [None]:
Test_Patient_data['RenalDiseaseIndicator']=Test_Patient_data.RenalDiseaseIndicator.astype(int)

In [None]:
Test_Patient_data[ClmProcedure_vars].describe()

In [None]:
# We count the number of procedures for each claim
Test_Patient_data['N_Procedure'] = N_unique_values(Test_Patient_data[ClmProcedure_vars])



In [None]:
Test_Patient_data['N_Procedure'].value_counts()

In [None]:
# We count the number of CLMDiagnosisCode for each claim and store these value in a new variable

Test_Patient_data['N_UniqueDiagnosis_Claims'] = N_unique_values(Test_Patient_data[ClmDiagnosisCode_vars])

Test_Patient_data['N_UniqueDiagnosis_Claims'].value_counts()

In [None]:
print('\033[1m'+"Train Patient Dataset"+ "\033[0m")

print(Train_Patient_data.info())

print('\033[1m'+"Test Patient Dataset"+ "\033[0m")

print(Test_Patient_data.info())

## Merging of Train and Test dataframe with Train_Patient_data and Test_Patient_data respectively to create a Final Dataframe for Train and Test for modelling  

In [None]:
### Per-provider record counts: the number of distinct beneficiaries (BeneID) and distinct
### claims (ClaimID) seen for each provider, with Provider restored as a regular column.

_count_column_names = {'BeneID':'BeneID_count','ClaimID':'ClaimID_count'}

## For Train
_train_ids = Train_Patient_data[['BeneID', 'ClaimID']]
Train_Count = _train_ids.groupby(Train_Patient_data['Provider']).nunique().reset_index()
Train_Count = Train_Count.rename(columns=_count_column_names)

## For Test
_test_ids = Test_Patient_data[['BeneID', 'ClaimID']]
Test_Count = _test_ids.groupby(Test_Patient_data['Provider']).nunique().reset_index()
Test_Count = Test_Count.rename(columns=_count_column_names)

In [None]:
Train_Count

In [None]:
Test_Count

In [None]:
# Per-provider totals: sum the claim-level amounts, flags and counts below for every provider.
# The Train and Test tables are aggregated over the same list of columns.
_provider_sum_columns = [
    'InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'RenalDiseaseIndicator',
    'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician', 'AdmitForDays',
    'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_Cancer',
    'ChronicCond_KidneyDisease', 'ChronicCond_ObstrPulmonary',
    'ChronicCond_Depression', 'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
    'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
    'ChronicCond_stroke', 'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'WhetherDead',
    'N_Types_Physicians', 'IsDiagnosisCode', 'N_Procedure', 'N_UniqueDiagnosis_Claims',
]

_train_by_provider = Train_Patient_data.groupby(['Provider'], as_index = False)
Train_Data_Sum = _train_by_provider[_provider_sum_columns].sum()

_test_by_provider = Test_Patient_data.groupby(['Provider'], as_index = False)
Test_Data_Sum = _test_by_provider[_provider_sum_columns].sum()

In [None]:
Train_Data_Sum

In [None]:
Test_Data_Sum

In [None]:
## Here we are calculating the mean of values for some variables for each unique provider.

Train_Data_Mean=round(Train_Patient_data.groupby(['Provider'], as_index = False)[['NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
                                                                            'Age']].mean())


Test_Data_Mean=round(Test_Patient_data.groupby(['Provider'], as_index = False)[['NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
                                                                            'Age']].mean())


In [None]:
Train_Data_Mean

In [None]:
#### Now we merge Count,sum and mean dataframes with the main train dataframe

In [None]:
## Merging of Train Datasets
Train_df=pd.merge(Train_Count,Train_Data_Sum,on='Provider',how='left').\
                merge(Train_Data_Mean,on='Provider',how='left').\
                merge(Train,on='Provider',how='left')

## Merging of Test Datasets

Test_df=pd.merge(Test_Count,Test_Data_Sum,on='Provider',how='left').\
                merge(Test_Data_Mean,on='Provider',how='left').\
                merge(Test,on='Provider',how='left')


In [None]:
Train_df #Target column PotentialFraud is avaialble here

In [None]:
Test_df #Target column PotentialFraud is not avaialble here

In [None]:
Train_df.isnull().sum() ## No null value is present in this dataset 

In [None]:
# Encode the target: "Yes" -> 1, everything else -> 0.
# An already-encoded 1 is accepted too, so the cell is idempotent; re-running the
# original comparison against "Yes" on the 0/1 column would have zeroed every label.
Train_df['PotentialFraud'] = Train_df['PotentialFraud'].isin(["Yes", 1]).astype(int)

In [None]:
# Inspect Train_df after the target column has been encoded as 0/1.
Train_df

In [None]:
# Row count per value of the dependent variable.
plt.figure(figsize=(10,8))
# Pass the Series as x=: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=Train_df.PotentialFraud);

### Bivariate Data Analysis

In [None]:
## Bar plot of PotentialFraud vs BeneID_count; each bar is the mean BeneID_count for that class (0 / 1).
## A taller bar for class 1 means flagged providers have more beneficiaries on average.

plt.figure(figsize=(12,8))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=Train_df["PotentialFraud"], y=Train_df["BeneID_count"], hue=Train_df["PotentialFraud"])
plt.suptitle('PotentialFraud v/s BeneID_count')
plt.xlabel('PotentialFraud')
plt.ylabel('BeneID_count');

As we can see, potentially fraudulent providers have a higher number of beneficiary IDs on average, as they tend to commit fraud across multiple beneficiary IDs.

In [None]:
## Bar plot of PotentialFraud vs ClaimID_count; each bar is the mean ClaimID_count for that class (0 / 1).
## A taller bar for class 1 means flagged providers have more claims on average.

plt.figure(figsize=(12,8))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=Train_df["PotentialFraud"], y=Train_df["ClaimID_count"], hue=Train_df["PotentialFraud"]);

As in the observation above, potentially fraudulent providers tend to have a higher number of claim IDs.

In [None]:
## Bar plot of PotentialFraud vs InscClaimAmtReimbursed; each bar is the class mean (0 / 1).
## A taller bar for class 1 means flagged providers have a higher reimbursed amount on average.

plt.figure(figsize=(12,8))

# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=Train_df["PotentialFraud"], y=Train_df["InscClaimAmtReimbursed"], hue=Train_df["PotentialFraud"]);

In [None]:
# Mean DeductibleAmtPaid per PotentialFraud class.
plt.figure(figsize=(12,8))

# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=Train_df["PotentialFraud"], y=Train_df["DeductibleAmtPaid"], hue=Train_df["PotentialFraud"]);

As observed, both InscClaimAmtReimbursed and DeductibleAmtPaid are much higher on average for potentially fraudulent providers than for legitimate ones.

In [None]:

# Mean RenalDiseaseIndicator per PotentialFraud class.
plt.figure(figsize=(12,8))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=Train_df["PotentialFraud"], y=Train_df["RenalDiseaseIndicator"], hue=Train_df["PotentialFraud"]);

In [None]:

# Mean AdmitForDays per PotentialFraud class.
plt.figure(figsize=(12,8))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=Train_df["PotentialFraud"], y=Train_df["AdmitForDays"], hue=Train_df["PotentialFraud"]);

In [None]:
# Mean WhetherDead per PotentialFraud class.
plt.figure(figsize=(12,8))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=Train_df["PotentialFraud"], y=Train_df["WhetherDead"], hue=Train_df["PotentialFraud"]);

For class 0 the bar lies between 0 and 1, because some beneficiaries are dead and some are alive; for class 1 the bar rises above 3, which suggests that fraudulent claims are more likely to occur where beneficiaries are dead.

### Correlation Matrix

In [None]:
plt.figure(figsize=(12,8))
# Correlate numeric/boolean columns only. Train_df still carries non-numeric columns
# (e.g. the Provider key); pandas >= 2.0 raises on them instead of silently dropping them.
Train_corr = Train_df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(Train_corr);

In [None]:
# Correlation of every numeric feature with the target.
# Non-numeric columns (e.g. the Provider key) are excluded explicitly,
# because pandas >= 2.0 raises on them instead of silently dropping them.
Train_corr = Train_df.select_dtypes(include=["number", "bool"]).corr()
Train_corr['PotentialFraud']

From this we can see that Age, NoOfMonths_PartBCov and NoOfMonths_PartACov show no meaningful relationship with the dependent variable 'PotentialFraud', so we will not use these variables in our model.

We now build the final dataset for modelling, keeping only the variables that the machine learning algorithms will use. From the Train_df dataset we remove all ID-type variables (Provider, BeneID_count and ClaimID_count) as well as the variables that show no relationship with the dependent variable, as seen in the correlation matrix above.

In [None]:
# Drop the first three columns of Train_df by position (per the notes above: Provider and the two count columns).
# .copy() makes df_clf an independent frame, so later edits to it neither touch
# Train_df nor trigger SettingWithCopyWarning.
df_clf = Train_df.iloc[:, 3:].copy()
df_clf

In [None]:
# Rebuild df_clf from Train_df rather than dropping in place: the cell can then be
# re-run safely (an in-place drop raises KeyError the second time) and never mutates a slice.
df_clf = Train_df.iloc[:, 3:].drop(columns=['NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'Age'])

### Final Train Dataset on which we trained our model 

In [None]:
df_clf  # Final training dataset used for modelling

### Final Test Dataset on which we will do final Prediction 

In [None]:
# Provider-level test table before the unused columns are removed.
Test_df

In [None]:
def test(test_data):
    test_data=test_data.iloc[:,3:]
    test_data=test_data.drop(['NoOfMonths_PartACov','NoOfMonths_PartBCov','Age'],axis=1)
    return test_data

In [None]:
# Reduce the test table to the same feature columns as the training set.
Test_data=test(Test_df)
Test_data  # The target variable is absent here; it is predicted once the model has been trained

### Working on our Train Dataset 

In [None]:
# Separate the predictors (x) from the target (y).

x = df_clf.drop(columns="PotentialFraud")
y = df_clf["PotentialFraud"]

In [None]:
# Report the dimensions of the predictor matrix and the target vector.
for label, part in (("Independent Variable shape:", x), ("Dependent Variable shape:", y)):
    print(label, part.shape)

In [None]:
### Hold out 30% of the rows for evaluation (70% train), with a fixed seed.

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=42
)

split_summary = (
    ("Independent variables train:", x_train),
    ("Target variable train:", y_train),
    ("Independent variables test:", x_test),
    ("Target variables test:", y_test),
)
for label, part in split_summary:
    print(label, part.shape)

## Modelling 

In [None]:
# Class balance of the training target ("0" vs "1").
plt.figure(figsize=(10,8))
# Pass the Series as x=: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=y_train);

Since our target variable is imbalanced, we need to apply sampling techniques before modelling.

### Sampling Techniques 

#### 1. Under Sampling

In [None]:
from imblearn.under_sampling import NearMiss

# Under-sample the training split with NearMiss.
# The ratio is passed by keyword (imbalanced-learn constructors are keyword-only in
# current releases), and fit_resample replaces fit_sample, which has been removed.
ns = NearMiss(sampling_strategy=0.8)
x_train_ns, y_train_ns = ns.fit_resample(x_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

In [None]:
# Class balance after under-sampling.
plt.figure(figsize=(10,8))
# Pass the Series as x=: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=y_train_ns);

In [None]:
# Dimensions of the under-sampled training data.
for label, part in (
    ("Shape of independent trained dataset after under sampling", x_train_ns),
    ("Shape of dependent trained dataset after under sampling", y_train_ns),
):
    print(label, part.shape)

### Modelling on Under sampled Dataset  

### Random Forest on Undersampled Data

In [None]:
# Random forest fitted on the under-sampled training data and evaluated on the hold-out split.
# Seeded with RANDOM_SEED so the reported metrics are reproducible on Restart & Run All.
clf = RandomForestClassifier(random_state=RANDOM_SEED)
clf_fit = clf.fit(x_train_ns, y_train_ns)

y_pred_rf = clf_fit.predict(x_test)

In [None]:

# Hold-out metrics for the under-sampled random forest; headings are printed in bold via ANSI codes.
for heading, result in (
    ("Confusion Matrix \n", confusion_matrix(y_test, y_pred_rf)),
    ("\n Accuracy Score \n", accuracy_score(y_test, y_pred_rf)),
    ("\n Classification Report \n", classification_report(y_test, y_pred_rf)),
):
    print('\033[1m' + heading + '\033[0m', result)

Here we can see that this model's accuracy is low. The model is able to classify the "1" class but is not able to classify the "0" class.

### SVM on Undersampled Data 

In [None]:
# Default SVC fitted on the under-sampled training data, then scored on the hold-out split.
clf_svc = SVC()
clf_svc_fit = clf_svc.fit(x_train_ns, y_train_ns)
y_pred_svc = clf_svc_fit.predict(x_test)

In [None]:

# Hold-out metrics for the under-sampled SVM; headings are printed in bold via ANSI codes.
for heading, result in (
    ("Confusion Matrix \n", confusion_matrix(y_test, y_pred_svc)),
    ("\n Accuracy Score \n", accuracy_score(y_test, y_pred_svc)),
    ("\n Classification Report \n", classification_report(y_test, y_pred_svc)),
):
    print('\033[1m' + heading + '\033[0m', result)

With SVM the accuracy score is acceptable, but there are many misclassifications in the "0" class, so we move on to another sampling technique.


### 2. Over Sampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly over-sample the training split.
# - The sampler is named `ros`, not `os`, so it no longer shadows the `os` module imported at the top.
# - The ratio is passed by keyword and fit_resample replaces the removed fit_sample.
# - random_state makes the resampled set reproducible.
ros = RandomOverSampler(sampling_strategy=0.75, random_state=RANDOM_SEED)
x_train_os, y_train_os = ros.fit_resample(x_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_os)))

In [None]:
# Class balance after random over-sampling.
plt.figure(figsize=(10,8))
# Pass the Series as x=: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=y_train_os);

In [None]:
# Dimensions of the over-sampled training data.
for label, part in (
    ("Shape of independent trained dataset after Over sampling", x_train_os),
    ("Shape of dependent trained dataset after Over sampling", y_train_os),
):
    print(label, part.shape)

### Random Forest on Over Sampled Dataset 

In [None]:
# Random forest fitted on the over-sampled training data and evaluated on the hold-out split.
# Seeded with RANDOM_SEED so the reported metrics are reproducible on Restart & Run All.
os_clf = RandomForestClassifier(random_state=RANDOM_SEED)
os_clf_fit = os_clf.fit(x_train_os, y_train_os)

y_pred_rf_os = os_clf_fit.predict(x_test)

In [None]:

# Hold-out metrics for the over-sampled random forest; headings are printed in bold via ANSI codes.
for heading, result in (
    ("Confusion Matrix \n", confusion_matrix(y_test, y_pred_rf_os)),
    ("\n Accuracy Score \n", accuracy_score(y_test, y_pred_rf_os)),
    ("\n Classification Report \n", classification_report(y_test, y_pred_rf_os)),
):
    print('\033[1m' + heading + '\033[0m', result)

Here Random Forest gives a good accuracy score, but it is not able to classify the majority of the "1" class correctly.

### SVM on Over Sampled Data 

In [None]:
# Default SVC fitted on the over-sampled training data, then scored on the hold-out split.
clf_svc_os = SVC()
clf_svc_fit_os = clf_svc_os.fit(x_train_os, y_train_os)
y_pred_svc_os = clf_svc_fit_os.predict(x_test)

In [None]:

# Hold-out metrics for the over-sampled SVM; headings are printed in bold via ANSI codes.
for heading, result in (
    ("Confusion Matrix \n", confusion_matrix(y_test, y_pred_svc_os)),
    ("\n Accuracy Score \n", accuracy_score(y_test, y_pred_svc_os)),
    ("\n Classification Report \n", classification_report(y_test, y_pred_svc_os)),
):
    print('\033[1m' + heading + '\033[0m', result)

Here SVM gives a good accuracy score, but there are some misclassifications in the "1" class. To reduce these misclassifications, we move on to another sampling technique.

### 3. Synthetic Minority Oversampling Technique (SMOTE) 

In [None]:
# Combined SMOTE over-sampling and Tomek-link cleaning of the training split.
# - The sampler is named `smote_tomek`, not `os`, so it no longer shadows the `os` module imported at the top.
# - The ratio is passed by keyword and fit_resample replaces the removed fit_sample.
# - random_state makes the synthetic samples reproducible.
smote_tomek = SMOTETomek(sampling_strategy=0.75, random_state=RANDOM_SEED)
x_train_st, y_train_st = smote_tomek.fit_resample(x_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_st)))

In [None]:
# Class balance after SMOTE + Tomek resampling.
plt.figure(figsize=(10,8))
# Pass the Series as x=: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=y_train_st);

In [None]:
# Dimensions of the SMOTE-resampled training data.
for label, part in (
    ("Shape of independent trained dataset after SMOTE sampling", x_train_st),
    ("Shape of dependent trained dataset after SMOTE sampling", y_train_st),
):
    print(label, part.shape)

### Decision Tree on SMOTE Sampled Data 

In [None]:
# Decision tree fitted on the SMOTE-resampled data and evaluated on the hold-out split.
# Seeded with RANDOM_SEED so tie-breaking between equally good splits is reproducible.
clf_dt = DecisionTreeClassifier(random_state=RANDOM_SEED)
model_dt = clf_dt.fit(x_train_st, y_train_st)
y_pred_dt = model_dt.predict(x_test)

In [None]:

# Hold-out metrics for the decision tree; headings are printed in bold via ANSI codes.
for heading, result in (
    ("Confusion Matrix \n", confusion_matrix(y_test, y_pred_dt)),
    ("\n Accuracy Score \n", accuracy_score(y_test, y_pred_dt)),
    ("\n Classification Report \n", classification_report(y_test, y_pred_dt)),
):
    print('\033[1m' + heading + '\033[0m', result)

With the Decision Tree model the accuracy is good, but there are many misclassifications in both classes.

### Naive bayes 

In [None]:
# BernoulliNB is imported alongside GaussianNB but is not used in this cell.
from sklearn.naive_bayes import GaussianNB,BernoulliNB


# Gaussian naive Bayes fitted on the SMOTE-resampled data and evaluated on the hold-out split.
clf_nb=GaussianNB()
model_nb=clf_nb.fit(x_train_st,y_train_st)
y_pred_nb=model_nb.predict(x_test)


In [None]:

# Hold-out metrics for naive Bayes; headings are printed in bold via ANSI codes.
for heading, result in (
    ("Confusion Matrix \n", confusion_matrix(y_test, y_pred_nb)),
    ("\n Accuracy Score \n", accuracy_score(y_test, y_pred_nb)),
    ("\n Classification Report \n", classification_report(y_test, y_pred_nb)),
):
    print('\033[1m' + heading + '\033[0m', result)


### Gradient Boosting Classifier 

In [None]:
# Gradient boosting fitted on the SMOTE-resampled data and evaluated on the hold-out split.
# Seeded with RANDOM_SEED so the fitted ensemble is reproducible on Restart & Run All.
clf_GB = GradientBoostingClassifier(random_state=RANDOM_SEED)
model_GB = clf_GB.fit(x_train_st, y_train_st)
y_pred_GB = model_GB.predict(x_test)

In [None]:

# Hold-out metrics for gradient boosting; headings are printed in bold via ANSI codes.
for heading, result in (
    ("Confusion Matrix \n", confusion_matrix(y_test, y_pred_GB)),
    ("\n Accuracy Score \n", accuracy_score(y_test, y_pred_GB)),
    ("\n Classification Report \n", classification_report(y_test, y_pred_GB)),
):
    print('\033[1m' + heading + '\033[0m', result)


### Random Forest on SMOTE sampled Data 

In [None]:
# Random forest fitted on the SMOTE-resampled data and evaluated on the hold-out split.
# Seeded with RANDOM_SEED so the metrics, ROC curve and feature importances are reproducible.
# NOTE: this rebinds y_pred_rf, which earlier held the under-sampled forest's predictions.
clf_st = RandomForestClassifier(random_state=RANDOM_SEED)
model_rf = clf_st.fit(x_train_st, y_train_st)
y_pred_rf = model_rf.predict(x_test)

In [None]:

# Hold-out metrics for the SMOTE-trained random forest; headings are printed in bold via ANSI codes.
for heading, result in (
    ("Confusion Matrix \n", confusion_matrix(y_test, y_pred_rf)),
    ("\n Accuracy Score \n", accuracy_score(y_test, y_pred_rf)),
    ("\n Classification Report \n", classification_report(y_test, y_pred_rf)),
):
    print('\033[1m' + heading + '\033[0m', result)

### ROC Curve

In [None]:
def plot_roc_curve(fpr, tpr):
    """Draw an ROC curve on the current axes with a dashed diagonal reference line, then show it.

    fpr and tpr are the x and y coordinates of the curve
    (false-positive and true-positive rates).
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, color='orange', label='ROC')
    # Diagonal from (0, 0) to (1, 1) for visual reference.
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic (ROC) Curve')
    axes.legend()
    plt.show()

In [None]:
# Keep only the second probability column, i.e. the score for the positive class.
probs = model_rf.predict_proba(x_test)[:, 1]
probs

In [None]:
# ROC curve and AUC of the SMOTE-trained random forest on the hold-out split.
fpr1, tpr1, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr1, tpr1)

auc_rf = roc_auc_score(y_test, probs)
print('\033[1m' + "AUC Score \n" + '\033[0m', auc_rf)

### SVM on SMOTE Sampled Data 

In [None]:
# SVM fitted on the SMOTE-resampled data; probability=True enables predict_proba for the ROC curve.
# The probability calibration shuffles the data internally, so it is seeded with RANDOM_SEED
# to keep the probabilities reproducible.
clf_svm = SVC(probability=True, random_state=RANDOM_SEED)
model_svm = clf_svm.fit(x_train_st, y_train_st)
y_pred_svm = model_svm.predict(x_test)

In [None]:

# Hold-out metrics for the SMOTE-trained SVM; headings are printed in bold via ANSI codes.
for heading, result in (
    ("Confusion Matrix \n", confusion_matrix(y_test, y_pred_svm)),
    ("\n Accuracy Score \n", accuracy_score(y_test, y_pred_svm)),
    ("\n Classification Report \n", classification_report(y_test, y_pred_svm)),
):
    print('\033[1m' + heading + '\033[0m', result)


SVM works well here: the number of misclassifications is low for both classes.

#### ROC Curve for SVM 

In [None]:
# Positive-class probabilities from the SMOTE-trained SVM.
# probs_svm was never assigned anywhere, so this cell raised NameError on a fresh kernel.
probs_svm = model_svm.predict_proba(x_test)[:, 1]

fpr2, tpr2, thresholds = roc_curve(y_test, probs_svm)

plot_roc_curve(fpr2, tpr2)


print('\033[1m'+"AUC Score \n"+'\033[0m', round(roc_auc_score(y_test, probs_svm),2))

### Comparison of Models Accuracy 

In [None]:
# Hold-out accuracy of each SMOTE-trained model, rounded to four decimals.
rf_accuracy, svm_accuracy, GB_accuracy, DT_accuracy, NB_accuracy = (
    round(accuracy_score(y_test, predictions), 4)
    for predictions in (y_pred_rf, y_pred_svm, y_pred_GB, y_pred_dt, y_pred_nb)
)

In [None]:
# One row per model with its hold-out accuracy.
model_names = ["Decision Tree", "Naive Bayes", "Random Forest", "SVM", "Gradient Boosting"]
model_scores = [DT_accuracy, NB_accuracy, rf_accuracy, svm_accuracy, GB_accuracy]
Accuracy = pd.DataFrame({"Model": model_names, "Accuracy": model_scores})
Accuracy

In [None]:
# Accuracy per model as a bar chart.
sns.barplot(data=Accuracy, x="Model", y="Accuracy")

### Comparison of ROC Curve between RandomForest and SVM 

In [None]:
# Overlay the SVM and random-forest ROC curves with a dashed diagonal reference line.
plt.plot(fpr2, tpr2,color='orange',label='SVM')
plt.plot(fpr1, tpr1,color='green',label='Random Forest')
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
# Axis labels and a title so the figure can be read on its own.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve: SVM vs Random Forest')
plt.legend()
plt.show();

### Hence our final model will be SVM.

### Prediction of Potential Fraud (Target Variable) on our main Test Data 

In [None]:
# Preview the feature frame that will be passed to the final model.
Test_data.head()

In [None]:
# Predict the target for the unlabeled test providers with the chosen SVM.
PotentialFraud = model_svm.predict(Test_data)

# Carry the index of Test_data so each prediction stays attached to the row it was made for
# when it is later joined back to the test table (previously this relied on a default 0..n-1 index).
Potential_Fraud = pd.DataFrame(PotentialFraud, columns=['PotentialFraud'], index=Test_data.index)

In [None]:
# Attach the predicted labels to the provider-level test table.
# concat with axis=1 places the frames side by side, aligning rows on their index.
Predicted_Test_data=pd.concat([Test_df,Potential_Fraud],axis=1)

Predicted_Test_data

## Feature Importance 

#### Get important features from RandomForest Model

In [None]:
# Get numerical feature importances from Random Forest model
# Numerical feature importances from the Random Forest model.
importances = list(model_rf.feature_importances_)
print(importances)

# Take the names from the predictor frame the model was trained on. df_clf.columns also
# contains the PotentialFraud target, which mislabels importances unless it is the last column.
feature_list = list(x_train.columns)

# (feature, importance) pairs, most important first.
feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print one line per feature (a plain loop, so no stray list of None values is echoed).
for pair in feature_importances:
    print('Variable: {:20} Importance: {} '.format(*pair))

#### Get important Features from Gradient Boosting Model

In [None]:
# Get numerical feature importances from Gradient Boosting model
# Numerical feature importances from the Gradient Boosting model.
# This rebinds `importances`, which previously held the Random Forest values.
importances = list(model_GB.feature_importances_)
print(importances)

In [None]:
# Take the names from the predictor frame the model was trained on. df_clf.columns also
# contains the PotentialFraud target, which mislabels importances unless it is the last column.
feature_list = list(x_train.columns)

# (feature, importance) pairs, most important first.
feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print one line per feature (a plain loop, so no stray list of None values is echoed).
for pair in feature_importances:
    print('Variable: {:20} Importance: {} '.format(*pair))

In [None]:
# Fit an XGBoost classifier with default settings on the SMOTE-resampled training data.
from xgboost import XGBClassifier


xg_model=XGBClassifier()
# The result of fit() is kept as sg_model_fit, which the importance plot reads.
sg_model_fit=xg_model.fit(x_train_st,y_train_st)


In [None]:
# Plot the feature importances of the fitted XGBoost model.
from xgboost import plot_importance
plot_importance(sg_model_fit)

In [None]:
#### Hence from above we can conclude that important features are :

# Restrict the train and hold-out predictors to the selected features (one shared list, so both stay in sync).
important_features = ['InscClaimAmtReimbursed','AdmitForDays','DeductibleAmtPaid','N_Procedure','IsDiagnosisCode']

x_train_imp = x_train_st[important_features]
x_test_imp = x_test[important_features]
x_train_imp.head()

#### Now we will do the modelling with important features only

In [None]:
## SVM model

# SVM restricted to the selected important features; probability=True enables predict_proba for the ROC curve.
# Seeded with RANDOM_SEED because the probability calibration shuffles the data internally.
svm_imp = SVC(probability=True, random_state=RANDOM_SEED).fit(x_train_imp, y_train_st)
y_pred_svm2 = svm_imp.predict(x_test_imp)

In [None]:

# Hold-out metrics for the reduced-feature SVM; headings are printed in bold via ANSI codes.
for heading, result in (
    ("Confusion Matrix \n", confusion_matrix(y_test, y_pred_svm2)),
    ("\n Accuracy Score \n", accuracy_score(y_test, y_pred_svm2)),
    ("\n Classification Report \n", classification_report(y_test, y_pred_svm2)),
):
    print('\033[1m' + heading + '\033[0m', result)

#### ROC Curve of SVM Model on Important Features 

In [None]:
# ROC curve and AUC of the reduced-feature SVM on the hold-out split.
# Only the second probability column (the positive-class score) is kept.
probs_new = svm_imp.predict_proba(x_test_imp)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, probs_new)
plot_roc_curve(fpr, tpr)

auc_imp = round(roc_auc_score(y_test, probs_new), 2)
print('\033[1m' + "AUC Score \n" + '\033[0m', auc_imp)