# Medicare Fraud Detection

### BigQuery query for the cms_medicare data

In [None]:
from google.cloud import bigquery
client = bigquery.Client(project='bigquery-public-data')

In [None]:
medicare_ref = client.dataset("cms_medicare")
medicare = client.get_dataset(medicare_ref)

In [None]:
list(map(lambda table: table.table_id, client.list_tables(medicare)))

In [None]:
table_ref = medicare_ref.table('physicians_and_other_supplier_2015')
table = client.get_table(table_ref)

In [None]:
table.schema

In [None]:
df = client.list_rows(table, max_results=100000).to_dataframe()

In [None]:
df.head(10)

In [None]:
df.groupby(['nppes_entity_code', 'medicare_participation_indicator']).count()

In [None]:
df.loc[df.medicare_participation_indicator=='N']

In [None]:
df.groupby(['place_of_service']).count()

In [None]:
df.groupby(['nppes_entity_code']).count()

In [None]:
df.groupby(['npi', 'provider_type']).count()

### Handling LEIE data

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
leie_data = pd.read_csv('../input/dataset/UPDATED.csv')

In [None]:
leie_data.head(10)

In [None]:
fraud_data = leie_data.loc[leie_data.NPI!=0].loc[leie_data.EXCLDATE>20150630]

In [None]:
fraud_data.head(10)

In [None]:
fraud_data.count()

### Required feature table preparation

In [None]:
individual_provider_data = df.loc[df.nppes_entity_code=='I'].loc[df.medicare_participation_indicator=='Y']
individual_provider_data.count()

In [None]:
cols = ['npi', 'bene_unique_cnt', 'bene_day_srvc_cnt', 'line_srvc_cnt', 'average_medicare_allowed_amt', 'average_submitted_chrg_amt', 'average_medicare_payment_amt', 'average_medicare_standard_amt']
all_data = individual_provider_data[cols]

In [None]:
all_data.head(10)

Convert numeric data to numeric type

In [None]:
print(all_data.dtypes)
all_data = all_data.apply(pd.to_numeric, errors="ignore")
dataset = all_data.loc[all_data.npi!=0]

In [None]:
dataset.head(10)

Checking the number of null values in each column

In [None]:
dataset.isnull().sum()

In [None]:
dataset.count()

Aggregating expenses by npi of the provider

In [None]:
dataset = dataset.groupby(['npi'], as_index=False).agg('sum')

In [None]:
dataset.head(10)

In [None]:
dataset.count()

Introducing 'fraud' label

In [None]:
dataset['fraud'] = 0

Labelling fraud records

In [None]:
fraud_npi = set(fraud_data.NPI)
dataset.loc[dataset.npi.isin(fraud_npi), 'fraud'] = 1

In [None]:
dataset.columns

In [None]:
dataset.head(10)

In [None]:
dataset.count()

In [None]:
dataset.loc[dataset.fraud==1].head(10)

In [None]:
dataset.loc[dataset.fraud==1].count()

Removing npi column

In [None]:
# Drop the identifier by name rather than by position: a positional slice
# (iloc[:, 1:]) would silently strip one more feature column every time this
# cell is re-executed, whereas this selection is a no-op once npi is gone.
dataset = dataset.loc[:, dataset.columns != 'npi']

In [None]:
dataset.columns

In [None]:
dataset.head(10)

### Splitting input features and corresponding labels

In [None]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [None]:
X.shape

In [None]:
y.shape

In [None]:
X[:5]

In [None]:
y_train = y

In [None]:
X_train = X

In [None]:
fraud = np.count_nonzero(y_train)
legit = y_train.shape[0]-fraud

In [None]:
legit

In [None]:
fraud

# Experiment Section

In [None]:
from imblearn.pipeline import Pipeline, make_pipeline

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
pipeline = make_pipeline(StandardScaler(), SMOTE(), RandomForestClassifier())

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
evaluation = cross_validate(pipeline, X_train, y_train, scoring=['f1','accuracy','roc_auc'], cv=3)

In [None]:
evaluation

---------------------

### Handling unbalanced class

#### No data resampling

### Scaling numeric input data

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
X_trs = scaler.fit_transform(X_train)

In [None]:
X_trs[:5]

### Building and training model

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
classifier = GaussianNB()

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [None]:
evaluation = cross_validate(classifier, X_trs, y_train, cv=10, scoring=['roc_auc', 'f1_weighted', 'precision_weighted', 'recall_weighted'])
for metric in evaluation:
    print(metric, evaluation[metric].mean())

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
classifier = RandomForestClassifier()

In [None]:
evaluation = cross_validate(classifier, X_trs, y_train, cv=5, scoring=['roc_auc', 'f1_weighted', 'precision_weighted', 'recall_weighted', 'f1', 'precision', 'recall'], return_estimator=True)
for metric in evaluation:
    if metric!='estimator':
        print(metric, evaluation[metric].mean())

In [None]:
feature_importances = evaluation['estimator'][-1].feature_importances_

In [None]:
plt.barh(dataset.iloc[:,:-1].columns, feature_importances)

-----------------------------------------------------------------------------------------------------

#### Random Under Sampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
resampler = RandomUnderSampler(sampling_strategy=0.5)

In [None]:
X_tr, y_tr = resampler.fit_resample(X_train, y_train)

In [None]:
X_tr.shape

In [None]:
fraud_tr = np.count_nonzero(y_tr)
legit_tr = y_tr.shape[0]-fraud_tr

In [None]:
legit_tr

In [None]:
fraud_tr

### Scaling numeric input data

In [None]:
X_trs = scaler.fit_transform(X_tr)

In [None]:
X_trs[:5]

### Building and training model

#### Naive Bayes

In [None]:
classifier = GaussianNB()

In [None]:
evaluation = cross_validate(classifier, X_trs, y_tr, cv=5, scoring=['roc_auc', 'f1_weighted', 'precision_weighted', 'recall_weighted', 'f1', 'precision', 'recall'])
for metric in evaluation:
    print(metric, evaluation[metric].mean())

#### Random Forest

In [None]:
classifier = RandomForestClassifier()

In [None]:
evaluation = cross_validate(classifier, X_trs, y_tr, cv=5, scoring=['roc_auc', 'f1_weighted', 'precision_weighted', 'recall_weighted', 'f1', 'precision', 'recall'], return_estimator=True)
for metric in evaluation:
    if metric!='estimator':
        print(metric, evaluation[metric].mean())

In [None]:
feature_importances = evaluation['estimator'][-1].feature_importances_

In [None]:
plt.barh(dataset.iloc[:,:-1].columns, feature_importances)

-----------------------------------------------------------------------------------------------------

#### Synthetic Over Sampling

##### ADASYN

In [None]:
# Imported here because no earlier cell in view brings train_test_split into
# scope; without it this cell fails with NameError.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. Stratify on the label so the rare positive class
# is present in both the training part and the hold-out part.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train
)

In [None]:
from imblearn.over_sampling import ADASYN
resampler = ADASYN(sampling_strategy=0.5)

In [None]:
X_tr, y_tr = resampler.fit_resample(Xtrain, ytrain)

In [None]:
X_tr.shape

In [None]:
fraud_tr = np.count_nonzero(y_tr)
legit_tr = y_tr.shape[0]-fraud_tr

In [None]:
legit_tr

In [None]:
fraud_tr

### Scaling numeric input data

In [None]:
X_trs = scaler.fit_transform(X_tr)
Xtest = scaler.transform(Xtest)

In [None]:
X_trs[:5]

### Building and training model

#### Naive Bayes

In [None]:
classifier = GaussianNB()

In [None]:
evaluation = cross_validate(classifier, X_trs, y_tr, cv=5, scoring=['roc_auc', 'f1_weighted', 'precision_weighted', 'recall_weighted', 'f1', 'precision', 'recall'])
for metric in evaluation:
    print(metric, evaluation[metric].mean())

#### Random Forest

In [None]:
classifier = RandomForestClassifier()

In [None]:
evaluation = cross_validate(classifier, X_trs, y_tr, cv=3, scoring=['roc_auc', 'f1_weighted', 'precision_weighted', 'recall_weighted', 'f1', 'precision', 'recall'], return_estimator=True)
for metric in evaluation:
    if metric!='estimator':
        print(metric, evaluation[metric].mean())

In [None]:
model = evaluation['estimator'][-1]

In [None]:
feature_importances = model.feature_importances_

In [None]:
plt.barh(dataset.iloc[:,:-1].columns, feature_importances)

In [None]:
# Imported here because no earlier cell in view brings roc_curve / auc into
# scope; without it this cell fails with NameError.
from sklearn.metrics import auc, roc_curve

# Score the hold-out rows with the second predict_proba column, then derive
# the ROC points and the area under them.
y_score = model.predict_proba(Xtest)[:, 1]
fpr, tpr, thresholds = roc_curve(ytest, y_score)
print('AUC: {}'.format(auc(fpr, tpr)))

In [None]:
# Plot tpr against fpr together with the (0,0)-(1,1) diagonal.
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange', linewidth=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', linewidth=lw, linestyle='-')

# Axis limits and ticks every 0.05 on both axes.
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xticks([step / 20.0 for step in range(21)])
plt.yticks([step / 20.0 for step in range(21)])

plt.title('Receiver operating characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Width of one bar; also the horizontal offset between the paired bars.
barWidth = 0.25

# Hard-coded bar heights, one entry per tick label below.
f1 = [0.4104784852074201, 0.9952031306692811, 0.7455215205717257]
roc = [0.677015520697708, 0.9999310415789591, 0.9400921217771855]

# f1 bars sit at 0..n-1; roc bars are shifted right by one bar width.
br1 = np.arange(len(f1))
br2 = [pos + barWidth for pos in br1]

plt.bar(br1, f1, width=barWidth, color='#1f77b4',
        edgecolor='grey', label='f1-score')
plt.bar(br2, roc, width=barWidth, color='#7f7f7f',
        edgecolor='grey', label='roc-auc-score')

# Put each tick midway between the two bars of its group.
plt.xticks(
    [idx + barWidth/2 for idx, _ in enumerate(f1)],
    ['RUS', 'SMOTE', 'ADASYN'],
)
plt.xlabel('Random Forest')
plt.ylabel('Score (0-1)')

plt.legend()
plt.show()

In [None]:
# Width of one bar; also the horizontal offset between the paired bars.
barWidth = 0.25

# Hard-coded bar heights, one entry per tick label below.
f1 = [0.1711651095702235, 0.1960470111938177, 0.18019022668742624]
roc = [0.5442282055186182, 0.5989925951759163, 0.5819544779304673]

# f1 bars sit at 0..n-1; roc bars are shifted right by one bar width.
br1 = np.arange(len(f1))
br2 = [pos + barWidth for pos in br1]

plt.bar(br1, f1, width=barWidth, color='#1f77b4',
        edgecolor='grey', label='f1-score')
plt.bar(br2, roc, width=barWidth, color='#7f7f7f',
        edgecolor='grey', label='roc-auc-score')

# Put each tick midway between the two bars of its group.
plt.xticks(
    [idx + barWidth/2 for idx, _ in enumerate(f1)],
    ['RUS', 'SMOTE', 'ADASYN'],
)
plt.xlabel('Naive Bayes')
plt.ylabel('Score (0-1)')

plt.legend()
plt.show()