# Medicare Fraud Detection

**BigQuery queries for the CMS Medicare data**

In [None]:
# BigQuery
# Create the BigQuery client that the following cells use to read the tables.
# NOTE(review): `project` is set to the public-data project itself; presumably
# the runtime supplies credentials/billing for it — confirm outside this
# environment.
from google.cloud import bigquery
client = bigquery.Client(project='bigquery-public-data')

In [None]:
# Build a reference to the `cms_medicare` dataset in the client's project and
# fetch the dataset metadata. `Client.dataset()` is deprecated in
# google-cloud-bigquery, so the reference is constructed directly; using
# `client.project` resolves to the same dataset as before.
medicare_ref = bigquery.DatasetReference(client.project, "cms_medicare")
medicare = client.get_dataset(medicare_ref)

In [None]:
# Names of all tables available in the dataset.
[tbl.table_id for tbl in client.list_tables(medicare)]

In [None]:
# Point at the 2015 physician/supplier table and fetch its metadata.
table_ref = medicare_ref.table('physicians_and_other_supplier_2015')
table = client.get_table(table_ref)

In [None]:
# Show the column definitions (schema) of the selected table.
table.schema

In [None]:
# Download at most 1,000,000 rows of the table into a pandas DataFrame.
# Original note: "max time needed to load dataset" — presumably this is the
# slowest cell of the notebook.
df = client.list_rows(table, max_results=1000000).to_dataframe()

In [None]:
# Preview the first 10 rows of the downloaded table.
df.head(10)

In [None]:
# Disabled cell: the code is wrapped in a string literal, so nothing here runs.
# If enabled, it would download the full 2014 table and save it with to_csv.
"""table_ref1 = medicare_ref.table('physicians_and_other_supplier_2014')
table1 = client.get_table(table_ref1)
df1 = client.list_rows(table1).to_dataframe()
df1.to_csv('physicians_and_other_supplier_2014', index=False)"""

In [None]:
# Disabled cell: the code is wrapped in a string literal, so nothing here runs.
# If enabled, it would download the full 2013 table and save it with to_csv.
"""table_ref2 = medicare_ref.table('physicians_and_other_supplier_2013')
table2 = client.get_table(table_ref2)
df2 = client.list_rows(table2).to_dataframe()
df2.to_csv('physicians_and_other_supplier_2013', index=False)"""

In [None]:
# Disabled cell: the code is wrapped in a string literal, so nothing here runs.
# If enabled, it would download the full 2012 table and save it with to_csv.
"""table_ref3 = medicare_ref.table('physicians_and_other_supplier_2012')
table3 = client.get_table(table_ref3)
df3 = client.list_rows(table3).to_dataframe()
df3.to_csv('physicians_and_other_supplier_2012', index=False)"""

**Handling the LEIE (List of Excluded Individuals/Entities) data**

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Load the exclusion list, then collect the NPIs of records whose WAIVERDATE
# lies strictly between 20140630 and 20160630 (presumably YYYYMMDD integers).
fraud_data = pd.read_csv('../input/dataset/UPDATED.csv')
excluded_data = fraud_data.loc[
    (fraud_data.WAIVERDATE > 20140630) & (fraud_data.WAIVERDATE < 20160630)
]
exclude_waverNPI = list(excluded_data.NPI)

In [None]:
# Preview the first 10 rows of the exclusion list as loaded.
fraud_data.head(10)
# Unused idea kept from the original: additionally filter with
# .loc[REINDATE != 0]

In [None]:
# Keep only records that carry an NPI (non-zero) and whose EXCLDATE is later
# than 20140630.
fraud_data = fraud_data.loc[
    (fraud_data.NPI != 0) & (fraud_data.EXCLDATE > 20140630)
]

In [None]:
# Preview the first 10 exclusion records that remain after filtering.
fraud_data.head(10)

In [None]:
# Per-NPI count of non-null values in every other column.
fraud_data.groupby('NPI').count()

In [None]:
# Number of non-null values in each column of the filtered exclusion list.
fraud_data.count()

**Input feature table**

Required input features

In [None]:
# Columns used as model input: the provider id plus the service-count and
# amount columns.
cols = [
    'npi',
    'line_srvc_cnt',
    'average_medicare_allowed_amt',
    'average_submitted_chrg_amt',
    'average_medicare_payment_amt',
    'average_medicare_standard_amt',
]
all_data = df[cols]

In [None]:
# Preview the first 20 rows of the selected feature columns.
all_data.head(20)

Convert the feature columns to numeric types where possible

In [None]:
def _to_numeric_if_possible(column):
    """Return *column* parsed as numeric, or unchanged if it cannot be parsed.

    Same result as ``pd.to_numeric(column, errors="ignore")``, but without
    the deprecated ``errors="ignore"`` option.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Show the dtypes before conversion, convert every column that parses as
# numeric, then drop the rows whose npi is 0.
print(all_data.dtypes)
all_data = all_data.apply(_to_numeric_if_possible)
dataset = all_data.loc[all_data.npi != 0]

In [None]:
# Preview the first 10 rows of the working dataset.
dataset.head(10)

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

In [None]:
# Collapse to one row per npi by summing every other column; npi stays a
# regular column rather than becoming the index.
dataset = dataset.groupby('npi', as_index=False).sum()

In [None]:
# Preview the first 10 rows of the per-npi aggregated dataset.
dataset.head(10)

**Introducing the fraud label**

In [None]:
# Add the label column, initialised to 0 for every row.
dataset['fraud'] = 0

Labelling fraud records

In [None]:
# NPIs treated as fraudulent: those in the filtered exclusion list, minus the
# NPIs collected in exclude_waverNPI.
final_fraud_npi = set(fraud_data.NPI) - set(exclude_waverNPI)
fraud_npi = list(final_fraud_npi)
# Set the label with a single .loc call. The chained form
# `dataset.fraud.loc[mask] = 1` may write to a temporary object and leaves
# `dataset` unchanged when pandas Copy-on-Write is active.
dataset.loc[dataset.npi.isin(fraud_npi), 'fraud'] = 1

In [None]:
# List the column names of the labelled dataset.
dataset.columns

In [None]:
# Preview the first 10 rows of the labelled dataset.
dataset.head(10)

In [None]:
# Number of non-null values in each column.
dataset.count()

In [None]:
# Report how many rows are labelled as fraud, then the labels of the rows
# whose npi is in fraud_npi. Both are printed explicitly: only the last bare
# expression of a cell is displayed, so the count was previously computed and
# thrown away.
print(dataset.loc[dataset.fraud == 1].count())
print(dataset.fraud.loc[dataset.npi.isin(fraud_npi)])

In [None]:
# dataset.to_csv('physicians_and_other_supplier_2015_100K.csv', index=False)

In [None]:
# Remove the identifier column so that only features and the label remain.
# npi is dropped by name rather than by position: `iloc[:, 1:]` silently
# removes a further (feature) column each time the cell is re-run, whereas
# this raises a KeyError once npi is already gone.
dataset = dataset.drop(columns='npi')

In [None]:
# List the remaining column names.
dataset.columns

In [None]:
# Preview the first 10 rows of the final modelling table.
dataset.head(10)

**Splitting input and output data**

In [None]:
# Features are every column except the last; the last column is the label.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Dimensions of the feature matrix.
X.shape

In [None]:
# Dimensions of the label vector.
y.shape

In [None]:
# First five feature rows.
X[:5]

In [None]:
# Show the label vector.
y

**Splitting training and testing data**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for testing, with a fixed seed for repeatability.
# The split is stratified on y: the fraud class is rare, and an unstratified
# split can leave the test fold with almost no positive rows, which makes the
# later metrics unstable or undefined.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1, stratify=y)

In [None]:
# Dimensions of the training feature matrix.
X_train.shape

In [None]:
# Dimensions of the test feature matrix.
X_test.shape

In [None]:
# Class balance of the training labels: non-zero labels are fraud, the
# remaining rows are legitimate.
fraud = np.count_nonzero(y_train)
legit = len(y_train) - fraud

In [None]:
# Number of zero-label (legitimate) rows in the training split.
legit

In [None]:
# Number of non-zero-label (fraud) rows in the training split.
fraud

**Handling class imbalance**

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Randomly discard majority-class training rows until the minority/majority
# ratio reaches 0.7. The seed is fixed so the sampled training set, and
# everything fitted on it, is the same on every run.
under_sampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1)
X_train_after, y_train_after = under_sampler.fit_resample(X_train, y_train)

In [None]:
# Dimensions of the training features after under-sampling.
X_train_after.shape

In [None]:
# Class balance of the training labels after under-sampling.
fraud_after = np.count_nonzero(y_train_after)
legit_after = len(y_train_after) - fraud_after

In [None]:
# Number of zero-label (legitimate) training rows after under-sampling.
legit_after

In [None]:
# Number of non-zero-label (fraud) training rows after under-sampling.
fraud_after

**Scaling numeric input data**

In [None]:
# Disabled cell: the code is wrapped in a string literal, so nothing here runs.
# It is the variant that would scale the training data before under-sampling
# (X_train rather than X_train_after).
"""from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)"""

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the under-sampled training features, then apply it to
# those same features.
scaler = StandardScaler()
scaler.fit(X_train_after)
X_train_scaled = scaler.transform(X_train_after)

In [None]:
# Show the scaled training features.
X_train_scaled

**Building and training model**

In [None]:
# Fit a Gaussian Naive Bayes classifier on the scaled, under-sampled
# training data.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train_scaled, y_train_after)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the scaled, under-sampled training data. The seed is
# fixed because the forest is randomised; without it the predictions and
# every metric computed from them differ from run to run.
classifier1 = RandomForestClassifier(random_state=1)
classifier1.fit(X_train_scaled, y_train_after)

In [None]:
# Scale the test features with the already-fitted scaler (transform only, no
# refitting on test data).
X_test_scaled = scaler.transform(X_test)

In [None]:
# Class balance of the held-out test labels.
fraud_test = np.count_nonzero(y_test)
legit_test = len(y_test) - fraud_test

In [None]:
# Number of zero-label (legitimate) rows in the test split.
legit_test

In [None]:
# Number of non-zero-label (fraud) rows in the test split.
fraud_test

**Predicting fraud from test data**

In [None]:
# Predicted labels for the scaled test set from `classifier` (Gaussian Naive
# Bayes).
y_pred = classifier.predict(X_test_scaled)
y_pred

In [None]:
# Predicted labels for the scaled test set from `classifier1` (random
# forest).
y_pred1 = classifier1.predict(X_test_scaled)
y_pred1

**Evaluation metrics**

In [None]:
# Confusion matrix (printed) and accuracy (displayed as the cell result) of
# the predictions in y_pred.
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

In [None]:
# ROC AUC of `classifier`. The metric ranks samples by score, so it is given
# the predicted probability of the positive class (column 1 of predict_proba,
# the labels being 0/1) instead of the thresholded 0/1 predictions, which
# reduce the ROC curve to a single operating point.
roc_auc_score(y_test, classifier.predict_proba(X_test_scaled)[:, 1])

In [None]:
# Confusion matrix (printed) and accuracy (displayed as the cell result) of
# the predictions in y_pred1.
cm1 = confusion_matrix(y_test, y_pred1)
print(cm1)
accuracy_score(y_test, y_pred1)

In [None]:
# ROC AUC of `classifier1`. The metric ranks samples by score, so it is given
# the predicted probability of the positive class (column 1 of predict_proba,
# the labels being 0/1) instead of the thresholded 0/1 predictions, which
# reduce the ROC curve to a single operating point.
roc_auc_score(y_test, classifier1.predict_proba(X_test_scaled)[:, 1])

In [None]:
from sklearn.metrics import average_precision_score
# Average precision of `classifier`. It summarises the precision-recall curve
# over all thresholds, so it needs the positive-class probability (column 1
# of predict_proba, the labels being 0/1) rather than hard 0/1 predictions.
average_precision = average_precision_score(
    y_test, classifier.predict_proba(X_test_scaled)[:, 1])

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

In [None]:
# Average precision of `classifier1`. It summarises the precision-recall
# curve over all thresholds, so it needs the positive-class probability
# (column 1 of predict_proba, the labels being 0/1) rather than hard 0/1
# predictions.
average_precision1 = average_precision_score(
    y_test, classifier1.predict_proba(X_test_scaled)[:, 1])

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision1))