In [27]:
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Machine Learning Models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix

from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

from sklearn.metrics import classification_report, accuracy_score
import random
random.seed(100)

import time
import pyodbc
print(pyodbc.drivers())

['SQL Server', 'ODBC Driver 17 for SQL Server', 'SQL Server Native Client RDA 11.0', 'Microsoft Access Driver (*.mdb, *.accdb)', 'Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)', 'Microsoft Access Text Driver (*.txt, *.csv)', 'Microsoft Access dBASE Driver (*.dbf, *.ndx, *.mdx)']


### Setting up SQL database

In [2]:
def create_sql_connection(server, database, username, password, driver='{ODBC Driver 17 for SQL Server}'):
    """
    Establish a connection to a SQL Server database using pyodbc.

    Parameters:
    - server (str): The SQL Server address (e.g., 'localhost' or server IP).
    - database (str): The name of the database you want to connect to.
    - username (str): SQL Server username.
    - password (str): SQL Server password.
    - driver (str): ODBC driver to use. Default is '{ODBC Driver 17 for SQL Server}'.

    Returns:
    - conn: A pyodbc connection object if successful.
    """
    connection_string = f"""
        DRIVER={driver};
        SERVER={server};
        DATABASE={database};
        UID={username};
        PWD={password};
    """
    try:
        conn = pyodbc.connect(connection_string)
        print("Connection established successfully!")
        return conn
    except Exception as e:
        print(f"Failed to connect to the database. Error: {e}")
        return None

In [3]:
def query_data(conn, query):
    """
    Execute a SQL query and fetch results as a pandas DataFrame.
    
    Parameters:
    - conn: A pyodbc connection object.
    - query (str): The SQL query to be executed.
    
    Returns:
    - df: A pandas DataFrame containing the query result.
    """
    start_time = time.time()  # Start time measurement
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        
        # Fetch all results from the query
        rows = cursor.fetchall()
        
        # Get column names from cursor
        columns = [desc[0] for desc in cursor.description]
        
        # Create a pandas DataFrame from the results
        df = pd.DataFrame.from_records(rows, columns=columns)
        
    except pyodbc.Error as e:
        print(f"Error executing query: {e}")
        return None
    
    finally:
        cursor.close()
    
    end_time = time.time()  # End time measurement
    execution_time = end_time - start_time  # Calculate execution time
    
    # Print the DataFrame and execution time
    print(f"Query executed in: {execution_time:.4f} seconds")
    
    return df  

In [4]:
server = 'ROHIT'     
database = 'MedicareClaim'  
username = 'rohit_kosamkar'       
password = 'September@2024' 

In [5]:
# Establish connection
conn = create_sql_connection(server, database, username, password)

Connection established successfully!


In [6]:
bene_df  = query_data(conn, "select top 10 * from beneficiarydata")
bene_df.head()

Query executed in: 0.0245 seconds


Unnamed: 0,BeneID,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt
0,BENE100000,1938-03-01,,1,1,0,49,430,12,12,2,2,2,2,2,2,2,1,2,2,2,0,0,120,30
1,BENE100001,1939-08-01,,1,1,0,33,420,12,12,1,2,2,2,2,2,2,1,1,2,1,0,0,2530,540
2,BENE100002,1938-09-01,,2,2,0,33,20,12,12,1,1,2,2,1,1,1,1,2,2,1,12250,1068,1760,660
3,BENE100003,1950-06-01,,2,3,0,22,90,12,12,2,2,2,2,2,2,2,1,2,2,2,0,0,300,20
4,BENE100004,1943-06-01,,2,1,Y,15,210,12,12,2,1,1,2,1,2,1,1,1,2,2,14270,2136,1880,700


### Set Up MLflow Tracking with SQL Database

In [24]:
import mlflow
import mlflow.sklearn
from sqlalchemy import create_engine

try:
    # Set MLflow Tracking URI using SQL Server and Windows Authentication
    mlflow.set_tracking_uri("mssql+pyodbc://ROHIT/MedicareClaim?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes")
 
    # Name the experiment
    mlflow.set_experiment("MedicareClaim_Fraud_Detection")
except Exception as e:
    print(f"Connection failed: {e}")

2024/10/04 18:28:40 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/10/04 18:28:41 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl MSSQLImpl.
INFO  [alembic.runtime.migration] Will assume transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  [alem

In [26]:
import mlflow
import mlflow.sklearn
from sqlalchemy import create_engine, text
import pandas as pd

# Use a raw string to handle backslashes in the server name
mlflow_tracking_uri = (
    r"mssql+pyodbc://ROHIT/MedicareClaim"
    "?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes"
)
 
# Set the MLflow Tracking URI
mlflow.set_tracking_uri(mlflow_tracking_uri)
 
try:
    # Create an SQLAlchemy engine
    engine = create_engine(mlflow_tracking_uri)
 
    # Test the connection by running a simple query
    with engine.connect() as connection:
        # Use the text() function to create a SQL statement
        query = text("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES")
        result = connection.execute(query)
 
        # Fetch all the results and display them
        tables = result.fetchall()
 
        # Print the list of tables
        print("Connection successful! Here are the tables in the database:")
        for table in tables:
            print(table[0])
 
    # Name the experiment (if connection is successful)
    mlflow.set_experiment("MedicareClaim_Fraud_Detection")
 
except Exception as e:
    print(f"Connection failed: {e}")

Connection successful! Here are the tables in the database:
Beneficiarydata
Inpatientdata
Outpatientdata
experiments
runs
tags
metrics
params
alembic_version
experiment_tags
latest_metrics
registered_models
model_versions
registered_model_tags
model_version_tags
registered_model_aliases
datasets
inputs
input_tags
trace_info
trace_tags
trace_request_metadata


### Importing data

In [111]:
data = pd.read_csv(r'../data/interim/model_data.csv')
data.shape

(558138, 35)

In [61]:
# # Separate the classes
# df_class_0 = data[data['PotentialFraud'] == 0]
# df_class_1 = data[data['PotentialFraud'] == 1]

# # Define the number of samples you want to keep from class 0
# num_class_1 = len(df_class_1)
# df_class_0_balanced = df_class_0.sample(num_class_1, random_state=42)  # Randomly sample from class 0

# # Combine the balanced classes
# df_balanced = pd.concat([df_class_0_balanced, df_class_1])

# # Shuffle the dataset to mix the classes
# df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [63]:
# data_encoded.drop(columns={'ClaimID', 'Provider'}).corr().to_clipboard()

In [112]:
data['PotentialFraud'].value_counts()

PotentialFraud
0    345369
1    212769
Name: count, dtype: int64

In [113]:
### One-hot encoding
cat_cols = ['SamePhysician', 'OPD_Flag', 'Gender',
       'Race', 'RenalDiseaseIndicator', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke']

data_encoded = pd.get_dummies(data, columns = cat_cols,drop_first=True)
data_encoded.shape

(558138, 37)

In [114]:
data_encoded.head(2)

Unnamed: 0.1,Unnamed: 0,ClaimID,Provider,InscClaimAmtReimbursed,DeductibleAmtPaid,ClaimPeriod,TimeInHptal,Diagnosis Count,Procedures Count,PotentialFraud,FraudHistory,NoOfMonths_PartACov,NoOfMonths_PartBCov,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Age,ChronicDisease_Count,SamePhysician_Yes,OPD_Flag_Yes,Gender_Male,Race_Hispanic,Race_Other,Race_White,RenalDiseaseIndicator_Yes,ChronicCond_Alzheimer_Yes,ChronicCond_Heartfailure_Yes,ChronicCond_KidneyDisease_Yes,ChronicCond_Cancer_Yes,ChronicCond_ObstrPulmonary_Yes,ChronicCond_Depression_Yes,ChronicCond_Diabetes_Yes,ChronicCond_IschemicHeart_Yes,ChronicCond_Osteoporasis_Yes,ChronicCond_rheumatoidarthritis_Yes,ChronicCond_stroke_Yes
0,0,CLM46614,PRV55912,690,1068,6,6,9,0,1,1,12,12,15000,2670,60,70,67,7,False,False,True,False,False,True,False,True,False,True,False,False,True,True,True,False,True,True
1,1,CLM66048,PRV55907,690,1068,2,2,3,1,0,1,12,12,15000,2670,60,70,67,7,True,False,True,False,False,True,False,True,False,True,False,False,True,True,True,False,True,True


In [115]:
del data_encoded['Unnamed: 0']

In [116]:
data_encoded['PotentialFraud'].value_counts(normalize=True)

PotentialFraud
0    0.618788
1    0.381212
Name: proportion, dtype: float64

In [117]:
X = data_encoded.drop(columns='PotentialFraud')
# X = data_encoded[['ClaimID', 'Provider','OPAnnualReimbursementAmt',
#  'OPAnnualDeductibleAmt',
#  'Age',
#  'InscClaimAmtReimbursed',
#  'Diagnosis Count',
#  'ChronicDisease_Count',
#  'IPAnnualReimbursementAmt',
#  'ClaimPeriod',
#  'Gender_Male',
#  'ChronicCond_Alzheimer_Yes',
#  'ChronicCond_Osteoporasis_Yes',
#  'ChronicCond_Heartfailure_Yes']]
y = data_encoded['PotentialFraud']

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 42, stratify=y)
print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)

X_train: (390696, 35)
X_test: (167442, 35)
y_train: (390696,)
y_test: (167442,)


### MLFLOW

In [119]:
# Set the experiment name
experiment_name = "MedicareClaim_Fraud_Detection"
mlflow.set_experiment(experiment_name)  # Set or create the experiment


<Experiment: artifact_location='file:///d:/workspace/git_projects/Medicare-Claim-Fraud-Detection/scripts/mlruns/1', creation_time=1728080922230, experiment_id='1', last_update_time=1728080922230, lifecycle_stage='active', name='MedicareClaim_Fraud_Detection', tags={}>

In [120]:
# List of models to evaluate
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [121]:
def evaluate_models(models, X_train, X_test, y_train, y_test):
    # Drop non-predictive columns
    X_train_processed = X_train.drop(['ClaimID', 'Provider'], axis=1)
    X_test_processed = X_test.drop(['ClaimID', 'Provider'], axis=1)

    # Iterate through each model
    for model_name, model in models.items():
        with mlflow.start_run():
            # Train the models
            model.fit(X_train_processed, y_train)

            # Make predictions
            y_train_pred = model.predict(X_train_processed)
            y_test_pred = model.predict(X_test_processed)

            # Calculate metrics
            metrics = {
                'accuracy': {
                    'train': accuracy_score(y_train, y_train_pred),
                    'test': accuracy_score(y_test, y_test_pred)
                },
                'precision': {
                    'train': precision_score(y_train, y_train_pred),
                    'test': precision_score(y_test, y_test_pred)
                },
                'recall': {
                    'train': recall_score(y_train, y_train_pred),
                    'test': recall_score(y_test, y_test_pred)
                },
                'roc_auc': {
                    'train': roc_auc_score(y_train, model.predict_proba(X_train_processed)[:, 1]),
                    'test': roc_auc_score(y_test, model.predict_proba(X_test_processed)[:, 1])
                }
            }

            # Log metrics with model name included
            for metric_name, metric_values in metrics.items():
                mlflow.log_metric(f"{model_name}_train_{metric_name}", metric_values['train'])
                mlflow.log_metric(f"{model_name}_test_{metric_name}", metric_values['test'])

            # Log feature importance if applicable
            if hasattr(model, 'feature_importances_'):
                feature_importances = model.feature_importances_
            elif hasattr(model, 'coef_'):
                feature_importances = model.coef_[0]
            else:
                feature_importances = None

            if feature_importances is not None:
                # Create a DataFrame for better logging
                importance_df = pd.DataFrame({
                    'Feature': X_train_processed.columns,
                    'Importance': feature_importances
                }).sort_values(by='Importance', ascending=False)

                # Log the feature importances as an artifact
                importance_file_path = f"{model_name}_feature_importances.csv"
                importance_df.to_csv(importance_file_path, index=False)
                mlflow.log_artifact(importance_file_path)

            # Log the model
            mlflow.sklearn.log_model(model, model_name)

            # End the run
            mlflow.end_run()

In [122]:
# Call the function to evaluate the models
evaluate_models(models, X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



##### Run this in CMD:

mlflow ui --backend-store-uri "mssql+pyodbc://ROHIT/MedicareClaim?driver=ODBC+Driver+17+for+SQL+Server`&trusted_connection=yes"


### trial

In [97]:
# # Random Forest
# start_time = time.time()

# rf_model = RandomForestClassifier(n_estimators= 100,random_state=42)
# rf_model.fit(X_train.drop(['ClaimID', 'Provider'], axis=1), y_train)
# y_pred_rf = rf_model.predict(X_test.drop(['ClaimID', 'Provider'], axis=1))
# end_time = time.time()
# # Time taken
# execution_time = end_time - start_time
# print('Execution time: ', execution_time)
# print("Random Forest Classifier Report:")
# print(classification_report(y_test, y_pred_rf))
# print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")

In [109]:
# Random Forest
start_time = time.time()

# Initialize and fit the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split = 7, random_state=42)
rf_model.fit(X_train.drop(['ClaimID', 'Provider'], axis=1), y_train)

# Predictions
y_pred_rf_train = rf_model.predict(X_train.drop(['ClaimID', 'Provider'], axis=1))
y_pred_rf_test = rf_model.predict(X_test.drop(['ClaimID', 'Provider'], axis=1))

end_time = time.time()
# Time taken
execution_time = end_time - start_time
print('Execution time: ', execution_time)

# Metrics for Training Data
print("\nRandom Forest Classifier Training Report:")
print(classification_report(y_train, y_pred_rf_train))
print(f"Training Accuracy: {accuracy_score(y_train, y_pred_rf_train)}")
print(f"Training Precision: {precision_score(y_train, y_pred_rf_train)}")
print(f"Training Recall: {recall_score(y_train, y_pred_rf_train)}")
print(f"Training F1 Score: {f1_score(y_train, y_pred_rf_train)}")

# Metrics for Test Data
print("\nRandom Forest Classifier Test Report:")
print(classification_report(y_test, y_pred_rf_test))
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_rf_test)}")
print(f"Test Precision: {precision_score(y_test, y_pred_rf_test)}")
print(f"Test Recall: {recall_score(y_test, y_pred_rf_test)}")
print(f"Test F1 Score: {f1_score(y_test, y_pred_rf_test)}")

Execution time:  22.997344732284546

Random Forest Classifier Training Report:
              precision    recall  f1-score   support

           0       0.73      0.99      0.84    241758
           1       0.97      0.41      0.58    148938

    accuracy                           0.77    390696
   macro avg       0.85      0.70      0.71    390696
weighted avg       0.82      0.77      0.74    390696

Training Accuracy: 0.7714335442389991
Training Precision: 0.9698125098471719
Training Recall: 0.4132860653426258
Training F1 Score: 0.5795826899175173

Random Forest Classifier Test Report:
              precision    recall  f1-score   support

           0       0.73      0.99      0.84    103611
           1       0.97      0.41      0.58     63831

    accuracy                           0.77    167442
   macro avg       0.85      0.70      0.71    167442
weighted avg       0.82      0.77      0.74    167442

Test Accuracy: 0.7711506073744938
Test Precision: 0.9693847512511039
Test Rec

In [110]:
importances = rf_model.feature_importances_
feature_names = X.drop(['ClaimID', 'Provider'], axis=1).columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df.sort_values(by='Importance', ascending=False, inplace=True)
print(feature_importance_df)

                                Feature  Importance
6                          FraudHistory    0.888056
15                    SamePhysician_Yes    0.016385
16                         OPD_Flag_Yes    0.015597
3                           TimeInHptal    0.013461
1                     DeductibleAmtPaid    0.012137
5                      Procedures Count    0.006569
4                       Diagnosis Count    0.004834
11             OPAnnualReimbursementAmt    0.004670
13                                  Age    0.004624
0                InscClaimAmtReimbursed    0.004490
9              IPAnnualReimbursementAmt    0.003906
12                OPAnnualDeductibleAmt    0.003881
19                           Race_Other    0.003698
2                           ClaimPeriod    0.003533
14                 ChronicDisease_Count    0.002187
10                IPAnnualDeductibleAmt    0.001977
18                        Race_Hispanic    0.001697
17                          Gender_Male    0.000693
20          

In [75]:
feature_importance_df['Feature'][:12].to_list()

['OPAnnualReimbursementAmt',
 'OPAnnualDeductibleAmt',
 'Age',
 'InscClaimAmtReimbursed',
 'Diagnosis Count',
 'ChronicDisease_Count',
 'IPAnnualReimbursementAmt',
 'ClaimPeriod',
 'Gender_Male',
 'ChronicCond_Alzheimer_Yes',
 'ChronicCond_Osteoporasis_Yes',
 'ChronicCond_Heartfailure_Yes']

In [91]:
y_pred_dt = lr_model.predict(X_test.drop(['ClaimID', 'Provider'], axis=1))

print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_dt))
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt)}")

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.63      0.93      0.76    103611
           1       0.54      0.12      0.20     63831

    accuracy                           0.63    167442
   macro avg       0.59      0.53      0.48    167442
weighted avg       0.60      0.63      0.54    167442

Accuracy: 0.6255658675839992
