In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import classification_report
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score



In [3]:
df = pd.read_excel(r"C:\Users\Hp\anomaly_detection\Updated_Synthetic_Caseauditdetails.xlsx")
df.head()

Unnamed: 0,#,userid,formatteddisplayname,email,npi,user region,associationtype,rolename,organizationid,caseid,...,audittype,accountid,displayname,facility state,status,orgid,organizationname,city,fraud,fraud_reason
0,1,22890000,Hellen Kuria,hellenkuria@example.com,UNKNOWN,US,Ordering Facility,Pathologist Assistant,15000,8200000,...,Cases,50000,Organization 1,MA,active,15000,"Leominster Dermatology, LLP",Hudson,0,
1,2,22890000,Hellen Kuria,hellenkuria@example.com,UNKNOWN,US,Ordering Facility,Pathologist Assistant,15000,8200000,...,Cases,50000,Organization 1,MA,active,15000,"Leominster Dermatology, LLP",Hudson,0,
2,3,22890000,Hellen Kuria,hellenkuria@example.com,UNKNOWN,US,Ordering Facility,Pathologist Assistant,15000,8200000,...,Cases,50000,Organization 1,MA,active,15000,"Leominster Dermatology, LLP",Hudson,1,Timeline anomaly: future or illogical audit date
3,4,22890000,Hellen Kuria,hellenkuria@example.com,UNKNOWN,US,Ordering Facility,Pathologist Assistant,15000,8200000,...,Cases,50000,Organization 1,MA,active,15000,"Leominster Dermatology, LLP",Hudson,1,Numeric anomaly: abnormal timetaken
4,5,22890000,Hellen Kuria,hellenkuria@example.com,UNKNOWN,US,Ordering Facility,Pathologist Assistant,15000,8200000,...,Cases,50000,Organization 1,MA,active,15000,"Leominster Dermatology, LLP",Hudson,0,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   #                     1200 non-null   int64 
 1   userid                1200 non-null   int64 
 2   formatteddisplayname  1200 non-null   object
 3   email                 1200 non-null   object
 4   npi                   1200 non-null   object
 5   user region           1200 non-null   object
 6   associationtype       1200 non-null   object
 7   rolename              1200 non-null   object
 8   organizationid        1200 non-null   int64 
 9   caseid                1200 non-null   int64 
 10  auditid               1200 non-null   int64 
 11  actionperformed       1200 non-null   object
 12  actiondetails         1200 non-null   object
 13  timetaken             1200 non-null   int64 
 14  auditdate             1200 non-null   object
 15  casestatus            1200 non-null   

In [5]:
df.columns

Index(['#', 'userid', 'formatteddisplayname', 'email', 'npi', 'user region',
       'associationtype', 'rolename', 'organizationid', 'caseid', 'auditid',
       'actionperformed', 'actiondetails', 'timetaken', 'auditdate',
       'casestatus', 'audittype', 'accountid', 'displayname', 'facility state',
       'status', 'orgid', 'organizationname', 'city', 'fraud', 'fraud_reason'],
      dtype='object')

In [6]:
# columns to drop
df = df.drop(['#', 'organizationid','accountid','displayname','orgid','fraud_reason'], axis=1)

In [7]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)

userid                  0
formatteddisplayname    0
email                   0
npi                     0
user region             0
associationtype         0
rolename                0
caseid                  0
auditid                 0
actionperformed         0
actiondetails           0
timetaken               0
auditdate               0
casestatus              0
audittype               0
facility state          0
status                  0
organizationname        0
city                    0
fraud                   0
dtype: int64


In [8]:
import pandas as pd

# Assuming df is your DataFrame
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)


Numerical columns: ['userid', 'caseid', 'auditid', 'timetaken', 'fraud']
Categorical columns: ['formatteddisplayname', 'email', 'npi', 'user region', 'associationtype', 'rolename', 'actionperformed', 'actiondetails', 'auditdate', 'casestatus', 'audittype', 'facility state', 'status', 'organizationname', 'city']


In [9]:
X = df.drop(['userid','caseid','auditid','fraud'], axis = 1)
y = df['fraud']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)


In [11]:
X_train.shape

(840, 16)

In [12]:
X_test.shape

(360, 16)

In [13]:
# Encoding categorical variables using LabelEncoder 
categorical_cols = X_train.select_dtypes(include=['object']).columns
print(categorical_cols)
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])

Index(['formatteddisplayname', 'email', 'npi', 'user region',
       'associationtype', 'rolename', 'actionperformed', 'actiondetails',
       'auditdate', 'casestatus', 'audittype', 'facility state', 'status',
       'organizationname', 'city'],
      dtype='object')


In [14]:
print(pd.Series(y_train).value_counts())

fraud
0    504
1    336
Name: count, dtype: int64


In [15]:
print(pd.Series(y_test).value_counts())

fraud
0    216
1    144
Name: count, dtype: int64


In [16]:
rf_model = RandomForestClassifier(
    n_estimators = 100,
    max_depth = 10,
    min_samples_split=5,
    random_state=42
)

In [17]:
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='f1')
print(f"Cross-validation accuracies: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores):.4f} +/- {np.std(cv_scores):.4f}")

Cross-validation accuracies: [1.        1.        1.        1.        0.9924812]
Mean accuracy: 0.9985 +/- 0.0030


In [18]:
rf_model.fit(X_train, y_train)

In [19]:
# Encoding categorical variables using LabelEncoder 
categorical_cols = X_test.select_dtypes(include=['object']).columns
print(categorical_cols)
for col in categorical_cols:
    le = LabelEncoder()
    X_test[col] = le.fit_transform(X_test[col])

Index(['formatteddisplayname', 'email', 'npi', 'user region',
       'associationtype', 'rolename', 'actionperformed', 'actiondetails',
       'auditdate', 'casestatus', 'audittype', 'facility state', 'status',
       'organizationname', 'city'],
      dtype='object')


In [20]:
y_pred = rf_model.predict(X_test)

In [21]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       216
           1       1.00      0.77      0.87       144

    accuracy                           0.91       360
   macro avg       0.93      0.89      0.90       360
weighted avg       0.92      0.91      0.91       360



In [33]:
# Instantiate Isolation Forest with expected contamination rate same as fraud proportion in training set
iso_forest = IsolationForest(contamination=0.3, random_state=42)
iso_forest.fit(X_train)

# Predict anomalies on test set (-1 means anomaly, 1 means normal)
preds = iso_forest.predict(X_test)

In [34]:
# Convert predictions: anomaly (-1) to 1 (fraud), normal (1) to 0 (non-fraud)
pred_labels = [1 if p == -1 else 0 for p in preds]

# Evaluate the predictions
accuracy = accuracy_score(y_test, pred_labels)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, pred_labels))

Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.78      0.79       216
           1       0.68      0.70      0.69       144

    accuracy                           0.75       360
   macro avg       0.74      0.74      0.74       360
weighted avg       0.75      0.75      0.75       360

