In [1]:
import pandas as pd

In [2]:
# Notebook-wide pandas display configuration (purely cosmetic; does not affect the data).
_DISPLAY_OPTIONS = {
    'display.max_columns': None,  # None = never truncate the column list
    'display.max_rows': 10,       # long frames are elided to 10 rows
    'display.width': 1000,        # characters available per rendered line
}
for _option, _value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option, _value)

In [4]:
# Load the provider-level table: one row per Provider with inpatient/outpatient
# claim totals and the PotentialFraud label (see the rendered output below).
# NOTE(review): the file name has no '.' before 'csv' -- confirm it matches the file on disk.
final_df = pd.read_csv(filepath_or_buffer='Basic_Provider_Merge_csv')
final_df

Unnamed: 0,Provider,IP_Claims_Total,OP_Claims_Total,PotentialFraud
0,PRV51001,97000.0,7640.0,No
1,PRV51003,573000.0,32670.0,Yes
2,PRV51004,0.0,52170.0,No
3,PRV51005,0.0,280910.0,Yes
4,PRV51007,19000.0,14710.0,No
...,...,...,...,...
5405,PRV57759,0.0,10640.0,No
5406,PRV57760,0.0,4770.0,No
5407,PRV57761,0.0,18470.0,No
5408,PRV57762,0.0,1900.0,No


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Turn the "Yes"/"No" PotentialFraud labels into integers.
# Resulting codes: 1 = Fraud ("Yes"), 0 = Non-Fraud ("No").
le = LabelEncoder()
encoded_labels = le.fit_transform(final_df["PotentialFraud"])
# Overwrite the text column in place with its numeric encoding.
final_df["PotentialFraud"] = encoded_labels

In [6]:
# Re-display the frame to check that PotentialFraud now holds 0/1 instead of "No"/"Yes".
final_df

Unnamed: 0,Provider,IP_Claims_Total,OP_Claims_Total,PotentialFraud
0,PRV51001,97000.0,7640.0,0
1,PRV51003,573000.0,32670.0,1
2,PRV51004,0.0,52170.0,0
3,PRV51005,0.0,280910.0,1
4,PRV51007,19000.0,14710.0,0
...,...,...,...,...
5405,PRV57759,0.0,10640.0,0
5406,PRV57760,0.0,4770.0,0
5407,PRV57761,0.0,18470.0,0
5408,PRV57762,0.0,1900.0,0


In [7]:
# Define features and target variable
X = final_df[["IP_Claims_Total", "OP_Claims_Total"]]
y = final_df["PotentialFraud"]

# Split the dataset into training and testing sets (80-20 split).
# The split happens BEFORE scaling so that the scaler never sees the test rows;
# fitting it on the full dataset would leak test-set mean/variance into training.
# The same random_state keeps the row partition identical to splitting scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: learn mean/std on the training rows only,
# then apply that same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics, kept under its
# original name in case later cells rely on it.
X_scaled = scaler.transform(X)

# Train a logistic regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predictions on the held-out test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
# NOTE(review): the classes look imbalanced (105 positives out of 1082 test rows in the
# recorded output), so accuracy alone is a weak signal -- consider stratify=y and/or
# class_weight="balanced" once the baseline is confirmed.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [9]:
import numpy as np
# Helper that renders the standard classification metrics as readable text.
def print_model_evaluation(y_test, y_pred):
    """Print accuracy, confusion matrix and classification report for predictions.

    Parameters
    ----------
    y_test
        True labels, forwarded to the scikit-learn metric functions.
    y_pred
        Predicted labels to compare against ``y_test``.

    Returns
    -------
    None
        Everything is written to standard output. The heading is fixed to
        "Logistic Regression Results".
    """
    matrix_text = np.array2string(
        confusion_matrix(y_test, y_pred),
        formatter={'int': '{:4d}'.format},  # each count right-aligned in a 4-character field
    )
    report_text = classification_report(y_test, y_pred, digits=2)
    score = accuracy_score(y_test, y_pred)

    # One entry per print() call; embedded newlines produce the blank separator lines.
    output_lines = (
        "\n### Logistic Regression Results ###\n",
        f"**Accuracy:** {score:.4f}\n",
        "**Confusion Matrix:**",
        matrix_text,
        "\n",
        "**Classification Report:**",
        report_text,
    )
    for line in output_lines:
        print(line)
# Report the metrics for the held-out test labels versus the model's predictions.
print_model_evaluation(y_test=y_test, y_pred=y_pred)


### Logistic Regression Results ###

**Accuracy:** 0.9298

**Confusion Matrix:**
[[ 965   12]
 [  64   41]]


**Classification Report:**
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       977
           1       0.77      0.39      0.52       105

    accuracy                           0.93      1082
   macro avg       0.86      0.69      0.74      1082
weighted avg       0.92      0.93      0.92      1082

