<a href="https://colab.research.google.com/github/rodayna-moamen/fraud_detection_project/blob/main/02_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


In [2]:
import pandas as pd

# Load the train and test data
train_df = pd.read_csv('/content/train_processed.csv')
test_df = pd.read_csv('/content/test_processed.csv')

# Inspect the first few rows of the train and test data
print(train_df.head())
print(test_df.head())


   Provider  TotalClaims  UniquePatients  TotalReimbursed  AvgReimbursed  \
0  PRV51001           25              24           104640    4185.600000   
1  PRV51003          132             117           605670    4588.409091   
2  PRV51004          149             138            52170     350.134228   
3  PRV51005         1165             495           280910     241.124464   
4  PRV51007           72              58            33710     468.194444   

   StdReimbursed  MaxReimbursed  MinReimbursed  TotalDeductible  \
0   10796.091144          42000             10           5340.0   
1    7309.794729          57000              0          66286.0   
2     689.963754           3300              0            310.0   
3     491.556392           4080              0           3700.0   
4    1433.769116          10000              0           3264.0   

   AvgDeductible  ...  Cancer_Rate  ObstrPulmonary_Rate  Depression_Rate  \
0     213.600000  ...     0.200000             0.400000         

In [8]:
# Identify numerical and categorical features
numeric_features = train_df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = train_df.select_dtypes(include=['object']).columns

# Ensure the target column 'Fraud' is not included in the features for X
if 'Fraud' in numeric_features:
    numeric_features = numeric_features.drop('Fraud')
elif 'Fraud' in categorical_features:
    categorical_features = categorical_features.drop('Fraud')

In [10]:
# Preprocessing pipeline for numerical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values with mean imputation
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Preprocessing pipeline for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combine both pipelines using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [11]:
# Final preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [13]:
# Define X and y
X_train = train_df.drop(columns=['Fraud'])  # Features (excluding the target)
y_train = train_df['Fraud']  # Target (Fraud labels)

# Apply the preprocessing pipeline to X_train
X_train_processed = preprocessing_pipeline.fit_transform(X_train)

# Initialize SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Apply SMOTE to balance the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

# Check the class distribution after SMOTE
print("Class distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

Class distribution after SMOTE:
Fraud
0    4904
1    4904
Name: count, dtype: int64


In [14]:
# Split the resampled training data into train and validation sets (80/20 split)
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train_resampled, y_train_resampled, test_size=0.2, random_state=42, stratify=y_train_resampled)

# Check the shape of the splits
print(X_train_final.shape, X_val.shape)


(7846, 5438) (1962, 5438)


In [15]:
# Initialize and train Logistic Regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_final, y_train_final)

# Make predictions on the validation set
y_pred_lr = lr_model.predict(X_val)

# Evaluate the Logistic Regression model
print("Logistic Regression Metrics:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_lr)}")
print(f"Precision: {precision_score(y_val, y_pred_lr)}")
print(f"Recall: {recall_score(y_val, y_pred_lr)}")
print(f"F1 Score: {f1_score(y_val, y_pred_lr)}")
print(f"ROC AUC: {roc_auc_score(y_val, y_pred_lr)}")


Logistic Regression Metrics:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC AUC: 1.0


In [16]:
# Initialize and train Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_final, y_train_final)

# Make predictions on the validation set
y_pred_dt = dt_model.predict(X_val)

# Evaluate the Decision Tree model
print("Decision Tree Metrics:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_dt)}")
print(f"Precision: {precision_score(y_val, y_pred_dt)}")
print(f"Recall: {recall_score(y_val, y_pred_dt)}")
print(f"F1 Score: {f1_score(y_val, y_pred_dt)}")
print(f"ROC AUC: {roc_auc_score(y_val, y_pred_dt)}")


Decision Tree Metrics:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC AUC: 1.0


In [17]:
# Evaluation function
def evaluate_model(model, X_val, y_val):
    y_pred = model.predict(X_val)
    print(f"Accuracy: {accuracy_score(y_val, y_pred)}")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"F1 Score: {f1_score(y_val, y_pred)}")
    print(f"ROC AUC: {roc_auc_score(y_val, y_pred)}")

# Evaluate Logistic Regression and Decision Tree
evaluate_model(lr_model, X_val, y_val)
evaluate_model(dt_model, X_val, y_val)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC AUC: 1.0
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC AUC: 1.0
