In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1. Load the data
# NOTE(review): absolute, machine-specific path; as written the notebook only runs on the
# author's machine. Consider a path relative to the project folder.
df = pd.read_csv("/Users/pavithragunasekaran/Documents/sem_3/Dab311_Deep_Learning/venv/capstone_project/Final data.csv")

# The target is kept as string labels ("Denied", "Paid", "Pending", "Adjusted");
# scikit-learn classifiers accept string classes directly, so no numeric mapping is applied.

# 2. Separate features X and target y
# Rows without a label can neither be stratified on nor trained with, so they are dropped
# here. `df` itself is left untouched so the raw frame stays available.
labelled = df.dropna(subset=["Claim Status"])
X = labelled.drop(columns=["Claim Status"])
y = labelled["Claim Status"]
# NOTE(review): the recorded test accuracy (~0.9995) is suspiciously high. Presumably some
# feature (e.g. "Reason Code" or a claim identifier) leaks the outcome; verify which
# columns are legitimately known at prediction time before trusting the score.

# 3. Identify numeric and categorical columns
# include="number" picks up every numeric dtype (int32, float32, nullable ints, ...), not
# only int64/float64, so no numeric column is one-hot encoded by accident.
num_cols = X.select_dtypes(include="number").columns
cat_cols = [c for c in X.columns if c not in num_cols]

# 4. Preprocessing
# ColumnTransformer applies a different transformer to each group of columns and
# concatenates the results into one feature matrix.
preprocess = ColumnTransformer([
    # numeric columns: fill missing values with the column median
    ("num", SimpleImputer(strategy="median"), num_cols),
    # categorical columns: one-hot encode into 0/1 indicator columns; categories unseen
    # during fit are encoded as all zeros instead of raising at predict time
    ("cat", Pipeline([
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols)
])

# 5. Logistic Regression model (preprocessing and classifier chained in a single pipeline,
# so the preprocessing is fitted on the training split only)
model = Pipeline([
    ("preprocess", preprocess),
    ("logistic", LogisticRegression(max_iter=5000))
])

# 6. Split the data into training and test sets
# stratify=y keeps the class proportions of "Claim Status" the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 7. Train the model
model.fit(X_train, y_train)

# 8. Predict and evaluate
pred = model.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print("Logistic Model Accuracy:",accuracy)

# Count the predictions per class (sorted by class label)
prediction_counts = pd.Series(pred).value_counts().sort_index()

print("Prediction counts by Claim Status:",prediction_counts)




Logistic Model Accuracy: 0.9994997498749375
Prediction counts by Claim Status: Adjusted      27
Denied       167
Paid          52
Pending     1753
Name: count, dtype: int64


In [None]:

import os
import pandas as pd

# Settings read by main() below.
# NOTE(review): absolute, machine-specific path (the same file the modelling cell above
# loads); the notebook will not run elsewhere without editing it.
data = "/Users/pavithragunasekaran/Documents/sem_3/Dab311_Deep_Learning/venv/capstone_project/Final data.csv"
# Name of the column holding the claim status.
status_col = "Claim Status"
# Name of the column holding the reason code (e.g. CO-119).
reasoncode = "Reason Code"
# Folder (relative to the working directory) that receives the exported CSV files.
output_directory = "outputs"

def main(data_path=None, out_dir=None, status_column=None, reason_column=None):
    """Export denied/pending claims, and their CO-119 subset, to CSV files.

    Every argument is optional and falls back to the module-level setting, so a
    plain ``main()`` call uses the configured file, folder and column names.

    Args:
        data_path: CSV file to read (default: ``data``).
        out_dir: Folder for the exported files; created if missing
            (default: ``output_directory``).
        status_column: Name of the claim-status column (default: ``status_col``).
        reason_column: Name of the reason-code column (default: ``reasoncode``).

    Writes:
        ``<out_dir>/denied_and_pending_claims.csv``: all denied or pending rows.
        ``<out_dir>/co119_denied_pending_claims.csv``: the rows of that subset
        whose reason code is CO-119 / CO119.

    Returns:
        None. The row counts and the output folder are printed.
    """
    data_path = data if data_path is None else data_path
    out_dir = output_directory if out_dir is None else out_dir
    status_column = status_col if status_column is None else status_column
    reason_column = reasoncode if reason_column is None else reason_column

    # Create the output folder that will hold the separated files.
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(data_path)

    # Normalise the status (trim spaces, lowercase) in a standalone Series rather than a
    # new DataFrame column, so the helper values do not end up in the exported CSVs.
    status_norm = df[status_column].astype(str).str.strip().str.lower()

    # Denied OR pending claims. "denied" does not contain the substring "deny", so the
    # stem "deni" (denied, denial) is matched in addition to "deny".
    is_denied_or_pending = status_norm.str.contains(r"deni|deny|pend", regex=True, na=False)
    denied_pending_df = df[is_denied_or_pending]

    # Save the denied and pending rows for all reason codes.
    denied_pending_data = os.path.join(out_dir, "denied_and_pending_claims.csv")
    denied_pending_df.to_csv(denied_pending_data, index=False)

    # Keep only CO-119 rows from the denied/pending subset: uppercase the reason code and
    # remove spaces so "co-119", "CO 119" and "CO119" all compare equal.
    reason_norm = (
        denied_pending_df[reason_column]
        .astype(str)
        .str.upper()
        .str.replace(" ", "", regex=False)  # literal space; do not rely on the regex default
    )
    co119_df = denied_pending_df[reason_norm.isin(["CO-119", "CO119"])]

    # Save the denied and pending rows for the CO-119 reason code.
    co119_data = os.path.join(out_dir, "co119_denied_pending_claims.csv")
    co119_df.to_csv(co119_data, index=False)

    print("Rows with Denied/Pending:", len(denied_pending_df))
    print("Rows with CO-119:", len(co119_df))
    print("Files saved in folder:", out_dir)

# Run the export only when this code is executed directly (as a script or as a notebook
# cell). When the file is imported as a module, __name__ is not "__main__", so main() is
# not called automatically, which prevents unintended execution.
if __name__ == "__main__":
    main()

Rows with Denied/Pending: 8770
Rows with CO-119: 5511
Files saved in folder: outputs
