In [2]:
import pandas as pd
import pickle

In [3]:
data = pd.read_csv("./subset_df_2.csv")
data.head()

Unnamed: 0,Company,File No.,Opened,Closed,Coverage,SubCoverage,Reason,SubReason,Disposition,Conclusion,Recovery,Status
0,"Oxford Health Plans (CT), Inc",7054984,06/08/2023,06/13/2023,Group,Health Only,Claim Handling,Medical Necessity,Company Position Substantiated,Coverage Denied,0.0,Closed
1,"ConnectiCare Benefits, Inc.",7046842,07/27/2022,08/31/2022,A & H,Exchange,Claim Handling,Unsatisfactory Settlement,Company Position Substantiated,Company Position Upheld,0.0,Closed
2,ReliaStar Life Insurance Company,7056274,08/01/2023,09/06/2023,Individual Annuities,Fixed,Claim Handling,Prompt Pay,Company Position Substantiated,Furnished Information,0.0,Closed
3,"Anthem Health Plans, Inc",7045021,05/09/2022,06/09/2022,Group,A & H,Claim Handling,UR MEDICALLY NECESSARY DENIAL,Company Position Substantiated,External Review Info Sent,0.0,Closed
4,Underwriters at Lloyds London,7019177,05/13/2019,06/14/2019,Commercial Multi-Peril,Commercial Fire,Claim Handling,Unsatisfactory Settlement/Offer,Company Position Substantiated,Refer To Appraisal,0.0,Closed


# Transform Categorical Columns

In [4]:
import pandas as pd
# Convert 'Company', 'Coverage', 'SubCoverage', 'Reason', 'SubReason', 'Disposition', 'Conclusion', 'Status' to categorical
for column in ['Company', 'Coverage', 'SubCoverage', 'Reason', 'SubReason', 'Disposition', 'Conclusion', 'Status']:
  data[column] = pd.Categorical(data[column])

# Convert 'Opened' and 'Closed' to datetime
data['Opened'] = pd.to_datetime(data['Opened'], errors='coerce')
data['Closed'] = pd.to_datetime(data['Closed'], errors='coerce')

# Convert 'Recovery' to float
data['Recovery'] = pd.to_numeric(data['Recovery'], errors='coerce')

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Company      180 non-null    category      
 1   File No.     180 non-null    int64         
 2   Opened       180 non-null    datetime64[ns]
 3   Closed       180 non-null    datetime64[ns]
 4   Coverage     180 non-null    category      
 5   SubCoverage  180 non-null    category      
 6   Reason       180 non-null    category      
 7   SubReason    180 non-null    category      
 8   Disposition  180 non-null    category      
 9   Conclusion   180 non-null    category      
 10  Recovery     180 non-null    float64       
 11  Status       180 non-null    category      
dtypes: category(8), datetime64[ns](2), float64(1), int64(1)
memory usage: 12.3 KB


# Curate Data in Query and Response structure

More Question and response can be added to make chat result better

In [5]:
claim_data = data.copy()

# Generate query-response pairs
query_response_data = []

for idx, row in claim_data.iterrows():
    # Query 1: Claim status
    query_response_data.append({
        'query': f"What is the status of my claim with file number {row['File No.']}?",
        'response': f"Your claim with File No. {row['File No.']} is currently {row['Status']}."
    })

    # # Query 2: Recovery amount
    # query_response_data.append({
    #     'query': f"What was the recovery amount for claim number {row['File No.']}?",
    #     'response': f"The recovery amount for your claim with File No. {row['File No.']} is ${row['Recovery']}."
    # })

    # Query 3: Denial reason (if applicable)
    if row['Disposition'] == 'Claim Settled':
        query_response_data.append({
            'query': f"Why was my claim with file number {row['File No.']} Claim Settled?",
            'response': f"Your claim with File No. {row['File No.']} was Settled on  {row['Closed']}."
        })

    # Query 4: Claim open and close dates
    query_response_data.append({
        'query': f"When was my claim with file number {row['File No.']} opened and closed?",
        'response': f"Your claim with File No. {row['File No.']} was opened on {row['Opened']} and closed on {row['Closed']}."
    })

    # # Query 5: Disposition reason
    # query_response_data.append({
    #     'query': f"Why is my claim with file number {row['File No.']} {row['Disposition']}?",
    #     'response': f"Your claim with File No. {row['File No.']} was {row['Disposition']} due to {row['SubReason']}."
    # })



# Saving query_response file for future use

In [6]:
# Save the trained vectorizer as a pickle file
with open('query_response_data.pkl', 'wb') as f:
    pickle.dump(query_response_data, f)

# Convert to DataFrame
query_response_df = pd.DataFrame(query_response_data)
query_response_df.head()


Unnamed: 0,query,response
0,What is the status of my claim with file numbe...,Your claim with File No. 7054984 is currently ...
1,When was my claim with file number 7054984 ope...,Your claim with File No. 7054984 was opened on...
2,What is the status of my claim with file numbe...,Your claim with File No. 7046842 is currently ...
3,When was my claim with file number 7046842 ope...,Your claim with File No. 7046842 was opened on...
4,What is the status of my claim with file numbe...,Your claim with File No. 7056274 is currently ...


# Load Model

In [7]:
from transformers import GPTNeoForCausalLM, AutoTokenizer
import torch
import pandas as pd

# Load the dataset
data = query_response_data # You can use pickle file as well


# Load GPT-Neo model and tokenizer
model_name = "EleutherAI/gpt-neo-125M"  # You can choose a larger model like gpt-neo-1.3B if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)


# Few Shot Learning

In [8]:
df = query_response_df
# Create a prompt with a few examples for in-context learning
def generate_prompt(user_input):
    examples = ""
    for i in range(min(len(df), 3)):  # Include 3 examples from the dataframe
        examples += f"Q: {df['query'].iloc[i]}\nA: {df['response'].iloc[i]}\n\n"

    # Combine examples with the user's question
    prompt = examples + f"Q: {user_input}\nA:"
    return prompt

# Function to generate response using GPT-Neo
def gpt_neo_response(user_input):
    prompt = generate_prompt(user_input)
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    # Generate response
    output = model.generate(
        **inputs,
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the generated response
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    # Extract only the answer from the response (after the user's query)
    response = response.split("A:")[-1].strip()
    return response


# Inference for GPT Neo - One Shot Learning

In [11]:
user_input = "What is status of file 7066579"
response = gpt_neo_response(user_input)
response

'Your claim with File No. 7066579 is currently Closed.'

# Training on Custom Data

#!pip install transformers datasets

In [12]:
listofQuestion = query_response_data

# Convert list of Query Response to text file

In [13]:
# Convert data to the desired text format
with open("query_response_dataset.txt", "w") as file:
    for item in listofQuestion:
        query = item['query']
        response = item['response']
        file.write(f"Q: {query}\nA: {response}\n<|endoftext|>\n")

# Read text file

In [14]:
def read_sentences(filename, num_sentences=3):
    with open(filename, 'r') as file:
        text = file.read()
    
    # Split the text into sentences (assuming sentences end with a period)
    sentences = text.split('. ')
    
    # Return the first 'num_sentences' sentences
    return '. '.join(sentences[:num_sentences]) + '.'



In [15]:
# Example usage
filename = 'query_response_dataset.txt'
sentences = read_sentences(filename, num_sentences=5)  # Read first 5 sentences
print(sentences)


Q: What is the status of my claim with file number 7054984?
A: Your claim with File No. 7054984 is currently Closed.
<|endoftext|>
Q: When was my claim with file number 7054984 opened and closed?
A: Your claim with File No. 7054984 was opened on 2023-06-08 00:00:00 and closed on 2023-06-13 00:00:00.
<|endoftext|>
Q: What is the status of my claim with file number 7046842?
A: Your claim with File No. 7046842 is currently Closed.
<|endoftext|>
Q: When was my claim with file number 7046842 opened and closed?
A: Your claim with File No. 7046842 was opened on 2022-07-27 00:00:00 and closed on 2022-08-31 00:00:00.
<|endoftext|>
Q: What is the status of my claim with file number 7056274?
A: Your claim with File No.


# Prepare Data for Training

In [None]:
import torch
from transformers import GPTNeoForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Load the tokenizer and model
model_name = "EleutherAI/gpt-neo-125M"  # Change to a larger model if needed (e.g., gpt-neo-1.3B)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)

# Function to load and tokenize dataset
def load_dataset(file_path, tokenizer, block_size=512):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )
    return dataset

# Prepare data collator
def prepare_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=False  # GPT-Neo does causal (autoregressive) language modeling, so MLM is False
    )

# Load the dataset (assuming your data is in a text file)
train_dataset = load_dataset("./query_response_dataset.txt", tokenizer)

# Prepare the data collator
data_collator = prepare_collator(tokenizer)


# Training the model and Saving

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt_neo_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,          # You can adjust the number of epochs
    per_device_train_batch_size=2,
    save_steps=500,              # Save checkpoint every 500 steps
    save_total_limit=2,          # Only keep 2 last checkpoints
    logging_dir="./logs",        # Directory for logging
    logging_steps=10,            # Log every 10 steps
    learning_rate=5e-5,          # Fine-tuning learning rate
    weight_decay=0.01,           # Weight decay for regularization
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Start the fine-tuning process
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("./gpt_neo_finetuned_model")
tokenizer.save_pretrained("./gpt_neo_finetuned_tokenizer")


# Load Pretrained Model

In [16]:
# Load the fine-tuned model and tokenizer
model_path = "./gpt_neo_finetuned_model"  # Path to your fine-tuned model
tokenizer_path = "./gpt_neo_finetuned_tokenizer"  # Path to your fine-tuned tokenizer

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = GPTNeoForCausalLM.from_pretrained(model_path)

# Building Inference

In [17]:
# Set the padding token to eos_token (or add a new pad_token if desired)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
    # Alternatively, you can define a new padding token
    # tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Ensure the model knows about the new special token if one was added
model.resize_token_embeddings(len(tokenizer))

# Function to generate a prompt for the model
def generate_prompt(user_input):
    # Customize the prompt to follow your query-response format
    prompt = f"Q: {user_input}\nA:"
    return prompt

# Function to check if response is valid or needs fallback
def is_valid_response(response):
    # Check if the response is long enough and doesn't just repeat the question
    min_length = 10  # Minimum length for a valid response
    if len(response) < min_length:
        return False
    
    # You can add other heuristics, e.g., if response contains certain phrases, return False
    if "I don't know" in response or response.strip() == "":
        return False
    
    return True


# Function to generate response using the fine-tuned GPT-Neo model
def gpt_neo_response(user_input, max_new_tokens=50, temperature=0.7, top_p=0.9, top_k=50):
    # Prepare the prompt
    prompt = generate_prompt(user_input)
    
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True, padding=True)
    
    # Generate response
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,  # Control how many tokens are generated
        do_sample=True,                 # Use sampling for diverse responses
        temperature=temperature,        # Control randomness of the output
        top_p=top_p,                    # Nucleus sampling to focus on the top probability mass
        top_k=top_k,                    # Limits sampling to top-k tokens
        pad_token_id=tokenizer.pad_token_id,  # Use the defined padding token
        #attention_mask=inputs['attention_mask']
    )
    
    # Decode the generated tokens into text
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Extract the relevant answer (anything after "A:")
    if "A:" in response:
        response = response.split("A:")[-1].strip()
    
    # Check if the response is valid, otherwise return a fallback message
    if not is_valid_response(response):
        response = "I'm sorry, I didn't understand that. Could you please rephrase your question?"


    return response


In [18]:
# Example usage
user_input = "What is the status of my claim with file number 7057039?"
response = gpt_neo_response(user_input)
print(response)

Your claim with File No. 7057039 is currently Reopened.
