## Basic setting

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

In [None]:
%%time
# Mount Google Drive at /content/gdrive/ so the dataset can be read from it.
# Everyone needs to save the dataset in their own Drive first.
drive.mount("/content/gdrive/")

Mounted at /content/gdrive/
CPU times: user 1.61 s, sys: 345 ms, total: 1.95 s
Wall time: 32.9 s


In [None]:
%%time
# Read the complaints CSV from the mounted Drive into a DataFrame.
complaints = pd.read_csv('/content/gdrive/MyDrive/complaints.csv')

CPU times: user 30.6 s, sys: 3.35 s, total: 34 s
Wall time: 52 s


In [None]:
# Keep only the rows whose 'Consumer complaint narrative' is present
has_narrative = complaints['Consumer complaint narrative'].notna()
complaints = complaints[has_narrative]

In [None]:
# Number of missing values in each column
complaints.isna().sum()

Date received                         0
Product                               0
Sub-product                       52207
Issue                                 0
Sub-issue                        213727
Consumer complaint narrative          0
Company public response          708410
Company                               0
State                              5488
ZIP code                              0
Tags                            1258107
Consumer consent provided?            0
Submitted via                         0
Date sent to company                  0
Company response to consumer          2
Timely response?                      0
Consumer disputed?              1296092
Complaint ID                          0
dtype: int64

In [None]:
# Narrow the frame to the company name and the free-text narrative
columns_of_interest = ['Company', 'Consumer complaint narrative']
complaints_revised = complaints[columns_of_interest]

In [None]:
# Choose five banks for the sample.
# isin() does exact string matching, so a misspelled name silently drops that
# bank instead of raising an error.
company_list = [
    'WELLS FARGO & COMPANY',
    'BANK OF AMERICA, NATIONAL ASSOCIATION',
    'CAPITAL ONE FINANCIAL CORPORATION',
    'JPMORGAN CHASE & CO.',  # fixed typo: was 'JPMORGAN CHASE $ CO.'
    'CITIBANK, N.A.',
]
complaints_revised = complaints_revised[complaints_revised['Company'].isin(company_list)]
complaints_revised.head(10)

Unnamed: 0,Company,Consumer complaint narrative
79,"CITIBANK, N.A.",I am disputing a charge of {$2100.00} to my XX...
91,"BANK OF AMERICA, NATIONAL ASSOCIATION",I receive grants from the us government of tre...
95,WELLS FARGO & COMPANY,I was called and emailed a letter from XXXX XX...
263,WELLS FARGO & COMPANY,I found a home online through XXXX and rented ...
266,"BANK OF AMERICA, NATIONAL ASSOCIATION",on XX/XX/2023 i deposited a check into my acco...
275,"BANK OF AMERICA, NATIONAL ASSOCIATION",On XX/XX/2023 received the following emails ad...
293,WELLS FARGO & COMPANY,"At XXXX on today, XX/XX/2023, I received an em..."
297,"BANK OF AMERICA, NATIONAL ASSOCIATION",I received an direct deposit in my account. Af...
315,"BANK OF AMERICA, NATIONAL ASSOCIATION",I purchased an officer chair in XXXX the compa...
317,WELLS FARGO & COMPANY,On XX/XX/ at XXXX XXXX XXXX XXXX the consumer ...


In [None]:
# Summarise the filtered frame: entry count, per-column non-null counts, dtypes and memory usage.
complaints_revised.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 141043 entries, 79 to 4026929
Data columns (total 2 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Company                       141043 non-null  object
 1   Consumer complaint narrative  141043 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


In [None]:
import random

# Specify the sample size you want - 50
sample_size = 50

# Fixed seed so that re-running the notebook draws the same complaints;
# without it every run samples different narratives and the prompt
# experiments below cannot be reproduced.
sample_seed = 42

# Extract the 'Consumer complaint narrative' column as a list
narratives = complaints_revised['Consumer complaint narrative'].tolist()

# random.sample raises ValueError when asked for more items than exist, so
# only sample when there are more narratives than the desired sample size.
if len(narratives) > sample_size:
    # A private Random instance keeps the seed from affecting other users of
    # the global random module.
    random_samples = random.Random(sample_seed).sample(narratives, sample_size)
else:
    random_samples = narratives

# Create a DataFrame from the sampled narratives
sample = pd.DataFrame({'Consumer complaint narrative': random_samples})

In [None]:
# Preview the first ten sampled narratives
sample.head(10)

Unnamed: 0,Consumer complaint narrative
0,Citibank XXXX brooks brothers XXXX XXXX XXXX r...
1,I Contacted the bank a few months back about a...
2,We in the process of contacting the credit bur...
3,This complaint concerns the XXXX XXXX XXXX cre...
4,"To whom it may concern, I am writing in respon..."
5,I applied for a Citigold checking account with...
6,This card was open XXXX or XXXX the name is ca...
7,"Hi, On XX/XX/2017 Bank of America send a wire ..."
8,I have a checking account with wells Fargo. Du...
9,Last year I was notified that I had a debt fro...


In [None]:
# Confirm the sampled narratives contain no missing values
sample.isna().sum()

Consumer complaint narrative    0
dtype: int64

# 0_Setup
#### Load the API key and relevant Python libraries.

In [None]:
import os
from getpass import getpass

# Never hardcode the key in the notebook: notebooks are shared and committed,
# and a literal key leaks with them.
# Each person uses their own key, taken from the OPENAI_API_KEY environment
# variable or, when that is not set, typed into a hidden prompt.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')

In [None]:
!pip install openai

Collecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.[0m[31m
[0mSuccessfully installed openai-0.28.1


In [None]:
import openai
import os
import time

# Hand the key defined earlier in the notebook to the openai module.
openai.api_key  = OPENAI_API_KEY

#### helper function
We will use OpenAI's `gpt-3.5-turbo` model and the [chat completions endpoint](https://platform.openai.com/docs/guides/chat).

This helper function will make it easier to use prompts and look at the generated outputs:

In [None]:
def get_completion(prompt, model="gpt-3.5-turbo", temperature=0):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one user message of the request.
        model: Model name passed to the chat completions call.
        temperature: Degree of randomness of the model's output.

    Returns:
        The "content" of the first choice's message.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
    }
    first_choice = openai.ChatCompletion.create(**request).choices[0]
    return first_choice.message["content"]

In [None]:
# add error, retry
def get_completion(prompt, model="gpt-3.5-turbo", temperature=0, verbose=False):
    """Send `prompt` as a single user message and return the stripped reply.

    The request is attempted up to three times, waiting four seconds between
    attempts. (The previous version wrapped the attempt in `while True`, so
    the attempt counter never advanced and a persistent failure retried
    forever.)

    Args:
        prompt: Text placed in the one user message of the request.
        model: Model name passed to the chat completions call.
        temperature: Degree of randomness of the model's output.
        verbose: When true, print the prompt, total token count, elapsed
            seconds and the answer.

    Returns:
        The reply text with surrounding whitespace stripped, or None when
        every attempt failed.
    """
    messages = [{"role": "user", "content": prompt}]

    time_start = time.time()
    retry_count = 3
    retry_delay_sec = 4
    for attempt in range(1, retry_count + 1):
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=temperature, # this is the degree of randomness of the model's output
            )
            answer = response['choices'][0]['message']['content'].strip()
            tokens = response.usage.total_tokens
        except Exception as error:
            # Deliberately broad: any failure of the call or of reading the
            # response is treated as retryable (best effort).
            print(f"API Error: {error}")
            if attempt == retry_count:
                # Out of attempts: report failure with None rather than a
                # tuple that a caller could mistake for an answer.
                return None
            print(f"Retrying {attempt} time(s) in {retry_delay_sec} seconds...")
            time.sleep(retry_delay_sec)
            continue

        if verbose:
            time_end = time.time()
            print('prompt: %s | token: %d | %.1fsec\nanwer : %s'%(prompt, tokens, (time_end - time_start), answer))
        return answer

    return None

In [None]:
# Quick check that the helper works: send one short prompt and print the reply.
prompt = 'Please explain what ChatGPT in 20 words'
answer = get_completion(prompt, model="gpt-3.5-turbo")
print(answer)

ChatGPT is an AI language model developed by OpenAI that can engage in conversational interactions and provide responses to user inputs.


# 1_Guidelines for Prompting
This section covers two prompting principles and their related tactics for writing effective prompts for large language models.

## Prompting Principles
- **Principle 1: Write clear and specific instructions**  
- **Principle 2: Give the model time to “think”**

### Tactics

#### Tactic 1: Use delimiters to clearly indicate distinct parts of the input,  
- Delimiters can be anything like: ```, """, < >, `<tag> </tag>`, `:`,  

In [None]:
# Passage to summarise. Nothing is interpolated, so a plain literal suffices;
# the trailing backslashes join the source lines into one paragraph.
text = """
You should express what you want a model to do by \
providing instructions that are as clear and \
specific as you can possibly make them. \
This will guide the model towards the desired output, \
and reduce the chances of receiving irrelevant \
or incorrect responses. Don't confuse writing a \
clear prompt with writing a short prompt. \
In many cases, longer prompts provide more clarity \
and context for the model, which can lead to \
more detailed and relevant outputs.
"""

# The passage is wrapped in triple backticks so the instruction and the
# input are clearly separated.
prompt = f"""
Summarize the text delimited by triple backticks \
into a single sentence.
```{text}```
"""

response = get_completion(prompt)
print(response)

To guide a model towards the desired output and minimize irrelevant or incorrect responses, it is important to provide clear and specific instructions, even if it means writing longer prompts that offer more clarity and context.


### Principle 2: Give the model time to “think”

#### Tactic 1: Specify the steps required to complete a task

In [None]:
# Story used as input. Nothing is interpolated, so a plain literal suffices.
text = """
In a charming village, siblings Jack and Jill set out on \
a quest to fetch water from a hilltop \
well. As they climbed, singing joyfully, misfortune \
struck—Jack tripped on a stone and tumbled \
down the hill, with Jill following suit. \
Though slightly battered, the pair returned home to \
comforting embraces. Despite the mishap, \
their adventurous spirits remained undimmed, and they \
continued exploring with delight.
"""

# example 1: spell out the steps the model should perform, in order
prompt_1 = f"""
Perform the following actions:
1 - Summarize the following text delimited by triple \
backticks with 1 sentence.
2 - Translate the summary into French.
3 - List each name in the French summary.
4 - Output a json object that contains the following \
keys: french_summary, num_names.

Separate your answers with line breaks.

Text:
```{text}```
"""

response = get_completion(prompt_1)
print("Completion for prompt 1:")
print(response)

Completion for prompt 1:
1 - Jack and Jill, siblings, go on a quest to fetch water from a well on a hill, but they both fall down the hill and return home slightly injured but still adventurous.

2 - Jack et Jill, frère et sœur, partent à la recherche d'eau d'un puits situé au sommet d'une colline, mais ils tombent tous les deux et rentrent chez eux légèrement blessés mais toujours aventureux.

3 - Jack, Jill.

4 - {
  "french_summary": "Jack et Jill, frère et sœur, partent à la recherche d'eau d'un puits situé au sommet d'une colline, mais ils tombent tous les deux et rentrent chez eux légèrement blessés mais toujours aventureux.",
  "num_names": 2
}


## Synthesize Complaints



* The initial exploratory analysis identified three types of complaints:
 * Complaints that are generally well written - DESIRABLE
 * Complaints with very poor grammar and punctuation - PROBLEMATIC
 * Complaints with frivolous, unrelated information - PROBLEMATIC



Based on this, we had an idea: should we focus on just the invariance and robustness aspects of these “problematic” complaints, and on questions like “Can the right LLM with the right prompt ‘clean up’ these problematic complaints?” In other words, can an LLM correct the poor grammar and punctuation without changing the meaning (invariance), and can an LLM identify the frivolous content of a complaint (robustness)?





Sample Complaints - Let's see what they look like

In [None]:
# Show the first five sampled complaints in full
random_samples[:5]

 "I Contacted the bank a few months back about a debit transaction for an unauthorized amount over XXXX for a Jewelry Store. I don't even wear jewelry!!!! The name of the Merchant was XXXX XXXX or something of that sort. I asked that the bank investigate this and return my funs. The bank once again after all the negative press this bank gets for fraud and scams did not resolve the issue. Basically Told me I was the one who authorized this and I did not. I asked what were the determining factors and the rep started to stutter and of course made up a lie that they would send the details of the decision in the mail. At this point I want my money back from this joke of a bank. Ive been a customer since XX/XX/2009 and have over XXXX sitting in that account plus more at another institution. Im going to blast this bank all over the net and make sure people know not to do business with them and that their money is not safe here. The bank closed and reissued the card for fraud. But won't charge

Prompt for Robustness Testing

In [None]:
# Ask the model to grade each of the first five sampled complaints
for i, complaint in enumerate(random_samples[0:5]):
    prompt = f"""
    Your task is to examine if customer \
    complaints from banks are well written without any issue.

    Determine if complaints are well-written (considered desirable), \
    exhibit severe grammar or punctuation issues (considered problematic), \
    or contain irrelevant, frivolous information (also considered problematic). \

    Make your response as short as possible.

    Complaints: ```{complaint}```
    """

    response = get_completion(prompt)
    print(i, response, "\n")

0 The complaint contains severe grammar and punctuation issues. 

1 Problematic. 

2 Well-written. 

3 The complaint is well-written. 

4 The complaint is well-written. 



In [None]:
 # Grade one hand-picked complaint with the same robustness prompt as the loop above.
 # NOTE(review): the first lines of this cell start with a stray space; presumably
 # IPython strips it before running - consider dedenting. TODO confirm.
 text= f"""I have two recent inquires one from XX/XX/2021 and one from XX/XX/2021 from a place called CBNA. This was not done nor authorized by me. When I XXXX this place, its states this place is in New York, I'm in XXXX XXXX, there's nothing this company can do for me. Also, when I called XXXX when I noticed the first inquiry they said this was a credit card inquiry. I have a bank as well as a bank card, if I wanted a credit card I would just go through my bank, I wouldn't even go through the trouble of going somewhere else, let alone applying twice within 2 weeks of the first time. That's just nonsense. A few years ago I filed a police report for identity thief and if needed I can do it again for this. I have been working really hard on my credit and the last thing I need is false / fraudulent information on my report. I would've contacted CBNA but honestly I don't even know where to start, or what to say. What do you say to a company you've never spoke too? However, I will do what needs to be done to get this removed."""
 prompt = f"""
    Your task is to examine if customer \
    complaints from banks are well written without any issue.

    Determine if complaints are well-written (considered desirable), \
    exhibit severe grammar or punctuation issues (considered problematic), \
    or contain irrelevant, frivolous information (also considered problematic). \

    Make your response as short as possible.

    Complaints: ```{text}```
    """

response = get_completion(prompt)
print(response)

Problematic.


As we can see, some complaints have grammar issues. Let's see whether an LLM can rewrite the complaints so that they read better.

Prompt for Equal Complaint Synthesis (MFT)

In [None]:
# Rephrase each of the first five sampled complaints while keeping meaning and intensity
for i, complaint in enumerate(random_samples[0:5]):
    prompt = f"""
    Your task is to generate a short synthesized customer \
    complaints from banks.

    Generate synthesized complaints by rearranging words and sentence structures \
    while maintaining the same meaning and intensity as the original \
    complaints in at most 50 words.

    Focus on maintaining equality in the sentiment. \

    Complaints: ```{complaint}```
    """

    response = get_completion(prompt)
    print(i, response, "\n")


1 A few months ago, I contacted the bank regarding an unauthorized debit transaction at a Jewelry Store for an amount over XXXX. I don't even wear jewelry! The Merchant's name was XXXX XXXX or something similar. Despite the bank's negative reputation for fraud and scams, they failed to resolve the issue. They claimed I authorized the transaction, but couldn't provide any valid reasons. Now, I demand my money back from this unreliable bank. I've been a customer since XX/XX/2009, with a substantial amount in my account. I will expose this bank's incompetence online and warn others about the lack of security for their money. Although the bank closed and reissued my card due to fraud, they refuse to refund my funds. This is absolutely absurd! 

2 We are currently in the process of reaching out to the credit bureaus to conduct an investigation into the inquiries. 

3 I am extremely dissatisfied with the Bank of America's XXXX XXXX XXXX credit card. Despite earning miles for purchases, I ha

In [None]:
# One hand-picked complaint (plain literal: nothing is interpolated into it)
text = """I have two recent inquires one from XX/XX/2021 and one from XX/XX/2021 from a place called CBNA. This was not done nor authorized by me. When I XXXX this place, its states this place is in New York, I'm in XXXX XXXX, there's nothing this company can do for me. Also, when I called XXXX when I noticed the first inquiry they said this was a credit card inquiry. I have a bank as well as a bank card, if I wanted a credit card I would just go through my bank, I wouldn't even go through the trouble of going somewhere else, let alone applying twice within 2 weeks of the first time. That's just nonsense. A few years ago I filed a police report for identity thief and if needed I can do it again for this. I have been working really hard on my credit and the last thing I need is false / fraudulent information on my report. I would've contacted CBNA but honestly I don't even know where to start, or what to say. What do you say to a company you've never spoke too? However, I will do what needs to be done to get this removed."""

# Same-meaning, same-intensity rewrite of that single complaint
prompt = f"""
    Your task is to generate a short synthesized customer \
    complaints from banks.

    Generate synthesized complaints by rearranging words and sentence structures \
    while maintaining the same meaning and intensity as the original \
    complaints in at most 50 words.

    Focus on maintaining equality in the sentiment. \


    Make your response as short as possible.

    Complaints: ```{text}```
    """

response = get_completion(prompt)
print(response)

I received unauthorized inquiries from CBNA on XX/XX/2021 and XX/XX/2021. They claim to be in New York, but I'm in XXXX XXXX. I already have a bank and a bank card, so I don't need a credit card. I filed a police report for identity theft before and will do it again if necessary. I don't know how to contact CBNA, but I'll take action to remove this.


Prompt for Invariance Complaint Synthesis

In [None]:
# Reword each of the first five sampled complaints without changing the overall meaning
for i, complaint in enumerate(random_samples[0:5]):
    prompt = f"""
    Your task is to generate a short synthesized customer \
    complaints from banks.

    Create synthetic complaints that retain the core message \
    and sentiment of the original complaints but change some words \
    or phrases while ensuring there is no change in the overall meaning \
    in at most 50 words.

    Complaints: ```{complaint}```
    """

    response = get_completion(prompt)
    print(i, response, "\n")


1 A few months ago, I reached out to the bank regarding an unauthorized debit transaction at a Jewelry Store. I never wear jewelry! The Merchant's name was something like XXXX XXXX. I requested the bank to investigate and refund my money. Despite the bank's reputation for fraud and scams, they failed to resolve the issue. They claimed I authorized the transaction, which is untrue. When I asked for evidence, the representative stuttered and promised to send the details by mail. I'm fed up with this bank and want my money back. I've been a customer since XX/XX/2009 and have a substantial amount in my account. I'll expose this bank online to warn others about their lack of security. Although the bank closed and reissued my card due to fraud, they refuse to refund my money. This is absurd. 

2 We are currently reaching out to the credit agencies to examine the queries. 

3 I have an issue with the Bank of America credit card, specifically the XXXX XXXX XXXX. It promises miles for purchase

Prompt for Harshness Modification (Direction):

In [None]:
# Produce harsher / milder variants of each of the first five sampled complaints
for i, complaint in enumerate(random_samples[0:5]):
    prompt = f"""
    Your task is to generate a short synthesized customer \
    complaints from banks.

    Develop a method to make complaints more harsh or less harsh \
    while keeping the underlying issue intact in at most 50 words\

    Use a range of intensity levels (Less Harsh or More harsh)) to ensure diversity.

    Complaints: ```{complaint}```
    """

    response = get_completion(prompt)
    print(i, response, "\n")


More Harsh: Citibank abruptly slashed my credit limit without any prior notice, pushing my account to 99% utilization rate. Despite my impeccable payment record, they failed to inform me about this change. The customer service representative was rude, dismissive, and even hung up on me when I questioned their lack of communication. This is a clear violation of the fair credit reporting act. 

1 Less Harsh: I contacted the bank a few months ago regarding an unauthorized debit transaction for a Jewelry Store. Despite being a loyal customer since XX/XX/2009, the bank failed to resolve the issue and claimed I authorized the transaction. I am disappointed and concerned about the security of my funds.

More Harsh: I contacted the bank months ago about an unauthorized debit transaction for a Jewelry Store. The bank, known for fraud and scams, once again failed to resolve the issue. Their representative stuttered and lied about sending details. I demand my money back from this joke of a bank 

In [None]:
# One hand-picked complaint (plain literal: nothing is interpolated into it)
text = """I have two recent inquires one from XX/XX/2021 and one from XX/XX/2021 from a place called CBNA. This was not done nor authorized by me. When I XXXX this place, its states this place is in New York, I'm in XXXX XXXX, there's nothing this company can do for me. Also, when I called XXXX when I noticed the first inquiry they said this was a credit card inquiry. I have a bank as well as a bank card, if I wanted a credit card I would just go through my bank, I wouldn't even go through the trouble of going somewhere else, let alone applying twice within 2 weeks of the first time. That's just nonsense. A few years ago I filed a police report for identity thief and if needed I can do it again for this. I have been working really hard on my credit and the last thing I need is false / fraudulent information on my report. I would've contacted CBNA but honestly I don't even know where to start, or what to say. What do you say to a company you've never spoke too? However, I will do what needs to be done to get this removed."""

# Harsher rewrite of that single complaint
prompt = f"""
    Your task is to generate a short synthesized customer \
    complaints from banks.

    Develop a method to make complaints more harsh\
    while keeping the underlying issue intact in at most 50 words\

    Use a range of intensity levels to ensure diversity.


    Make your response as short as possible.

    Complaints: ```{text}```
    """

response = get_completion(prompt)
print(response)

This unauthorized inquiry from CBNA on XX/XX/2021 is unacceptable. I have diligently worked on my credit, and false information like this is detrimental. I will not hesitate to take legal action against identity theft. Remove this immediately.


In [None]:
# One hand-picked complaint (plain literal: nothing is interpolated into it)
text = """I have two recent inquires one from XX/XX/2021 and one from XX/XX/2021 from a place called CBNA. This was not done nor authorized by me. When I XXXX this place, its states this place is in New York, I'm in XXXX XXXX, there's nothing this company can do for me. Also, when I called XXXX when I noticed the first inquiry they said this was a credit card inquiry. I have a bank as well as a bank card, if I wanted a credit card I would just go through my bank, I wouldn't even go through the trouble of going somewhere else, let alone applying twice within 2 weeks of the first time. That's just nonsense. A few years ago I filed a police report for identity thief and if needed I can do it again for this. I have been working really hard on my credit and the last thing I need is false / fraudulent information on my report. I would've contacted CBNA but honestly I don't even know where to start, or what to say. What do you say to a company you've never spoke too? However, I will do what needs to be done to get this removed."""

# Milder rewrite of that single complaint
prompt = f"""
    Your task is to generate a short synthesized customer \
    complaints from banks.

    Develop a method to make complaints less harsh\
    while keeping the underlying issue intact in at most 50 words\

    Use a range of intensity levels to ensure diversity.


    Make your response as short as possible.

    Complaints: ```{text}```
    """

response = get_completion(prompt)
print(response)

I recently noticed two unauthorized inquiries on my credit report from CBNA on XX/XX/2021 and XX/XX/2021. I'm concerned about false information affecting my hard-earned credit. Unsure how to approach CBNA, but determined to resolve this issue promptly.
