In [11]:
import pandas as pd

# Load the dataset
df = pd.read_csv('insurance_claims_nlp_gpt_varied.csv')

df.head()

Unnamed: 0,id,text,fraud_reported
0,521585,"Expert Opinion:\n\nThe insured, a 48-year-old ...",Y
1,342868,"""Hey there! So, get this - a 42-year-old dude ...",Y
2,687698,"""As an insurance agent handling this claim, I ...",N
3,227811,1. **Formal Investigation Style:**\n\nOn Janua...,Y
4,367455,"""Well, well, well, looks like we've got a case...",N


In [12]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Clean and preprocess text data
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

def tokenize_and_process(text):
    """
    Tokenize text, remove stopwords, and apply stemming
    """
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords and apply stemming
    processed_tokens = [
        stemmer.stem(token) for token in tokens 
        if token not in stop_words and len(token) > 2
    ]
    
    return processed_tokens

# Apply text preprocessing to the 'text' column
print("Cleaning text...")
df['cleaned_text'] = df['text'].apply(clean_text)

print("Tokenizing and processing...")
df['tokenized_text'] = df['cleaned_text'].apply(tokenize_and_process)

# Create a processed text column (tokens joined back into strings)
df['processed_text'] = df['tokenized_text'].apply(lambda x: ' '.join(x))

print("Text preprocessing completed!")
print(f"\nOriginal text sample:")
print(df['text'].iloc[0][:200] + "...")
print(f"\nCleaned text sample:")
print(df['cleaned_text'].iloc[0][:200] + "...")
print(f"\nTokenized text sample (first 20 tokens):")
print(df['tokenized_text'].iloc[0][:20])
print(f"\nProcessed text sample:")
print(df['processed_text'].iloc[0][:200] + "...")

[nltk_data] Downloading package punkt to /Users/aadarsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aadarsh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aadarsh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Cleaning text...
Tokenizing and processing...
Text preprocessing completed!

Original text sample:
Expert Opinion:

The insured, a 48-year-old male with a medical doctorate and a penchant for repairing crafts, experienced a significant single-vehicle collision on January 25, 2015. The incident occu...

Cleaned text sample:
expert opinion the insured a yearold male with a medical doctorate and a penchant for repairing crafts experienced a significant singlevehicle collision on january the incident occurred in columbus so...

Tokenized text sample (first 20 tokens):
['expert', 'opinion', 'insur', 'yearold', 'male', 'medic', 'doctor', 'penchant', 'repair', 'craft', 'experienc', 'signific', 'singlevehicl', 'collis', 'januari', 'incid', 'occur', 'columbu', 'south', 'carolina']

Processed text sample:
expert opinion insur yearold male medic doctor penchant repair craft experienc signific singlevehicl collis januari incid occur columbu south carolina drive around oclock morn collis type class

In [13]:
# Display statistics about the preprocessed text
print("Text Preprocessing Statistics:")
print("=" * 40)

# Original text statistics
original_lengths = df['text'].str.len()
print(f"Original text - Average length: {original_lengths.mean():.1f} characters")
print(f"Original text - Max length: {original_lengths.max()} characters")
print(f"Original text - Min length: {original_lengths.min()} characters")

# Processed text statistics
processed_lengths = df['processed_text'].str.len()
print(f"\nProcessed text - Average length: {processed_lengths.mean():.1f} characters")
print(f"Processed text - Max length: {processed_lengths.max()} characters")
print(f"Processed text - Min length: {processed_lengths.min()} characters")

# Token statistics
token_counts = df['tokenized_text'].apply(len)
print(f"\nTokens per document - Average: {token_counts.mean():.1f}")
print(f"Tokens per document - Max: {token_counts.max()}")
print(f"Tokens per document - Min: {token_counts.min()}")

# Vocabulary size
all_tokens = [token for tokens in df['tokenized_text'] for token in tokens]
unique_tokens = set(all_tokens)
print(f"\nVocabulary size: {len(unique_tokens)} unique tokens")
print(f"Total tokens: {len(all_tokens)}")

# Display the dataframe columns
print(f"\nDataframe shape: {df.shape}")
print(f"New columns added: {[col for col in df.columns if col not in ['id', 'text', 'fraud_reported']]}")

# Show a sample of the processed data
print(f"\nSample processed data:")
df[['text', 'processed_text', 'fraud_reported']].head(2)

Text Preprocessing Statistics:
Original text - Average length: 1076.9 characters
Original text - Max length: 1587 characters
Original text - Min length: 32 characters

Processed text - Average length: 588.8 characters
Processed text - Max length: 938 characters
Processed text - Min length: 12 characters

Tokens per document - Average: 91.5
Tokens per document - Max: 135
Tokens per document - Min: 2

Vocabulary size: 2219 unique tokens
Total tokens: 91549

Dataframe shape: (1000, 6)
New columns added: ['cleaned_text', 'tokenized_text', 'processed_text']

Sample processed data:


Unnamed: 0,text,processed_text,fraud_reported
0,"Expert Opinion:\n\nThe insured, a 48-year-old ...",expert opinion insur yearold male medic doctor...,Y
1,"""Hey there! So, get this - a 42-year-old dude ...",hey get yearold dude who custom month merced n...,Y


In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-en-v1.5")

embeddings = model.encode(df['processed_text'].tolist(), convert_to_tensor=False)

df['bert_embeddings'] = [row for row in embeddings]