In [2]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV file into a DataFrame
df = pd.read_csv('incident_event_log.csv')

# Display the first few rows to understand the structure
print(df.head())

# Define a function to compute BERT embeddings for sentences
def get_bert_embeddings(text_list):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    
    inputs = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Use the [CLS] token embeddings as the sentence embedding
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    return embeddings

# Extract text from the relevant column (example: 'Description')
texts = df.tolist()

# Compute BERT embeddings for each row's text
embeddings = get_bert_embeddings(texts)

# Define a query to rank rows (example: 'important sales data')
query = "important sales data"
query_embedding = get_bert_embeddings([query])[0]

# Compute cosine similarity between the query embedding and row embeddings
similarities = cosine_similarity([query_embedding], embeddings).flatten()

# Add similarities to the DataFrame and sort by similarity
df['similarity'] = similarities
summary_df = df.sort_values(by='similarity', ascending=False).head(5)

# Display the summarized data
print("Summary of Most Relevant Records:")
print(summary_df)

# Save the summary to a new CSV file
summary_df.to_csv('summary_most_relevant_records.csv', index=False)


  from .autonotebook import tqdm as notebook_tqdm


       number incident_state  active  reassignment_count  reopen_count  \
0  INC0000045            New    True                   0             0   
1  INC0000045       Resolved    True                   0             0   
2  INC0000045       Resolved    True                   0             0   
3  INC0000045         Closed   False                   0             0   
4  INC0000047            New    True                   0             0   

   sys_mod_count  made_sla    caller_id       opened_by        opened_at  ...  \
0              0      True  Caller 2403    Opened by  8  29/2/2016 01:16  ...   
1              2      True  Caller 2403    Opened by  8  29/2/2016 01:16  ...   
2              3      True  Caller 2403    Opened by  8  29/2/2016 01:16  ...   
3              4      True  Caller 2403    Opened by  8  29/2/2016 01:16  ...   
4              0      True  Caller 2403  Opened by  397  29/2/2016 04:40  ...   

  u_priority_confirmation         notify problem_id rfc vendor cause

AttributeError: 'DataFrame' object has no attribute 'tolist'