In [None]:
import os
from dotenv import load_dotenv, dotenv_values 
import getpass
from groq import Groq
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load variables from the .env file.
load_dotenv()

# Create the Groq client using the GROQ_API_KEY environment variable
# (os.getenv yields None when the variable is not set).
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
def extract_text_from_file(file_path):
    """Read a UTF-8 text file and return its full contents as a string.

    Failures are not raised. Instead a message string is returned:
    "The file was not found." when the path does not exist, or
    "An error occurred: <details>" for any other exception. Callers that
    need to tell content from failure must inspect the returned text.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        return "The file was not found."
    except Exception as err:
        return f"An error occurred: {err}"

file_path = '../clearlea_challenge/Documents_Generated/rental_contract1.txt'

# extract_text_from_file reports a missing file by returning the message
# "The file was not found." rather than raising, so a wrong path would be
# embedded in the prompt and "summarized" by the models. Fail fast instead.
if not os.path.isfile(file_path):
    raise FileNotFoundError(f"Contract file not found: {file_path}")

text = extract_text_from_file(file_path)

# Prompt sent unchanged to every model so their summaries are comparable.
text_for_input = f"Summarize this contract and extract the most important details in short.: \n{text}"

In [None]:
# Request a summary of the contract from the Mixtral model.
# NOTE(review): confirm this model ID is still available on Groq.
chat_completion = client.chat.completions.create(
    model="mixtral-8x7b-32768",
    messages=[{"role": "user", "content": text_for_input}],
)

# Keep the reply text for the comparison step, then show it.
summary1 = chat_completion.choices[0].message.content
print(summary1)

In [None]:
# Request a summary of the same prompt from the Llama 3 70B model.
# NOTE(review): confirm this model ID is still available on Groq.
chat_completion = client.chat.completions.create(
    model="llama3-70b-8192",
    messages=[{"role": "user", "content": text_for_input}],
)

# Keep the reply text for the comparison step, then show it.
summary2 = chat_completion.choices[0].message.content
print(summary2)

In [None]:
# Download the NLTK data used below: Punkt tokenizer data and the stop-word list.
# NLTK 3.9+ loads the Punkt tokenizer from 'punkt_tab' instead of the pickled
# 'punkt' package, so fetch both to keep nltk.word_tokenize working on older
# and newer NLTK releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """Reduce text to a space-separated string of lowercase content words.

    The text is tokenized with NLTK. Tokens that are not purely alphabetic,
    or whose lowercase form is an English stop word, are dropped; the
    remaining tokens are lowercased and joined with single spaces.
    """
    english_stop_words = set(nltk.corpus.stopwords.words('english'))
    kept_words = []
    for token in nltk.word_tokenize(text):
        # Skip numbers, punctuation and mixed tokens such as "1,200" or "12-month".
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in english_stop_words:
            continue
        kept_words.append(lowered)
    return ' '.join(kept_words)

def compare_summaries(summary1, summary2):
    """Compare two summaries by TF-IDF cosine similarity and token differences.

    Args:
        summary1: Text of the first summary.
        summary2: Text of the second summary.

    Returns:
        A dict with three keys:
          'cosine_similarity': float, cosine similarity between the TF-IDF
              vectors of the two preprocessed summaries; 0.0 when no
              vocabulary can be built from them.
          'unique_to_summary1': sorted list of lowercase tokens that occur
              in summary1 but not in summary2.
          'unique_to_summary2': sorted list of lowercase tokens that occur
              in summary2 but not in summary1.
    """
    # Preprocess the summaries
    processed_summary1 = preprocess_text(summary1)
    processed_summary2 = preprocess_text(summary2)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(
            [processed_summary1, processed_summary2]
        )
    except ValueError:
        # TfidfVectorizer raises ValueError when the documents yield an empty
        # vocabulary (e.g. both summaries are empty after preprocessing).
        # Texts with no shared terms are treated as having zero similarity.
        cosine_sim = 0.0
    else:
        # Cast to a plain float so the result prints and serializes cleanly.
        cosine_sim = float(
            cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0, 0]
        )

    # Tokenize the original (lowercased, otherwise unfiltered) summaries for
    # the difference analysis.
    tokens1 = set(nltk.word_tokenize(summary1.lower()))
    tokens2 = set(nltk.word_tokenize(summary2.lower()))

    # Sort the set differences so the output order is stable between runs.
    return {
        'cosine_similarity': cosine_sim,
        'unique_to_summary1': sorted(tokens1 - tokens2),
        'unique_to_summary2': sorted(tokens2 - tokens1),
    }

# Compare the two model summaries and print the resulting dict
# (cosine similarity plus the tokens unique to each summary).
result = compare_summaries(summary1=summary1, summary2=summary2)
print(result)