In [13]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [14]:
# Importing necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
from fuzzywuzzy import fuzz



In [15]:
# Attach Google Drive to the Colab filesystem so files under /content/drive are readable
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the input CSV on the mounted Drive
# (edit this path to match where the file lives in your own Drive)
file_path = '/content/drive/MyDrive/Colab/synthetic_text_full.csv'

# Load the whole CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

In [16]:
# Assuming the column names are 'generated_text' and 'synthetic_text'.
# TF-IDF vectors are computed for each pair of texts and compared with cosine similarity.

# Notebook-level TfidfVectorizer with default settings; compute_cosine_similarity
# re-fits it on every pair of texts it is given.
vectorizer = TfidfVectorizer()

# Preview the first few rows of the input. This is deliberately the last expression in
# the cell: a notebook only auto-renders a cell's final bare expression, so a preview
# placed before other statements is evaluated but never shown.
df.head()

In [17]:
def compute_cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    The notebook-level ``vectorizer`` is re-fitted on just these two documents,
    and the cosine similarity between the two resulting TF-IDF rows is returned.

    Args:
        text1: First text. Expected to be a ``str``.
        text2: Second text. Expected to be a ``str``.

    Returns:
        The similarity score as a float. ``0.0`` is returned when either input is
        not a string (for example a missing CSV cell loaded as NaN, or ``None``),
        when either input is empty or whitespace-only, or when the vectorizer
        cannot build a vocabulary from the pair.
    """
    # Missing values are not strings and cannot be tokenized, so treat them as
    # "no similarity" rather than letting the vectorizer raise.
    if not isinstance(text1, str) or not isinstance(text2, str):
        return 0.0

    # A blank document has no terms to compare.
    if not text1.strip() or not text2.strip():
        return 0.0

    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # Raised by the vectorizer when neither text yields a usable token
        # (empty vocabulary), e.g. texts made only of punctuation.
        return 0.0

    # Row 0 is text1 and row 1 is text2; the 1x1 result holds their similarity.
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

def compute_fuzzy_similarity(text1, text2):
    """Return the fuzzy match score between two text strings, scaled to [0, 1].

    Args:
        text1: First text. Expected to be a ``str``.
        text2: Second text. Expected to be a ``str``.

    Returns:
        ``fuzz.ratio(text1, text2)`` divided by 100, or ``0.0`` when either input
        is not a string (for example a missing CSV cell loaded as NaN, or ``None``).
    """
    # Only real strings are compared; a missing value must not be scored as if
    # it were text.
    if not isinstance(text1, str) or not isinstance(text2, str):
        return 0.0

    return fuzz.ratio(text1, text2) / 100.0  # Normalizing the score to [0, 1]

# Accumulator for the per-row blended similarity scores
similarity_scores = list()

# Blend weights: TF-IDF cosine similarity first, fuzzy string matching second
cosine_weight, fuzzy_weight = 0.7, 0.3

In [18]:
# Start from an empty list every time this cell runs. Appending to a list created in
# another cell means a second run of this cell would add a duplicate set of scores and
# leave the list longer than df.
similarity_scores = []

# Walk the two text columns in lockstep, in df's row order.
for generated_text, synthetic_text in zip(df['generated_text'], df['synthetic_text']):
    # Calculate the cosine similarity (TF-IDF)
    cosine_sim = compute_cosine_similarity(generated_text, synthetic_text)

    # Calculate the fuzzy similarity score
    fuzzy_sim = compute_fuzzy_similarity(generated_text, synthetic_text)

    # Blend the two scores using the configured weights
    weighted_similarity = (cosine_weight * cosine_sim) + (fuzzy_weight * fuzzy_sim)

    similarity_scores.append(weighted_similarity)


In [19]:
# Assemble the two text columns and their blended similarity score into one DataFrame.
# similarity_scores must contain exactly one entry per row of df.
similarity_scores_df = pd.DataFrame({
    'generated_text': df['generated_text'],
    'synthetic_text': df['synthetic_text'],
    'similarity_score': similarity_scores
})

# Save the output to a new CSV file (row index omitted)
output_file_path = '/content/drive/MyDrive/Colab/similarity_scores.csv'
similarity_scores_df.to_csv(output_file_path, index=False)

# Preview the result. This is deliberately the last expression in the cell: a notebook
# only auto-renders a cell's final bare expression, so a preview placed before the save
# is evaluated but never shown.
similarity_scores_df.head()