In [None]:
!git clone https://github.com/pooja-premnath/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil

Cloning into 'SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 7 (delta 2), reused 7 (delta 2), pack-reused 0[K
Receiving objects: 100% (7/7), 192.82 KiB | 19.28 MiB/s, done.
Resolving deltas: 100% (2/2), done.


In [None]:
from pathlib import Path

import pandas as pd

# Directory produced by the `git clone` cell. It is resolved relative to the
# working directory (where the clone is created) instead of assuming the
# notebook runs from Colab's /content folder.
REPO_DIR = Path('SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil')
DATA_DIR = REPO_DIR / 'Data'

# Load the original and generated datasets
original_df = pd.read_csv(DATA_DIR / 'Expert-Annotated Dataset.csv')
generated_df = pd.read_csv(DATA_DIR / 'GPT4o Dataset.csv')

# Every analysis below reads these two columns; fail here with a clear message
# rather than with a KeyError several cells later.
REQUIRED_COLUMNS = {'Text', 'Category'}
assert REQUIRED_COLUMNS <= set(original_df.columns), "original dataset is missing 'Text'/'Category'"
assert REQUIRED_COLUMNS <= set(generated_df.columns), "generated dataset is missing 'Text'/'Category'"




## Type-Token Ratio (TTR): Lexical Diversity

In [None]:
def type_token_ratio(text):
    """Return the type-token ratio (unique tokens / total tokens) of ``text``.

    Tokens are obtained by splitting on whitespace.

    Args:
        text: The string to analyse.

    Returns:
        A float in [0, 1]. Text with no tokens (empty or whitespace-only)
        yields 0.0 instead of raising ZeroDivisionError.
    """
    tokens = text.split()
    if not tokens:
        # No tokens means there is no vocabulary to measure.
        return 0.0
    return len(set(tokens)) / len(tokens)

# Score every text once per dataset; both summaries below reuse these scores
original_ttr_scores = original_df['Text'].map(type_token_ratio)
generated_ttr_scores = generated_df['Text'].map(type_token_ratio)

# Dataset-level mean TTR
original_ttr = original_ttr_scores.mean()
generated_ttr = generated_ttr_scores.mean()

# Mean TTR within each category
original_ttr_by_category = original_ttr_scores.groupby(original_df['Category']).apply(lambda scores: scores.mean())
generated_ttr_by_category = generated_ttr_scores.groupby(generated_df['Category']).apply(lambda scores: scores.mean())

original_ttr, generated_ttr, original_ttr_by_category, generated_ttr_by_category


(0.9981139499407483,
 0.9984322871572872,
 Category
 Biased        1.000000
 Clickbait     0.998617
 Humor         1.000000
 Misleading    0.997376
 Name: Text, dtype: float64,
 Category
 Biased    0.999775
 Humor     0.997090
 Name: Text, dtype: float64)

## N-Gram Overlap

In [None]:
from nltk.util import ngrams
from collections import Counter

def ngram_overlap(text, reference_ngrams, n=2):
    """Fraction of the n-grams in ``text`` that also occur in a reference.

    Each distinct n-gram of ``text`` contributes at most as many matches as
    its count in ``reference_ngrams`` (clipped counting).

    Args:
        text: Whitespace-tokenised string to score.
        reference_ngrams: Mapping from n-gram tuple to its reference count;
            it is indexed directly, as a ``Counter`` allows for unseen keys.
        n: Size of the n-grams (defaults to bigrams).

    Returns:
        Clipped matches divided by the number of n-grams in ``text``;
        0.0 when ``text`` produces no n-grams.
    """
    candidate_counts = Counter(ngrams(text.split(), n))
    total_candidates = sum(candidate_counts.values())

    matched = 0
    for gram, occurrences in candidate_counts.items():
        matched += min(occurrences, reference_ngrams[gram])

    # max(1, ...) keeps the division defined for texts shorter than n tokens
    return matched / max(1, total_candidates)

NGRAM_SIZE = 2  # bigrams; used for both the reference counts and the scoring

# Build the reference n-gram counts one text at a time. Joining all texts into
# a single string first would fabricate n-grams that straddle two unrelated
# documents (last word of one text + first word of the next).
original_ngrams = Counter()
for original_text in original_df['Text']:
    original_ngrams.update(ngrams(original_text.split(), NGRAM_SIZE))

# Score every generated text once and reuse the scores for both summaries
generated_overlap = generated_df['Text'].apply(
    ngram_overlap, reference_ngrams=original_ngrams, n=NGRAM_SIZE
)

# Calculate n-gram overlap for the entire generated dataset
ngram_overlap_scores = generated_overlap.mean()

# Calculate n-gram overlap by category
ngram_overlap_by_category = generated_overlap.groupby(generated_df['Category']).mean()

ngram_overlap_scores, ngram_overlap_by_category


(0.03201152597402598,
 Category
 Biased    0.039308
 Humor     0.024715
 Name: Text, dtype: float64)

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [None]:
import warnings

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

EMBEDDING_MODEL_NAME = 'paraphrase-xlm-r-multilingual-v1'

# Load the pre-trained model once; loading it a second time only repeats an
# expensive initialisation.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Filter the original dataset to include only "Humor" and "Biased" categories
filtered_original_df = original_df[original_df['Category'].isin(['Humor', 'Biased'])]
filtered_original_df = filtered_original_df.reset_index(drop=True)
generated_df = generated_df.reset_index(drop=True)

# Generate sentence embeddings
original_embeddings = model.encode(filtered_original_df['Text'].tolist())
generated_embeddings = model.encode(generated_df['Text'].tolist())

# NOTE(review): rows are paired purely by position. Nothing in this notebook
# establishes that row i of the original corresponds to row i of the generated
# data, so confirm the alignment before interpreting this score.
if len(original_embeddings) != len(generated_embeddings):
    warnings.warn(
        f"Comparing {len(original_embeddings)} original against "
        f"{len(generated_embeddings)} generated texts; only the first "
        f"{min(len(original_embeddings), len(generated_embeddings))} positional pairs are scored."
    )

# Calculate cosine similarity for each positional pair
cosine_similarities = [
    cosine_similarity([orig], [gen])[0][0]
    for orig, gen in zip(original_embeddings, generated_embeddings)
]
if not cosine_similarities:
    raise ValueError("No sentence pairs to compare: one of the datasets is empty.")
average_cosine_similarity = sum(cosine_similarities) / len(cosine_similarities)

print(f"Average Cosine Similarity: {average_cosine_similarity}")


Average Cosine Similarity: 0.2174960909461653


In [None]:
import torch
from transformers import pipeline

# Use the first GPU when one is available, otherwise fall back to the CPU (-1)
# so the cell also runs on machines without CUDA.
device = 0 if torch.cuda.is_available() else -1

# FIXME: "ai4bharat/indic-bert" ships without a fine-tuned classification head.
# Loading it emits "Some weights ... are newly initialized: ['classifier.bias',
# 'classifier.weight']", i.e. the labels below come from an untrained classifier,
# so the consistency score is not a meaningful sentiment measure. Swap in a
# checkpoint fine-tuned for sentiment before relying on this number.
sentiment_pipeline = pipeline("sentiment-analysis", model="ai4bharat/indic-bert", device=device)

# Filter the original dataset to include only "Humor" and "Biased" categories
filtered_original_df = original_df[original_df['Category'].isin(['Humor', 'Biased'])]
filtered_original_df = filtered_original_df.reset_index(drop=True)
generated_df = generated_df.reset_index(drop=True)

# Predict sentiments for the filtered datasets
original_sentiments = sentiment_pipeline(filtered_original_df['Text'].tolist())
generated_sentiments = sentiment_pipeline(generated_df['Text'].tolist())

# NOTE(review): predictions are paired by position; confirm the two datasets
# are actually aligned row by row.
sentiment_pairs = list(zip(original_sentiments, generated_sentiments))

# Compare sentiments to calculate sentiment consistency. The denominator is the
# number of pairs actually compared (zip stops at the shorter list), not the
# size of the original dataset.
sentiment_match = sum(1 for orig, gen in sentiment_pairs if orig['label'] == gen['label'])
sentiment_consistency = sentiment_match / len(sentiment_pairs) if sentiment_pairs else float('nan')

print(f"Sentiment Consistency: {sentiment_consistency}")


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sentiment Consistency: 1.0


In [None]:
import numpy as np
from collections import Counter

def shannon_entropy(text):
    """Shannon entropy, in bits, of the whitespace-token distribution of ``text``.

    Args:
        text: The string to analyse.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct token; 0 for text without tokens.
    """
    frequencies = Counter(text.split())
    n_tokens = sum(frequencies.values())

    weighted_logs = []
    for occurrences in frequencies.values():
        probability = occurrences / n_tokens
        weighted_logs.append(probability * np.log2(probability))

    return -sum(weighted_logs)

# Mean per-text entropy of each dataset
original_entropy = original_df['Text'].map(shannon_entropy).mean()
generated_entropy = generated_df['Text'].map(shannon_entropy).mean()

print(f"Original Shannon Entropy: {original_entropy}")
print(f"Generated Shannon Entropy: {generated_entropy}")


Original Shannon Entropy: 3.0754853020250046
Generated Shannon Entropy: 2.566051473490822


In [None]:
from collections import Counter
from math import pow

def simpson_index(text):
    """Return the Gini-Simpson diversity index (1 - sum(p_i ** 2)) of ``text``.

    ``p_i`` is the relative frequency of each distinct whitespace-separated
    token. Higher values mean more diverse vocabulary.

    Args:
        text: The string to analyse.

    Returns:
        A float in [0, 1). Text with no tokens yields 0.0; without this guard
        an empty text would score 1 (maximum diversity) and inflate the mean.
    """
    counts = Counter(text.split()).values()
    N = sum(counts)
    if N == 0:
        return 0.0
    simpson = sum((n / N) ** 2 for n in counts)
    return 1 - simpson  # Higher value = more diversity

# Mean per-text Simpson diversity of each dataset
original_simpson = original_df['Text'].map(simpson_index).mean()
generated_simpson = generated_df['Text'].map(simpson_index).mean()

print(f"Original Simpson's Index: {original_simpson}")
print(f"Generated Simpson's Index: {generated_simpson}")


Original Simpson's Index: 0.8767759404930211
Generated Simpson's Index: 0.8233121137542513


In [None]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float in [0, 1]. Two empty sets are treated as identical and score
        1.0, which avoids the ZeroDivisionError of dividing by an empty union.
    """
    union = len(set1.union(set2))
    if union == 0:
        # Both sets are empty: the ratio is undefined, so use the convention
        # that identical sets have similarity 1.
        return 1.0
    return len(set1.intersection(set2)) / union

def _vocabulary(frame):
    """Set of whitespace-separated tokens across the 'Text' column of ``frame``."""
    corpus = " ".join(frame['Text'])
    return set(corpus.split())

original_words = _vocabulary(original_df)
generated_words = _vocabulary(generated_df)

similarity = jaccard_similarity(original_words, generated_words)
print(f"Jaccard Similarity: {similarity}")


Jaccard Similarity: 0.08307749558098428


In [None]:
!pip install language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.8-py3-none-any.whl.metadata (12 kB)
Downloading language_tool_python-2.8-py3-none-any.whl (35 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.8


In [None]:
from language_tool_python import LanguageTool

tool = LanguageTool('ta')  # 'ta' is the language code for Tamil

def check_grammar(sentence):
    """Count the issues the module-level LanguageTool instance reports.

    Args:
        sentence: Text passed to ``tool.check``.

    Returns:
        The number of matches returned by ``tool.check(sentence)``.
    """
    return len(tool.check(sentence))

# Mean number of reported issues per text in each dataset
original_grammar_issues = original_df['Text'].map(check_grammar).mean()
generated_grammar_issues = generated_df['Text'].map(check_grammar).mean()

print(f"Average Grammar Issues in Original Text: {original_grammar_issues}")
print(f"Average Grammar Issues in Generated Text: {generated_grammar_issues}")


Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:02<00:00, 85.6MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpdlxfjfq3.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /root/.cache/language_tool_python.


Average Grammar Issues in Original Text: 0.45657809462086846
Average Grammar Issues in Generated Text: 0.2345
