In [None]:
import nltk
from nltk.util import ngrams
from gensim.models import Word2Vec
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string
import warnings

# Suppress warnings for cleaner output.
# No category or module is passed, so this 'ignore' filter applies to every
# warning raised after this point in the session.
# NOTE(review): that includes pandas chained-assignment warnings and library
# deprecation notices — consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')


In [None]:

# Uncomment these lines when using Google Colab
# from google.colab import drive
# drive.mount('/content/drive')


In [None]:

# Ensure NLTK resources are downloaded.
# Both 'punkt' and 'punkt_tab' are requested; presumably these are the
# tokenizer data needed by nltk.word_tokenize, with the one actually used
# depending on the installed NLTK version — TODO confirm.
nltk.download('punkt')
nltk.download('punkt_tab')


In [None]:

# Read the input CSV into a DataFrame.
# The path below is a placeholder and must be replaced with the real location.
DATA_PATH = 'path to data'
equal_sample_df = pd.read_csv(DATA_PATH)


In [None]:

# Helper functions for text processing
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters that are not in ``string.punctuation`` (letters, digits,
    whitespace, non-ASCII symbols) are left untouched.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

def generate_ngrams_range(text, n_start=2, n_end=2):
    """Tokenize ``text`` and return its n-grams for every n in [n_start, n_end].

    Args:
        text: String passed to ``nltk.word_tokenize``.
        n_start: Smallest n-gram size to produce (inclusive).
        n_end: Largest n-gram size to produce (inclusive).

    Returns:
        A flat list of strings, each one n-gram with its tokens joined by a
        single space. All n-grams of size ``n_start`` come first, then the
        next size, and so on up to ``n_end``.
    """
    tokens = nltk.word_tokenize(text)
    return [
        ' '.join(gram)
        for size in range(n_start, n_end + 1)
        for gram in ngrams(tokens, size)
    ]


In [None]:

# Split the frame on the "source" column: rows equal to 0 go to human_data,
# rows equal to 1 go to ai_data.
human_data = equal_sample_df.loc[equal_sample_df["source"] == 0]
ai_data = equal_sample_df.loc[equal_sample_df["source"] == 1]

# Draw 5000 rows from each group with a fixed random_state so the draw is
# repeatable. sample() raises if a group has fewer than 5000 rows.
human_sampled = human_data.sample(n=5000, random_state=42)
ai_data = ai_data.sample(n=5000, random_state=42)

# Preprocess the AI rows: strip punctuation from "pos_tags", then derive the
# bigram + trigram list for each row from the cleaned column.
ai_data = ai_data.assign(
    pos_tags=lambda frame: frame["pos_tags"].apply(remove_punctuation)
)
ai_data = ai_data.assign(
    ngrams=lambda frame: frame["pos_tags"].apply(
        lambda tags: generate_ngrams_range(tags, n_start=2, n_end=3)
    )
)


In [None]:

# Train a Word2Vec model in which each n-gram string is a "word" and each
# row's n-gram list is a "sentence".
#
# seed/workers are set explicitly so the embeddings are repeatable: gensim
# documents that multi-threaded training introduces ordering jitter, so a
# deterministic run needs workers=1 in addition to a fixed seed.
# NOTE(review): reproducibility across interpreter launches may also require
# setting PYTHONHASHSEED — confirm against the gensim documentation.
model = Word2Vec(
    sentences=ai_data['ngrams'].to_list(),
    vector_size=50,         # dimensionality of each n-gram embedding
    window=2,               # context window, in n-grams, on each side
    min_count=3,            # n-grams seen fewer than 3 times are dropped
    sg=0,                   # 0 selects CBOW (1 would select skip-gram)
    max_vocab_size=100000,
    seed=42,                # same value as the sampling random_state
    workers=1,              # single thread: slower, but deterministic
)

# Embedding matrix: one row per vocabulary n-gram, in model.wv.index_to_key order
vectors = model.wv.vectors


In [None]:

# Build the agglomerative merge tree over the rows of `vectors` with Ward linkage
linkage_matrix = linkage(vectors, 'ward')

# Cut the tree at a fixed distance to obtain one flat cluster label per row
threshold = 10
cluster_labels = fcluster(linkage_matrix, t=threshold, criterion='distance')


In [None]:
# One list of n-gram strings per row of ai_data
ngrams_list = ai_data['ngrams'].tolist()

# Keep only the rows that contain at least one n-gram present in the
# Word2Vec vocabulary; rows with no known n-gram are dropped entirely.
filtered_ngrams_list = [
    doc_ngrams
    for doc_ngrams in ngrams_list
    if any(gram in model.wv for gram in doc_ngrams)
]


In [None]:

# Trim the label array so it is no longer than the filtered row list.
# NOTE(review): cluster_labels is produced by clustering the rows of `vectors`
# (one label per vocabulary n-gram), whereas filtered_ngrams_list has one entry
# per data row. Pairing them by position looks unintended — confirm what the
# labels are meant to describe before relying on `clusters`.
cluster_labels = cluster_labels[:len(filtered_ngrams_list)]

# Bucket each filtered entry under the label found at the same position
clusters = {label: [] for label in set(cluster_labels)}
for label, doc_ngrams in zip(cluster_labels, filtered_ngrams_list):
    clusters[label].append(doc_ngrams)

# Number of labels per cluster, largest cluster first
selected_clusters = pd.Series(cluster_labels).value_counts()


In [None]:
# Pick the 20 largest and the 10 smallest clusters by label count
top_clusters = selected_clusters.head(20).index.tolist()
bottom_clusters = selected_clusters.tail(10).index.tolist()

# With fewer than 30 clusters the two slices overlap; dict.fromkeys drops the
# repeated ids while keeping first-seen order, so no cluster is processed twice.
clusters_to_process = list(dict.fromkeys(top_clusters + bottom_clusters))

# Initialize storage for features
features = []
unique_features = set()


In [None]:

# Process clusters to extract meaningful features.
#
# cluster_labels was obtained by clustering the rows of the embedding matrix,
# so position i of cluster_labels refers to vocabulary entry
# model.wv.index_to_key[i]. Members of a cluster are therefore looked up in the
# embedding matrix, not in ai_data.
#
# For each selected cluster the vocabulary n-grams are ranked by cosine
# similarity to the cluster centroid; the 40 most similar and the 40 least
# similar n-grams are collected into unique_features.
# NOTE(review): ranking by similarity to the centroid is one reading of
# "top/bottom features" — confirm this matches the intended analysis.
vocab_keys = model.wv.index_to_key
embedding_matrix = model.wv.vectors

# Row-normalise once, outside the loop; rows with zero norm are left as zeros.
row_norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
unit_vectors = embedding_matrix / np.where(row_norms > 0, row_norms, 1)

n_per_side = 40  # number of n-grams taken from each end of the ranking

for cluster_id in clusters_to_process:
    # Row positions (vocabulary indices) that carry this cluster's label
    cluster_indices = np.where(cluster_labels == cluster_id)[0]

    if cluster_indices.size == 0:
        print(f"Cluster {cluster_id} has no ngrams in the Word2Vec model vocabulary.")
        continue

    # Centroid of the cluster's member vectors
    cluster_mean = embedding_matrix[cluster_indices].mean(axis=0)

    # One score per vocabulary n-gram; dividing by the centroid norm is
    # unnecessary because it does not change the ordering.
    similarity = unit_vectors @ cluster_mean
    ranking = np.argsort(similarity)

    top_indices = ranking[-n_per_side:][::-1]   # most similar first
    bottom_indices = ranking[:n_per_side]       # least similar first

    # Indices now range over the vocabulary, so index_to_key lookups are valid.
    top_features = [vocab_keys[i] for i in top_indices]
    bottom_features = [vocab_keys[i] for i in bottom_indices]

    # top_clusters / bottom_clusters are intentionally not reassigned here:
    # they hold the selected cluster ids.
    unique_features.update(top_features)
    unique_features.update(bottom_features)