In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import spacy
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter
from pycspade.helpers import spade, print_result

In [None]:
# Suppress warnings
# NOTE(review): this silences every warning for the rest of the session, including
# anything raised later by the model-fitting cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load spaCy and NLTK resources
nlp = spacy.load("en_core_web_sm")
nltk.download('punkt')
# Recent NLTK releases make sent_tokenize look up the 'punkt_tab' resource instead of
# the pickled 'punkt' models; fetching both keeps the notebook runnable on either.
nltk.download('punkt_tab')

# Load and prepare dataset
equal_sample_df = pd.read_csv('sampled_df_pos_tags.csv')

# Split the rows by class label (source 0 -> human, source 1 -> AI).
# This only separates the classes; no balancing is done here.
human_data = equal_sample_df[equal_sample_df["source"] == 0]
ai_data = equal_sample_df[equal_sample_df["source"] == 1]


In [None]:
# Downsample to ensure equal class distribution
# Each draw is bound to its own name: re-running this cell then samples again from
# the untouched per-class frames and yields the same rows in the same order.
# (Overwriting `ai_data` with its own sample would re-permute it on every re-run.)
human_sampled = human_data.sample(n=20000, random_state=42)
ai_sampled = ai_data.sample(n=20000, random_state=42)
equal_sample_df = pd.concat([human_sampled, ai_sampled], ignore_index=True)
# Shuffle so the two classes are interleaved instead of stacked one after the other
equal_sample_df = equal_sample_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Text preprocessing function
def preprocess_text(text):
    """Lower-case ``text`` and normalise its punctuation.

    The following steps run in order:

    1. lower-case the text;
    2. delete every character that is not an ASCII letter, apostrophe, comma,
       ``!``, ``?``, hyphen, full stop or whitespace;
    3. collapse runs of ``.``, ``?``, ``!`` and ``,`` to a single character;
    4. surround with spaces any ``.``, ``,``, ``!``, ``?`` or ``-`` that is neither
       immediately preceded nor immediately followed by whitespace.

    Returns the cleaned string.
    """
    # (pattern, replacement) pairs, applied top to bottom
    substitutions = (
        (r"[^a-zA-Z',!?\-.\s]", ''),
        (r'\.{2,}', '.'),
        (r'\?{2,}', '?'),
        (r'\!{2,}', '!'),
        (r',{2,}', ','),
        (r'(?<!\s)([.,!?-])(?!\s)', r' \1 '),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


In [None]:
# Store the cleaned version of every passage next to the raw text
equal_sample_df['processed_text'] = equal_sample_df['text'].apply(preprocess_text)

# Stratified 70/30 train/test split.
# X keeps every column of the frame (label included); y is the 'source' label alone.
X, y = equal_sample_df, equal_sample_df['source']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=72,
    stratify=y,
)

# Per-class subsets of the training split (source 1 -> AI, source 0 -> human)
X_train_ai = X_train.loc[X_train['source'] == 1]
X_train_human = X_train.loc[X_train['source'] == 0]

In [None]:
# Vocabulary of the training passages: every distinct whitespace-separated token
passages_all = X_train['processed_text'].tolist()
unique_words = set()
for passage in passages_all:
    unique_words.update(token.lower() for token in passage.split())

# Ids follow the sorted order of the vocabulary; both lookup directions are kept
index_to_word = dict(enumerate(sorted(unique_words)))
word_to_index = {word: idx for idx, word in index_to_word.items()}

In [None]:
# Prepare AI dataset for SPADE
# One row per sentence: [sequence id, event id, word indices].
# Sequence ids number the passages from 1; event ids number the sentences of a
# passage from 1.
passages_ai = X_train_ai['processed_text'].tolist()
final_sentences_split_dataset_ai = [
    [seq_idx, event_id, [word_to_index[token.lower()] for token in sentence.split()]]
    for seq_idx, passage in enumerate(passages_ai, start=1)
    for event_id, sentence in enumerate(sent_tokenize(passage), start=1)
]


In [None]:
# Prepare human dataset for SPADE
# One row per sentence: [sequence id, event id, word indices].
# Sequence ids number the passages from 1; event ids number the sentences of a
# passage from 1.
passages_human = X_train_human['processed_text'].tolist()
final_sentences_split_dataset_human = [
    [seq_idx, event_id, [word_to_index[token.lower()] for token in sentence.split()]]
    for seq_idx, passage in enumerate(passages_human, start=1)
    for event_id, sentence in enumerate(sent_tokenize(passage), start=1)
]

In [None]:
# Run SPADE algorithm for human text
# Input rows are [sequence id, event id, items]: one sequence per passage, one event
# per sentence, items = the word indices of that sentence.
# NOTE(review): the two runs use different support thresholds (0.25 here, 0.35 for
# the AI text) — confirm the asymmetry is intentional.
# NOTE(review): presumably mingap=1 / maxgap=1 limits patterns to consecutive events
# (adjacent sentences) — verify against the pycspade documentation.
result_human = spade(data=final_sentences_split_dataset_human, support=0.25, maxgap=1, mingap=1)

# Run SPADE algorithm for AI text
result_ai = spade(data=final_sentences_split_dataset_ai, support=0.35, maxgap=1, mingap=1)


In [None]:
# Render every pattern mined from the human text as a string: the string form of
# each of its items, joined with '->'
sequences_human = [
    '->'.join(str(item) for item in mined_object.items)
    for mined_object in result_human['mined_objects']
]

# Same rendering for the patterns mined from the AI text
sequences_ai = [
    '->'.join(str(item) for item in mined_object.items)
    for mined_object in result_ai['mined_objects']
]

In [None]:
# Function to extract numbers from sequence strings
def extract_numbers(string):
    """Return every integer that stands alone inside ``(...)`` or ``[...]``.

    Only groups holding a single run of digits are recognised, so ``"(12)"`` and
    ``"[12]"`` contribute 12 while ``"(1 2)"`` contributes nothing. The integers
    are returned as a tuple, in order of appearance.
    """
    bracketed_int = r'\((\d+)\)|\[(\d+)\]'
    return tuple(
        # exactly one of the two capture groups is filled for any match
        int(match.group(1) or match.group(2))
        for match in re.finditer(bracketed_int, string)
    )

# Convert sequence strings to number tuples
all_sequences_number_human = [
    extract_numbers(str(pattern)) for pattern in sequences_human
]
all_sequences_number_ai = [
    extract_numbers(str(pattern)) for pattern in sequences_ai
]

# Combine all unique sequences (patterns found in both classes are kept once)
all_sequences = set(
    all_sequences_number_human + all_sequences_number_ai
)


In [None]:
# Function to generate word sequences from passages
def generate_sequences(passages, word_to_index):
    """Encode passages as per-sentence tuples of word indices.

    Each passage is split into sentences with ``sent_tokenize``. Every sentence
    becomes one tuple holding the ``word_to_index`` id of each lower-cased,
    whitespace-separated token; tokens missing from ``word_to_index`` are skipped.

    Returns a single flat list with the tuples of all passages, in input order.
    """
    encoded = []
    for passage in passages:
        for sentence in sent_tokenize(passage):
            lowered = (token.lower() for token in sentence.split())
            encoded.append(
                tuple(word_to_index[token] for token in lowered if token in word_to_index)
            )
    return encoded

# Function to check if A is a contiguous subsequence of B
def is_subsequence(A, B):
    """Return True when the elements of ``A`` occur in ``B`` as one contiguous run.

    Args:
        A: candidate pattern (iterable of word indices).
        B: sequence to search in (iterable of word indices).

    Returns:
        bool. An empty ``A`` is considered contained in every ``B``.
    """
    items_A = list(A)
    if not items_A:
        return True

    # Wrap both sides in delimiters so a match has to start and end on element
    # boundaries: without them "1,2" would be found inside "11,23".
    str_A = ',' + ','.join(map(str, items_A)) + ','
    str_B = ',' + ','.join(map(str, B)) + ','

    # Search for A inside B; the reverse test would ask whether the whole of B
    # fits inside the pattern.
    return str_A in str_B

# Function to count sequence frequencies in passages
def count_sequence_frequencies(dataset, passages, word_to_index, sequence_dict):
    """Build a passage-by-pattern matrix of match counts.

    Args:
        dataset: only ``dataset.shape[0]`` is read; it fixes the number of rows.
        passages: passage texts, one per row, in row order.
        word_to_index: token -> id mapping handed to ``generate_sequences``.
        sequence_dict: sized iterable of patterns; its iteration order defines
            the column order.

    Returns:
        Float array of shape ``(dataset.shape[0], len(sequence_dict))`` whose
        entry ``[p, s]`` is the number of sentences of passage ``p`` for which
        ``is_subsequence(pattern_s, sentence)`` is true.
    """
    n_rows = dataset.shape[0]
    freq_vector = np.zeros((n_rows, len(sequence_dict)))

    for passage, passage_num in zip(passages, range(n_rows)):
        # sentences of this passage, already encoded as tuples of word ids
        passage_sequences = generate_sequences([passage], word_to_index)

        for seq_num, seq in enumerate(sequence_dict):
            hits = sum(
                1 for passage_seq in passage_sequences if is_subsequence(seq, passage_seq)
            )
            freq_vector[passage_num, seq_num] += hits

    return freq_vector

In [None]:
# Create frequency vectors for training and testing data
# Both calls receive the same `all_sequences` object, so the columns line up
passages_train = X_train['processed_text'].tolist()
passages_test = X_test['processed_text'].tolist()
train_frequency_vectors = count_sequence_frequencies(
    X_train, passages_train, word_to_index, all_sequences
)
test_frequency_vectors = count_sequence_frequencies(
    X_test, passages_test, word_to_index, all_sequences
)

# Convert frequency vectors to DataFrames with one "seq_<n>" column per pattern
feature_names = [f"seq_{i}" for i in range(len(all_sequences))]
df_train = pd.DataFrame(train_frequency_vectors, columns=feature_names)
df_test = pd.DataFrame(test_frequency_vectors, columns=feature_names)


In [None]:
# Logistic Regression with default settings: fit on the training features,
# predict the test set, then report accuracy and per-class metrics
lr_model = LogisticRegression().fit(df_train, y_train)
lr_pred = lr_model.predict(df_test)

lr_accuracy = accuracy_score(y_test, lr_pred)
lr_report = classification_report(y_test, lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.2f}")
print("Classification Report:\n", lr_report)

In [None]:
# Linear-kernel SVM: fit on the training features, predict the test set,
# then report accuracy and per-class metrics
svm_model = SVC(kernel='linear', random_state=42).fit(df_train, y_train)
svm_pred = svm_model.predict(df_test)

svm_accuracy = accuracy_score(y_test, svm_pred)
svm_report = classification_report(y_test, svm_pred)
print(f"SVM (Linear) Accuracy: {svm_accuracy:.2f}")
print("Classification Report:\n", svm_report)

In [None]:
# RBF-kernel SVM: fit on the training features, predict the test set,
# then report accuracy and per-class metrics
svm_rbf_model = SVC(kernel='rbf', random_state=42).fit(df_train, y_train)
svm_rbf_pred = svm_rbf_model.predict(df_test)

svm_rbf_accuracy = accuracy_score(y_test, svm_rbf_pred)
svm_rbf_report = classification_report(y_test, svm_rbf_pred)
print(f"SVM (RBF) Accuracy: {svm_rbf_accuracy:.2f}")
print("Classification Report:\n", svm_rbf_report)

In [None]:
# Random Forest (300 trees, entropy criterion): fit on the training features,
# predict the test set, then report accuracy, per-class metrics and the confusion matrix
rf_model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    criterion='entropy',
).fit(df_train, y_train)
rf_pred = rf_model.predict(df_test)

rf_accuracy = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)
rf_confusion = confusion_matrix(y_test, rf_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")
print("Classification Report:\n", rf_report)
print("Confusion Matrix:\n", rf_confusion)

In [None]:
# Function to convert sequence numbers back to words
def extract_and_concatenate(string, word_dict):
    """Translate the bracketed integers of ``string`` into space-joined words.

    Every integer that stands alone inside ``(...)`` or ``[...]`` is looked up in
    ``word_dict``; an integer with no entry contributes an empty string. The
    looked-up values are joined with single spaces, in order of appearance.
    """
    looked_up = []
    for match in re.finditer(r'\((\d+)\)|\[(\d+)\]', string):
        # exactly one of the two capture groups is filled for any match
        index = int(match.group(1) or match.group(2))
        looked_up.append(word_dict.get(index, ""))
    return " ".join(looked_up)

In [None]:
# Translate each human-text pattern string into its words for inspection
all_sequences_words_human = []
for pattern in sequences_human:
    all_sequences_words_human.append(extract_and_concatenate(str(pattern), index_to_word))