<a href="https://colab.research.google.com/github/shill7/APS360_Project/blob/main/Project_Grace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
from PIL import Image
import urllib.request
import numpy as np
import time
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms
import random
from sklearn.model_selection import train_test_split
import shutil

import pandas as pd
import contractions
import re
import nltk
from nltk.tokenize import word_tokenize
from datasketch import MinHash, MinHashLSH
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [None]:
# Colab setup: install the third-party packages used for contraction expansion
# (`contractions`) and MinHash-based near-duplicate detection (`datasketch`).
# NOTE(review): the import cell above already imports both packages, so on a
# fresh kernel this cell has to be executed before that one.
%pip install contractions
%pip install datasketch

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (113 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.9/113.9 kB[0m 

In [None]:
# Fetch the NLTK resources this notebook relies on:
#   punkt / punkt_tab - tokenizer models (sent_tokenize / word_tokenize are used below)
#   stopwords         - English stop-word list read by extractFeatures
#   wordnet           - NOTE(review): not referenced in the cells visible here; confirm it is needed
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Mount Google Drive at /content/drive; every `data_path` used below points
# underneath this mount, so this cell has to run before any data is read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Loading, Preprocessing, and Splitting Dataset

In [None]:
def clean_text(text, min_words=350, min_sentences=2):
  """Normalize one raw document and reject it if it is unusable or too short.

  Steps, in order: expand contractions, strip every character that is not a
  word character, whitespace, or one of . , ! ? ' " : ; ( ), collapse runs of
  whitespace, and lowercase. Note that hyphens are among the stripped
  characters, so hyphenated words are fused together.

  Args:
    text: Raw document. Anything that is not a `str` (e.g. None or the float
      NaN pandas uses for missing cells) is rejected instead of raising.
    min_words: Minimum number of whitespace-separated tokens the cleaned text
      must have to be kept.
    min_sentences: Minimum number of sentences (per `nltk.sent_tokenize`) the
      cleaned text must have to be kept.

  Returns:
    The cleaned, lowercased text, or None if the input was rejected.
  """
  # Missing / non-text cells cannot be cleaned; treat them like any other
  # rejected sample so callers can drop them with a single dropna().
  if not isinstance(text, str):
    return None

  # Expand contractions
  text = contractions.fix(text)

  # Remove non-alphabetic/numeric symbols except basic punctuation
  text = re.sub(r'[^\w\s.,!?\'":;()]', '', text)

  # Normalize whitespace
  text = re.sub(r'\s+', ' ', text).strip()

  # Lowercase
  text = text.lower()

  # Remove short text (the cheap word count is checked first, so the sentence
  # tokenizer only runs on documents that are long enough)
  if len(text.split()) < min_words or len(nltk.sent_tokenize(text)) < min_sentences:
    return None

  return text

def deduplication(df, text_col='text_clean', threshold=0.9, num_perm=128):
  """Drop near-duplicate documents using MinHash signatures and an LSH index.

  Rows are visited in index order. A row is kept only if the LSH index, which
  holds the signatures of the rows kept so far, reports no match for it; the
  first member of every group of near-duplicates therefore survives. This is
  done in a single pass, so signatures of dropped rows are never retained.

  Args:
    df: DataFrame holding the documents. Assumes index labels are unique,
      since they are used as LSH keys and as drop labels.
    text_col: Name of the column containing the text to compare.
    threshold: Similarity threshold passed to `MinHashLSH`.
    num_perm: Number of permutations used for both `MinHash` and `MinHashLSH`.

  Returns:
    A new DataFrame without the rows flagged as near-duplicates. The number of
    remaining rows is also printed.
  """
  lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
  duplicate_ids = []

  for idx, text in df[text_col].items():
    # Signature over the set of distinct tokens in the document
    signature = MinHash(num_perm=num_perm)
    for token in set(word_tokenize(text)):
      signature.update(token.encode('utf8'))

    # Only kept rows are in the index, so any hit means an earlier kept row
    # already represents this document.
    if lsh.query(signature):
      duplicate_ids.append(idx)
    else:
      lsh.insert(idx, signature)

  deduped_df = df.drop(index=duplicate_ids)
  print(f"Remaining data after deduplication: {len(deduped_df)}")
  return deduped_df

In [None]:
def splitDataset(data_path, seed=42):
  """Clean, deduplicate and split the raw dataset, writing six CSV files.

  Reads `<data_path>/Kaggle/AI_Human.csv`, which must provide a `text` column
  and a `generated` column (0 for human, 1 for AI). Each class is split
  independently into 70% train / 15% validation / 15% test, and the cleaned
  text of each part is written to `<data_path>/<human|ai>_<train|val|test>.csv`
  with a single `text_clean` column.

  Args:
    data_path: Directory containing the `Kaggle` folder; also the output
      directory for the split CSVs.
    seed: `random_state` used for every split, so reruns produce the same files.
  """
  # Reading dataset
  df = pd.read_csv(os.path.join(data_path, 'Kaggle', 'AI_Human.csv'))

  # Clean. Rows whose text is missing are dropped up front so that only real
  # strings reach clean_text; rows it rejects come back as None and are
  # dropped afterwards.
  df_cleaned = (
      df.dropna(subset=['text'])
        .assign(text_clean=lambda frame: frame['text'].apply(clean_text))
        .dropna(subset=['text_clean'])
  )
  print(f"Remaining data after cleaning: {len(df_cleaned)}")

  # Deduplication (deduplication() prints the remaining row count itself, so
  # it is not printed a second time here)
  df_deduped = deduplication(df_cleaned)

  for prefix, generated in (('human', 0), ('ai', 1)):
    # Preprocessed text for this class only
    class_text = df_deduped.loc[df_deduped['generated'] == generated, 'text_clean']

    # Train/Temp Split (70% train, 30% temp)
    train_part, temp_part = train_test_split(class_text, test_size=0.30, random_state=seed)
    # Val/Test Split (30% temp --> 15% Val, 15% Test)
    val_part, test_part = train_test_split(temp_part, test_size=0.50, random_state=seed)

    # Save to CSV files
    for split_name, part in (('train', train_part), ('val', val_part), ('test', test_part)):
      part.to_csv(os.path.join(data_path, f'{prefix}_{split_name}.csv'), index=False, header=True)


In [None]:
def loadTrainValTestData(data_path):
  """Read the six split CSVs from `data_path` and tag each with its class label.

  Files read: `human_{train,val,test}.csv` and `ai_{train,val,test}.csv`.
  A `label` column is added to every frame: 0 for human, 1 for AI.

  Returns:
    Tuple `(human_train, human_val, human_test, ai_train, ai_val, ai_test)`.
  """
  def _read_labelled(source, split, label):
    # Load one split file and attach its constant class label.
    frame = pd.read_csv(os.path.join(data_path, f'{source}_{split}.csv'))
    frame['label'] = label
    return frame

  split_names = ('train', 'val', 'test')
  human_frames = [_read_labelled('human', split, 0) for split in split_names]
  ai_frames = [_read_labelled('ai', split, 1) for split in split_names]

  return (*human_frames, *ai_frames)

In [None]:
# data_path = '/content/drive/My Drive/UofT/APS360 - Project/Data' # Different for everyone
# splitDataset(data_path) # Already ran it once

In [None]:
# Directory on the mounted Drive that holds the six split CSVs.
data_path = '/content/drive/My Drive/UofT/APS360 - Project/Data' # Different for everyone
human_train, human_val, human_test, ai_train, ai_val, ai_test = loadTrainValTestData(data_path)

# Print the per-class row counts of every split to check the class balance.
size_report = (
    ("Training set sizes:", human_train, ai_train),
    ("\nValidation set sizes:", human_val, ai_val),
    ("\nTest set sizes:", human_test, ai_test),
)
for heading, human_part, ai_part in size_report:
  print(heading)
  print("  Human:", len(human_part))
  print("  AI:   ", len(ai_part))

Training set sizes:
  Human: 56519
  AI:    31794

Validation set sizes:
  Human: 12111
  AI:    6813

Test set sizes:
  Human: 12112
  AI:    6814


In [None]:
text = """Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built the first ModelT. Cars have played a major role in our every day lives since then. But now, people are starting to question if limiting car usage would be a good thing. To me, limiting the use of cars might be a good thing to do.

In like matter of this, article, "In German Suburb, Life Goes On Without Cars," by Elizabeth Rosenthal states, how automobiles are the linchpin of suburbs, where middle class families from either Shanghai or Chicago tend to make their homes. Experts say how this is a huge impediment to current efforts to reduce greenhouse gas emissions from tailpipe. Passenger cars are responsible for 12 percent of greenhouse gas emissions in Europe...and up to 50 percent in some carintensive areas in the United States. Cars are the main reason for the greenhouse gas emissions because of a lot of people driving them around all the time getting where they need to go. Article, "Paris bans driving due to smog," by Robert Duffer says, how Paris, after days of nearrecord pollution, enforced a partial driving ban to clear the air of the global city. It also says, how on Monday, motorist with evennumbered license plates were ordered to leave their cars at home or be fined a 22euro fine 31. The same order would be applied to oddnumbered plates the following day. Cars are the reason for polluting entire cities like Paris. This shows how bad cars can be because, of all the pollution that they can cause to an entire city.

Likewise, in the article, "Carfree day is spinning into a big hit in Bogota," by Andrew Selsky says, how programs that's set to spread to other countries, millions of Columbians hiked, biked, skated, or took the bus to work during a carfree day, leaving streets of this capital city eerily devoid of traffic jams. It was the third straight year cars have been banned with only buses and taxis permitted for the Day Without Cars in the capital city of 7 million. People like the idea of having carfree days because, it allows them to lesson the pollution that cars put out of their exhaust from people driving all the time. The article also tells how parks and sports centers have bustled throughout the city uneven, pitted sidewalks have been replaced by broad, smooth sidewalks rushhour restrictions have dramatically cut traffic and new restaurants and upscale shopping districts have cropped up. Having no cars has been good for the country of Columbia because, it has aloud them to repair things that have needed repairs for a long time, traffic jams have gone down, and restaurants and shopping districts have popped up, all due to the fact of having less cars around.

In conclusion, the use of less cars and having carfree days, have had a big impact on the environment of cities because, it is cutting down the air pollution that the cars have majorly polluted, it has aloud countries like Columbia to repair sidewalks, and cut down traffic jams. Limiting the use of cars would be a good thing for America. So we should limit the use of cars by maybe riding a bike, or maybe walking somewhere that isn't that far from you and doesn't need the use of a car to get you there. To me, limiting the use of cars might be a good thing to do."""
# Run the cleaner on the sample essay above; `cleaned` is None if clean_text
# rejects the essay as too short.
cleaned = clean_text(text)

# Print input and output one after the other for a before/after comparison.
print(f"Original text:\n{text}\n")
print(f"Cleaned text:\n{cleaned}")

Original text:
Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built the first ModelT. Cars have played a major role in our every day lives since then. But now, people are starting to question if limiting car usage would be a good thing. To me, limiting the use of cars might be a good thing to do.

In like matter of this, article, "In German Suburb, Life Goes On Without Cars," by Elizabeth Rosenthal states, how automobiles are the linchpin of suburbs, where middle class families from either Shanghai or Chicago tend to make their homes. Experts say how this is a huge impediment to current efforts to reduce greenhouse gas emissions from tailpipe. Passenger cars are responsible for 12 percent of greenhouse gas emissions in Europe...and up to 50 percent in some carintensive areas in the United States. Cars are the main reason for the greenhouse gas emissions because of a lot of people driving them around all the time getting where they need to

In [None]:
text = """"It's official: The electoral college is unfair, outdated, and irrational" Plumer, Source 2. Many do not like the electoral college for these reasons and many others such as it can be a disaster or because it is just plain dumb. Also there are a few reasons why the electoral college should be kept such as avoiding runoff elections or big states, but those not in favor of it out weigh those in favor of it. The people who despise the electoral college are in favor of popular vote since it is the better choice.

For various reasons the electoral college is unfair such as not everyones decisions count just those few people in the electoral college. In a popular vote election everyones vote counts not just those who are considered better than us because they hold authority over people. Those people can also be sneaky and can change votes to be in favor of their choice of president. They will even take bribes sometimes just because they can even though us other people do count.

The system should not even be here today because it is outdated way past our time. "It's hard to say this, but Bob Dole was right: Abolsi the electoral college!" Plumer, Source 2 and many others do agree with this statement because it rather true that we do so instead of let a bunch of monkeys run our states and country, but I am pretty sure that sometimes they could even do a better job than those in office right now. ".....over 60 percent of voters would prefer a direct election to the kind we have now" Plumer, Source 2 every day as we continue that percentage continues to grow and that data was recorded in 2000.

Lastly, the electoral college is irrational like seriously what idiotic person came up with this. I will say this again, but a monkey could of made a better system than this. "Under the electoral college system, voters vote not for the president, but for a slate of electors, who in turn elect the president........Who are the electors? They can be anyone not holding public office. Who picks the electors in the first place? It depends on the state. Sometimes state conventions, sometimes the state party's central committee, soemtimes the presidential candidate themselves. Can voters control whom their electors vote for? Not always. DO voters sometimes get confused about the electors and vote for the wrong candiate? Sometimes" Plumer, Source 2 I know this statement says it all because how could one simply not want popular vote after reading this.

I know that electoral college vote can help and not cause problems, but there are more problems while there is one easy fix which is popular vote. "It's official: The electoral college is unfair, outdated, and irrational" Plumer, Source 2."""

# Second sample: an essay containing quotation marks, apostrophes and runs of
# dots. `cleaned` is None if clean_text rejects it as too short.
cleaned = clean_text(text)

# Print input and output one after the other for a before/after comparison.
print(f"Original text:\n{text}\n")
print(f"Cleaned text:\n{cleaned}")

Original text:
"It's official: The electoral college is unfair, outdated, and irrational" Plumer, Source 2. Many do not like the electoral college for these reasons and many others such as it can be a disaster or because it is just plain dumb. Also there are a few reasons why the electoral college should be kept such as avoiding runoff elections or big states, but those not in favor of it out weigh those in favor of it. The people who despise the electoral college are in favor of popular vote since it is the better choice.

For various reasons the electoral college is unfair such as not everyones decisions count just those few people in the electoral college. In a popular vote election everyones vote counts not just those who are considered better than us because they hold authority over people. Those people can also be sneaky and can change votes to be in favor of their choice of president. They will even take bribes sometimes just because they can even though us other people do count

## Extracting Features

In [None]:
# Cache for the stop-word set so it is fetched from NLTK and converted to a set
# once, instead of on every extractFeatures call.
_STOPWORD_CACHE = {}

def _english_stopwords():
  # Return the English stop-word set, building it on first use only.
  if 'english' not in _STOPWORD_CACHE:
    _STOPWORD_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
  return _STOPWORD_CACHE['english']

def extractFeatures(text):
  """Compute four stylometric features for one document.

  Features, in output order:
    0. average sentence length - tokens per sentence, where every token that
       `word_tokenize` yields (punctuation included) is counted
    1. average word length     - characters per purely alphabetic token
    2. stop-word ratio         - share of alphabetic tokens that are English
                                 stop words (case-sensitive comparison)
    3. lexical diversity       - distinct alphabetic tokens / alphabetic tokens

  Args:
    text: Document to analyse.

  Returns:
    `np.ndarray` of shape (4,) and dtype float32. A feature whose denominator
    would be zero (no sentences, or no alphabetic tokens) is reported as 0.0
    instead of raising ZeroDivisionError.
  """
  sentences = nltk.sent_tokenize(text)
  words_characters = nltk.word_tokenize(text)
  words = [word for word in words_characters if word.isalpha()]

  # 1. Average sentence length
  if sentences:
    avg_sentence_length = sum(len(word_tokenize(s)) for s in sentences) / len(sentences)
  else:
    avg_sentence_length = 0.0

  if words:
    stop_words = _english_stopwords()
    word_count = len(words)

    # 2. Average word length
    avg_word_length = sum(len(w) for w in words) / word_count

    # 3. Stopword ratio
    stopword_ratio = sum(1 for w in words if w in stop_words) / word_count

    # 4. Lexical diversity
    lexical_diversity = len(set(words)) / word_count
  else:
    # No alphabetic tokens: every word-based ratio is undefined, report zeros.
    avg_word_length = stopword_ratio = lexical_diversity = 0.0

  return np.array([avg_sentence_length, avg_word_length, stopword_ratio, lexical_diversity], dtype=np.float32)

In [None]:
# Re-read the six labelled split frames (same path and call as the earlier
# size-report cell) so the feature-extraction cells below have their inputs.
data_path = '/content/drive/My Drive/UofT/APS360 - Project/Data' # Different for everyone
human_train, human_val, human_test, ai_train, ai_val, ai_test = loadTrainValTestData(data_path)

In [None]:
# Merge the human and AI halves of every split into one labelled frame.
# test_df is assembled here as well, although no test features are extracted in this cell.
train_df, val_df, test_df = (
    pd.concat(halves, ignore_index=True)
    for halves in ([human_train, ai_train], [human_val, ai_val], [human_test, ai_test])
)

# One feature row per document, stacked into a 2-D array; labels are taken
# straight from the 'label' column.
X_train, y_train = np.stack(train_df['text_clean'].apply(extractFeatures)), train_df['label'].values
X_val, y_val = np.stack(val_df['text_clean'].apply(extractFeatures)), val_df['label'].values

# Fit the scaler on the training features only, then apply the same
# statistics to both the training and the validation features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
def saveToNumpy(data_path, file, name):
  """Save an array as `<data_path>/<name>.npy`.

  Args:
    data_path: Target directory. It is created (including missing parents) if
      it does not exist yet, so a first run does not fail on a missing folder.
    file: Array (or array-like) to store.
    name: File name without the `.npy` extension.
  """
  os.makedirs(data_path, exist_ok=True)
  np.save(os.path.join(data_path, f'{name}.npy'), file)

def loadFromNumpy(data_path, name):
  """Load and return the array stored at `<data_path>/<name>.npy`."""
  filename = f'{name}.npy'
  array_path = os.path.join(data_path, filename)
  return np.load(array_path)


### Saving and Loading the Extracted Features
The extracted features are saved to disk and loaded back here to avoid rerunning the slow extraction function.

In [None]:
# Folder on Drive where the extracted feature arrays are cached.
temp_path = '/content/drive/My Drive/UofT/APS360 - Project/Data/variables'

# Persist raw features, scaled features and labels for both splits, each
# under the file name matching its variable name.
arrays_to_save = (
    ('X_train', X_train),
    ('X_train_scaled', X_train_scaled),
    ('y_train', y_train),
    ('X_val', X_val),
    ('X_val_scaled', X_val_scaled),
    ('y_val', y_val),
)
for array_name, array in arrays_to_save:
  saveToNumpy(temp_path, array, array_name)

In [None]:
temp_path = '/content/drive/My Drive/UofT/APS360 - Project/Data/variables'
X_train2 = loadFromNumpy(temp_path, 'X_train')
X_train_scaled2 = loadFromNumpy(temp_path, 'X_train_scaled')
y_train2 = loadFromNumpy(temp_path, 'y_train')
X_val2 = loadFromNumpy(temp_path, 'X_val')
X_val_scaled2 = loadFromNumpy(temp_path, 'X_val_scaled')
y_val2 = loadFromNumpy(temp_path, 'y_val')

## Baseline Model

In [None]:
# Parameters set based on proposal
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_model.fit(X_train_scaled2, y_train2)

# Validation
val_preds = svm_model.predict(X_val_scaled2)
print("Validation Results:\n", classification_report(y_val2, val_preds, target_names=["Human", "AI"]))

Validation Results:
               precision    recall  f1-score   support

       Human       0.82      0.95      0.88     12111
          AI       0.89      0.63      0.73      6813

    accuracy                           0.84     18924
   macro avg       0.85      0.79      0.81     18924
weighted avg       0.84      0.84      0.83     18924

