#  **Step1: Install and import libraries**

In [1]:
!pip install -qq datasets
!pip install -qq sentence-transformers transformers scikit-learn pandas
!pip install -qq gdown
from datasets import load_dataset
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda


# **Step 2: Data Loading**

In [2]:
dataset = load_dataset("eli5_category")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/62.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.00M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.76M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.85M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/91772 [00:00<?, ? examples/s]

Generating validation1 split:   0%|          | 0/5446 [00:00<?, ? examples/s]

Generating validation2 split:   0%|          | 0/2375 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5411 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 91772
    })
    validation1: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 5446
    })
    validation2: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 2375
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 5411
    })
})

In [4]:
# Google Drive file IDs
train_file_id = '11LqNAq0q6WwiaN4xCnins3-u6XwBH1aa'
test_file_id = '1yGZe1xKJcrmRGfXo2TisBftq-Icn7Bhc'

# Download the dataset from Google Drive
!gdown --id {train_file_id} -O generated_1000_train_dataset.csv
!gdown --id {test_file_id} -O generated_1000_test_dataset.csv

# alternative link for manual download if needed
print('If you cannot download the dataset using gdown, please use these links:')
print(f'Train dataset: https://drive.google.com/file/d/11LqNAq0q6WwiaN4xCnins3-u6XwBH1aa/view?usp=sharing')
print(f'Test dataset: https://drive.google.com/file/d/1yGZe1xKJcrmRGfXo2TisBftq-Icn7Bhc/view?usp=sharing')

Downloading...
From: https://drive.google.com/uc?id=11LqNAq0q6WwiaN4xCnins3-u6XwBH1aa
To: /content/generated_1000_train_dataset.csv
100% 894k/894k [00:00<00:00, 136MB/s]
Downloading...
From: https://drive.google.com/uc?id=1yGZe1xKJcrmRGfXo2TisBftq-Icn7Bhc
To: /content/generated_1000_test_dataset.csv
100% 2.83M/2.83M [00:00<00:00, 184MB/s]
If you cannot download the dataset using gdown, please use these links:
Train dataset: https://drive.google.com/file/d/11LqNAq0q6WwiaN4xCnins3-u6XwBH1aa/view?usp=sharing
Test dataset: https://drive.google.com/file/d/1yGZe1xKJcrmRGfXo2TisBftq-Icn7Bhc/view?usp=sharing


##  2-1 Prepare dataset to train

In [5]:
def organize_dataset(data):
    questions = []
    answers_list = []

    for item in data:
        answers = item['answers']['text']
        questions.append(item['title'])
        answers_list.append(answers)

    return {
        "questions": questions,
        "answers": answers_list,

    }

In [6]:
train_data = organize_dataset(dataset["train"])
test_data = organize_dataset(dataset["test"])

# Flatten the nested list of answers
human_answers_train = [answer for sublist in train_data['answers'][:1000] for answer in sublist]
human_answers_test = [answer for sublist in test_data['answers'][:1000] for answer in sublist]

In [7]:
path_train = 'generated_1000_train_dataset.csv'  # Adjust the path as necessary
path_test = 'generated_1000_test_dataset.csv'
answers_df_train = pd.read_csv(path_train)
answers_df_test = pd.read_csv(path_test)

In [8]:
model_answers_train = answers_df_train['Generated_answers'].tolist()
model_answers_test = answers_df_test['Generated_answers'].tolist()

## 2-2 Join all answers together

In [9]:
model_answers = model_answers_train + model_answers_test
human_answers = human_answers_train + human_answers_test


data = pd.DataFrame({
    'answer': human_answers + model_answers,
    'label': ['human'] * len(human_answers) + ['machine'] * len(model_answers)
})


# **Step 3: Model1-Logistic Regression**

##  3-1 Embedding all answers

In [10]:
sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
answer_embeddings = sbert_model.encode(data['answer'].tolist(), convert_to_tensor=True, device=device)

##  3-2 Train-test split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    answer_embeddings.cpu().numpy(), data['label'], test_size=0.2, random_state=42, stratify=data['label']
)

##  3-3 Train Logistic Regression model

In [13]:
# Train a Logistic Regression classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

##  3-4 Model evaluation

In [14]:
# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.8498269896193772
Classification Report:
              precision    recall  f1-score   support

       human       0.86      0.95      0.90      1045
     machine       0.82      0.59      0.69       400

    accuracy                           0.85      1445
   macro avg       0.84      0.77      0.79      1445
weighted avg       0.85      0.85      0.84      1445



## 3-5 Predict input

In [15]:
def classify_input(text):
    text_transformed = sbert_model.encode([text], convert_to_tensor=True, device=device).cpu().numpy()
    prediction = clf.predict(text_transformed)
    return prediction[0]


new_input = "They don't mostly. Depends how rural they are, in the US they mostly do run electricity out to those rural houses. Not always though, many people who actually live that far our just use generators. As for sewers and water, they have wells and a septic system. My old house had those and my Mom's does too, it's not particularly rare and you don't even need to be all that rural to need them. Lots of towns have no sewers, or don't provide it for most people."
predicted_label = classify_input(new_input)
print(f'The predicted label for the input is: {predicted_label}')

The predicted label for the input is: human


# **Step 4: Model2-TF-IDF feature Vectorization and Naive bayes**

## 4-1 TF_IDF Vectorization

In [16]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['answer'])

## 4-2 Train-test Split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, data['label'], test_size=0.4, random_state=42, stratify=data['label']
)


## 4-3 Train Naive Bayes Classifier model

In [18]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

## 4-4 Model Evaluation

In [19]:
# Predict on the test set
y_pred = nb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.7238754325259515
Classification Report:
              precision    recall  f1-score   support

       human       0.72      1.00      0.84      2090
     machine       1.00      0.00      0.00       800

    accuracy                           0.72      2890
   macro avg       0.86      0.50      0.42      2890
weighted avg       0.80      0.72      0.61      2890



## 4-5 Predict Input NB

In [20]:
def classify_input(text):
    text_transformed = vectorizer.transform([text])
    prediction = nb.predict(text_transformed)
    return prediction[0]


new_input = "A black hole is a region of spacetime where gravity is so strong that nothing, not even light, can escape from it."
predicted_label = classify_input(new_input)
print(f'The predicted label for the input is: {predicted_label}')

The predicted label for the input is: human
