In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#  **Step1: Install and import libraries**

In [1]:
!pip install datasets



In [2]:
!pip install sentence-transformers transformers scikit-learn pandas


Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0


In [3]:
from datasets import load_dataset
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# **Step 2: Data Loading**

In [5]:
dataset = load_dataset("eli5_category")
dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/62.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.00M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.76M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.85M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/91772 [00:00<?, ? examples/s]

Generating validation1 split:   0%|          | 0/5446 [00:00<?, ? examples/s]

Generating validation2 split:   0%|          | 0/2375 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5411 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 91772
    })
    validation1: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 5446
    })
    validation2: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 2375
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 5411
    })
})

#  2-1 Prepare dataset to train

In [6]:
def organize_dataset(data):
    questions = []
    answers_list = []

    for item in data:
        answers = item['answers']['text']
        questions.append(item['title'])
        answers_list.append(answers)

    return {
        "questions": questions,
        "answers": answers_list,

    }

In [7]:
train_data = organize_dataset(dataset["train"])
test_data = organize_dataset(dataset["test"])

# Flatten the nested list of answers
human_answers_train = [answer for sublist in train_data['answers'][:1000] for answer in sublist]
human_answers_test = [answer for sublist in test_data['answers'][:1000] for answer in sublist]

In [8]:
path_train = '/kaggle/input/generated-1000-train-dataset/generated_1000_train_dataset.csv'
path_test = '/kaggle/input/generated-1000-test-dataset/generated_1000_test_dataset.csv'
answers_df_train = pd.read_csv(path_train)
answers_df_test = pd.read_csv(path_test)

In [9]:
model_answers_train = answers_df_train['Generated_answers'].tolist()
model_answers_test = answers_df_test['Generated_answers'].tolist()

# 2-2 Join all answers together

In [10]:
model_answers = model_answers_train + model_answers_test
human_answers = human_answers_train + human_answers_test


data = pd.DataFrame({
    'answer': human_answers + model_answers,
    'label': ['human'] * len(human_answers) + ['machine'] * len(model_answers)
})


#  *Logistic Regression* 

#  Embedding all answers

In [37]:
sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [38]:
answer_embeddings = sbert_model.encode(data['answer'].tolist(), convert_to_tensor=True, device=device)

Batches:   0%|          | 0/226 [00:00<?, ?it/s]

#  Train-test split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    answer_embeddings.cpu().numpy(), data['label'], test_size=0.2, random_state=42, stratify=data['label']
)

#  Train Logistic Regression model

In [40]:
# Train a Logistic Regression classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

#  Model evaluation

In [41]:
# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.8498269896193772
Classification Report:
              precision    recall  f1-score   support

       human       0.86      0.95      0.90      1045
     machine       0.82      0.59      0.69       400

    accuracy                           0.85      1445
   macro avg       0.84      0.77      0.79      1445
weighted avg       0.85      0.85      0.84      1445



# Predict input

In [42]:

def classify_input(text):
    text_transformed = sbert_model.encode([text], convert_to_tensor=True, device=device).cpu().numpy()
    prediction = clf.predict(text_transformed)
    return prediction[0]

# Example usage
new_input = "They don't mostly. Depends how rural they are, in the US they mostly do run electricity out to those rural houses. Not always though, many people who actually live that far our just use generators. As for sewers and water, they have wells and a septic system. My old house had those and my Mom's does too, it's not particularly rare and you don't even need to be all that rural to need them. Lots of towns have no sewers, or don't provide it for most people."
predicted_label = classify_input(new_input)
print(f'The predicted label for the input is: {predicted_label}')

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The predicted label for the input is: human


# *TF-IDF feature Vectorization and Naive bayes*

#  TF_IDF Vectorization

In [11]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['answer'])

# Train-test Split

In [12]:
# Split the data into training and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, data['label'], test_size=0.4, random_state=42, stratify=data['label']
)


# Train Naive Bayes Classifier model

In [13]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Model Evaluation

In [15]:
# Predict on the test set
y_pred = nb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.7238754325259515
Classification Report:
              precision    recall  f1-score   support

       human       0.72      1.00      0.84      2090
     machine       1.00      0.00      0.00       800

    accuracy                           0.72      2890
   macro avg       0.86      0.50      0.42      2890
weighted avg       0.80      0.72      0.61      2890



# Predict Input NB

In [16]:
def classify_input(text):
    text_transformed = vectorizer.transform([text])
    prediction = nb.predict(text_transformed)
    return prediction[0]

# Example usage
new_input = "A black hole is a region of spacetime where gravity is so strong that nothing, not even light, can escape from it."
predicted_label = classify_input(new_input)
print(f'The predicted label for the input is: {predicted_label}')

The predicted label for the input is: human
