In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Data Preparation
data = pd.read_csv('AI_Human.csv')

df = pd.DataFrame(data)

# Step 2: Feature Extraction using TF-IDF (unigrams)
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['text']).toarray()
y = df['generated']

# Step 3: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Model Training
# Train Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Train Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Step 5: Model Evaluation
# Predict with Naive Bayes
nb_probs = nb_model.predict_proba(X_test)[:, 1]
nb_predictions = nb_model.predict(X_test)

# Predict with Logistic Regression
lr_probs = lr_model.predict_proba(X_test)[:, 1]
lr_predictions = lr_model.predict(X_test)

# Combine predictions (e.g., averaging probabilities)
combined_probs = (nb_probs + lr_probs) / 2
combined_predictions = (combined_probs > 0.5).astype(int)

# Print Evaluation Metrics
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_predictions))
print("Naive Bayes Classification Report:\n", classification_report(y_test, nb_predictions))

print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_predictions))
print("Logistic Regression Classification Report:\n", classification_report(y_test, lr_predictions))

print("Combined Model Accuracy:", accuracy_score(y_test, combined_predictions))
print("Combined Model Classification Report:\n", classification_report(y_test, combined_predictions))

# Step 6: Make Prediction
# Predict with Naive Bayes
new_text_vector = vectorizer.transform(["Artificial Intelligence is going to be the supreme concern in few decades."]).toarray()
nb_new_text_probs = nb_model.predict_proba(new_text_vector)[:, 1]
nb_new_text_prediction = nb_model.predict(new_text_vector)

# Predict with Logistic Regression
lr_new_text_probs = lr_model.predict_proba(new_text_vector)[:, 1]
lr_new_text_prediction = lr_model.predict(new_text_vector)

# Combine predictions
combined_new_text_probs = (nb_new_text_probs + lr_new_text_probs) / 2
combined_new_text_prediction = (combined_new_text_probs > 0.5).astype(int)

print("Naive Bayes New Text Probabilities:", nb_new_text_probs)
print("Naive Bayes New Text Prediction (0 = Human, 1 = AI):", nb_new_text_prediction[0])

print("Logistic Regression New Text Probabilities:", lr_new_text_probs)
print("Logistic Regression New Text Prediction (0 = Human, 1 = AI):", lr_new_text_prediction[0])

print("Combined New Text Probabilities:", combined_new_text_probs)
print("Combined New Text Prediction (0 = Human, 1 = AI):", combined_new_text_prediction[0])


Naive Bayes Accuracy: 0.9162556184195223
Naive Bayes Classification Report:
               precision    recall  f1-score   support

         0.0       0.90      0.98      0.94     91597
         1.0       0.96      0.81      0.88     54574

    accuracy                           0.92    146171
   macro avg       0.93      0.90      0.91    146171
weighted avg       0.92      0.92      0.91    146171

Logistic Regression Accuracy: 0.9868646995642091
Logistic Regression Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     91597
         1.0       0.99      0.98      0.98     54574

    accuracy                           0.99    146171
   macro avg       0.99      0.99      0.99    146171
weighted avg       0.99      0.99      0.99    146171

Combined Model Accuracy: 0.9797018560453168
Combined Model Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      0.99     