In [8]:

# Install required packages
!pip install pandas scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the cleaned combined dataset
file_path = "/content/drive/My Drive/NLP/combined_languages_data_clean.csv"
raw_df = pd.read_csv(file_path)

# Binary target encoding: the two values of `type` that are kept, and the
# integer label each one maps to.
LABEL_MAP = {'human_text': 0, 'machine_generated': 1}

# Texts must have strictly more than this many whitespace-separated tokens.
MIN_WORDS = 10

# Build the modelling frame in one chain so that every step returns a new
# DataFrame. Adding `label` with .assign (instead of `df['label'] = ...` on a
# boolean-mask slice) avoids pandas' SettingWithCopyWarning and leaves
# `raw_df` untouched for inspection.
df = (
    raw_df
    # Rows without text or without a type cannot be labelled.
    .dropna(subset=["clean_text", "type"])
    # Keep only the two classes of interest.
    .loc[lambda frame: frame['type'].isin(list(LABEL_MAP))]
    # Map type to a binary label column.
    .assign(label=lambda frame: frame['type'].map(LABEL_MAP))
    # Filter short texts.
    .loc[lambda frame: frame['clean_text'].str.split().str.len() > MIN_WORDS]
)

# Prepare input and output
X = df['clean_text']
y = df['label']

print(f"Dataset shape: {df.shape}")
print(f"Label distribution:\n{df['label'].value_counts()}")

# Hold out 20% of the samples for evaluation. Stratifying on y is intended to
# keep the class proportions comparable in both partitions, and the fixed
# seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# TF-IDF features over unigrams and bigrams, with the min_df and max_features
# limits given below. The vectoriser is fitted on the training texts only;
# the test texts are merely transformed with it, so no test data is used
# when fitting.
tfidf_options = {"ngram_range": (1, 2), "min_df": 5, "max_features": 3000}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# Fit a logistic regression classifier on the training TF-IDF matrix.
clf = LogisticRegression(C=0.5, max_iter=300).fit(X_train_tfidf, y_train)

# Predict the held-out split and print per-class precision / recall / F1.
# The display names are listed in label order: 0 -> "Human", 1 -> "Machine".
y_pred = clf.predict(X_test_tfidf)
class_names = ["Human", "Machine"]
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)



Dataset shape: (7379, 8)
Label distribution:
label
0    4366
1    3013
Name: count, dtype: int64
              precision    recall  f1-score   support

       Human       0.84      0.95      0.89       873
     Machine       0.91      0.74      0.82       603

    accuracy                           0.86      1476
   macro avg       0.88      0.85      0.85      1476
weighted avg       0.87      0.86      0.86      1476

