In [8]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import joblib

In [9]:
# Download stopwords if not already present
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91700\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Load dataset
df = pd.read_csv("C:/Users/91700/Downloads/updated_dataset.csv")  # Change path if needed


In [11]:
# Check class balance
print("Class Distribution:\n", df['generated'].value_counts())

# Balance dataset (if needed)
df_human = df[df['generated'] == 0]
df_ai = df[df['generated'] == 1].sample(len(df_human), random_state=42)  # Undersample AI texts
df_balanced = pd.concat([df_human, df_ai]).reset_index(drop=True)

print("Balanced Class Distribution:\n", df_balanced['generated'].value_counts())


Class Distribution:
 generated
1    277
0     94
Name: count, dtype: int64
Balanced Class Distribution:
 generated
0    94
1    94
Name: count, dtype: int64


In [12]:
# Text Preprocessing Function
def clean_text(text):
    if pd.isna(text):
        return ""  # Handle NaN values
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = ' '.join([word for word in text.split() if word not in STOPWORDS])  # Remove stopwords
    return text

# Apply preprocessing
df_balanced['text'] = df_balanced['text'].apply(clean_text)

In [13]:
# Train-Test Split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(df_balanced['text'], df_balanced['generated'], test_size=0.2, random_state=42)


In [14]:
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Save vectorizer for future use
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']

In [15]:
# Train SVM Model
svm_model = SVC(kernel="linear", probability=True)
svm_model.fit(X_train_tfidf, y_train)

# Evaluate SVM Model
y_pred_svm = svm_model.predict(X_test_tfidf)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Model Accuracy: {accuracy_svm:.2f}")

# Save the trained SVM model
joblib.dump(svm_model, "svm_ai_text_detector.pkl")

# Load Model & Vectorizer for Prediction
svm_model = joblib.load("svm_ai_text_detector.pkl")
vectorizer = joblib.load("vectorizer.pkl")

SVM Model Accuracy: 0.32


In [16]:
# AI Text Detection Function
def detect_text(input_text):
    """
    Function to detect whether a given text is AI-generated or human-written.
    """
    cleaned_text = clean_text(input_text)  # Preprocess input
    vectorized_text = vectorizer.transform([cleaned_text])  # Convert to TF-IDF
    prediction = svm_model.predict(vectorized_text)[0]  # Predict
    
    return "AI-Generated" if prediction == 1 else "Human-Written"

# Example Usage (Test with a known human-written sentence)
input_text = "myself soham"
result = detect_text(input_text)
print(f"Detection Result: {result}")


Detection Result: Human-Written
