In [None]:
!pip install gradio --quiet

In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import gradio as gr

In [None]:
# Read the raw CSV and keep only the two text columns and the label column.
df = pd.read_csv('data.csv')[
    ['source_txt', 'plagiarism_txt', 'label']
]

In [None]:
# Discard every row in which either text or the label is missing.
df = df.dropna(
    how='any',
    subset=['source_txt', 'plagiarism_txt', 'label'],
)

In [None]:
# Store the label as an integer column (missing labels were dropped beforehand).
df['label'] = df['label'].astype(dtype=int)

In [None]:
# Sanity check on the target column: count of missing values, then the
# distinct label values that are present.
print(
    "NaN in label",
    df.loc[:, 'label'].isna().sum(),
)
print(
    "Unique labels",
    df.loc[:, 'label'].unique(),
)

In [None]:
def clean_text(text):
    """Normalize a piece of text before it is embedded.

    Steps, in order: lowercase, delete digit runs, delete every character
    that is neither a word character nor whitespace, collapse whitespace
    runs to a single space, and trim both ends.

    Args:
        text: The raw text. ``None`` is treated as an empty string; any
            other non-string value is converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    if text is None:
        return ''
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)      # remove digits
    # Remove punctuation; note that `\w` matches "_" so underscores are kept.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace LAST: deleting digits/punctuation above can leave
    # adjacent spaces behind (e.g. "a 1 b" -> "a  b"), so normalizing first
    # would let double spaces survive into the output.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# Run both text columns through clean_text, coercing every value to str first.
for text_column in ('source_txt', 'plagiarism_txt'):
    df[text_column] = df[text_column].astype(str).apply(clean_text)

In [None]:
# Join each source/suspect pair into one string with a ' [SEP] ' marker between
# them. Nothing else in the visible notebook reads this column.
df['combined'] = df['source_txt'].str.cat(df['plagiarism_txt'], sep=' [SEP] ')

In [None]:

!pip install -q sentence-transformers


from sentence_transformers import SentenceTransformer


# Pretrained sentence encoder (~80MB, fast and semantic).
st_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the source column first, then the suspect column, one encode() call each.
source_embeddings, suspect_embeddings = (
    st_model.encode(df[text_column].tolist(), show_progress_bar=True)
    for text_column in ('source_txt', 'plagiarism_txt')
)

import numpy as np

# Feature matrix: each row is the source embedding followed by the suspect embedding.
X = np.hstack([source_embeddings, suspect_embeddings])

# Target vector: the integer label column.
y = df['label']


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Logistic regression classifier, allowed up to 1000 solver iterations.
lr_model = LogisticRegression(
    max_iter=1_000,
)

In [None]:
# Fit the logistic regression model on the training split.
lr_model.fit(X=X_train, y=y_train)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Predict on the held-out split with the fitted logistic regression model.
lr_preds = lr_model.predict(X_test)

# Text report of per-class precision / recall / F1.
print("🔍 Logistic Regression Classification Report")
print(classification_report(y_test, lr_preds))

# Confusion matrix plot. The display labels assume label 0 means "Original"
# and label 1 means "Plagiarized" -- TODO confirm against the dataset.
cm_lr = confusion_matrix(y_test, lr_preds)
disp_lr = ConfusionMatrixDisplay(
    confusion_matrix=cm_lr,
    display_labels=["Original", "Plagiarized"],
)
disp_lr.plot(cmap='Greens')
plt.title("Logistic Regression Confusion Matrix")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:

# Random forest with 100 trees and a fixed seed so training is repeatable.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)


In [None]:
# Fit the random forest on the same training split used for logistic regression.
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split with the fitted random forest, then print the
# per-class precision / recall / F1 report.
y_pred_rf = rf_model.predict(X_test)

print("🔍 Random Forest Classification Report:\n")
print(
    classification_report(y_test, y_pred_rf)
)

In [None]:
# Confusion matrix for the Random Forest model. It must be built from the
# random forest predictions (y_pred_rf); using lr_preds here would just redraw
# the logistic regression matrix. Dedicated cm_rf / disp_rf names keep the
# logistic regression objects (cm_lr / disp_lr) from being overwritten.
# The display labels assume label 0 means "Original" and label 1 means
# "Plagiarized" -- TODO confirm against the dataset.
cm_rf = confusion_matrix(y_test, y_pred_rf)
disp_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=["Original", "Plagiarized"])
disp_rf.plot(cmap='Blues')
plt.title("Random Forest Confusion Matrix")
plt.show()

In [None]:
# Same logistic regression metrics as a nested dict, for tabulating later.
lr_report = classification_report(
    y_true=y_test,
    y_pred=lr_preds,
    output_dict=True,
)

In [None]:
# Same random forest metrics as a nested dict, for tabulating later.
rf_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_rf,
    output_dict=True,
)

In [None]:
# One row per model: overall accuracy plus the support-weighted average of
# precision, recall and F1 taken from each classification report dict.
comparison_df = pd.DataFrame(
    [
        {
            'Model': model_name,
            'Accuracy': report['accuracy'],
            'Precision': report['weighted avg']['precision'],
            'Recall': report['weighted avg']['recall'],
            'F1-score': report['weighted avg']['f1-score'],
        }
        for model_name, report in (
            ('Logistic Regression', lr_report),
            ('Random Forest', rf_report),
        )
    ]
)


In [None]:
import pandas as pd

# Print a heading, then render comparison_df (one row of metrics per model)
# with the notebook's rich display.
print("Model Comparison Results:")
display(comparison_df)

In [None]:
def predict_plagiarism(source, suspect):
    """Classify a (source, suspect) text pair with the logistic regression model.

    Both texts are cleaned with ``clean_text``, embedded separately with
    ``st_model``, and the two embeddings are concatenated side by side to
    form the single feature row passed to ``lr_model``.

    Args:
        source: The reference text (str, or None when the field is empty).
        suspect: The text to check against ``source`` (str or None).

    Returns:
        "Plagiarized" when the model predicts label 1, "Original" for any
        other prediction, or a prompt message when either text is missing
        or has no content left after cleaning.
    """
    missing_input_message = "Please provide both a source text and a suspect text."

    # Reject missing / whitespace-only input before doing any model work:
    # an embedding of an empty string would yield a meaningless verdict.
    if not source or not suspect or not source.strip() or not suspect.strip():
        return missing_input_message

    cleaned_source = clean_text(source)
    cleaned_suspect = clean_text(suspect)
    # Input made only of digits/punctuation is empty once cleaned.
    if not cleaned_source or not cleaned_suspect:
        return missing_input_message

    src_emb = st_model.encode([cleaned_source])
    sus_emb = st_model.encode([cleaned_suspect])
    # Same feature layout as training: source embedding, then suspect embedding.
    combined_emb = np.hstack((src_emb, sus_emb))
    pred = lr_model.predict(combined_emb)[0]
    return "Plagiarized" if pred == 1 else "Original"

# Gradio front end: two five-line text boxes are passed to predict_plagiarism
# and the string it returns is shown as plain text.
iface = gr.Interface(
    predict_plagiarism,
    [
        gr.Textbox(label="Source Text", lines=5),
        gr.Textbox(label="Suspect Text", lines=5),
    ],
    "text",
    title="Plagiarism Detector (Sentence-BERT)",
    description="Detect plagiarism using semantic similarity from Sentence Transformers (MiniLM).",
)

# Start the app with debug mode enabled.
iface.launch(debug=True)