<a href="https://colab.research.google.com/github/sanisammani/Chatbot-/blob/main/Textscam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the first dataset
# This block expects the 'v1' (label) / 'v2' (message) layout read with latin1
# encoding. '/content/scamdata.csv' is the file loaded further down as the
# second dataset, where 'input'/'output' columns are read from it, so the first
# dataset must come from its own file; the notebook's captured error output
# shows this cell looking for '/content/spam.csv'.
data = pd.read_csv('/content/spam.csv', encoding='latin1')
data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Encode labels as 0 (ham) / 1 (spam). Series.map yields NaN for any value that
# is not a key of the mapping; fail here with a clear message instead of
# letting NaN targets reach the classifiers.
label_codes = data['label'].map({'ham': 0, 'spam': 1})
if label_codes.isna().any():
    unexpected = sorted(data.loc[label_codes.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values in column 'v1': {unexpected}")
data['label'] = label_codes.astype(int)

# Missing messages become empty strings so the vectorizer receives only text.
data['message'] = data['message'].fillna('')

# Split the dataset
X = data['message']
y = data['label']
# Hold out 20% for testing. Stratifying on the label keeps the ham/spam ratio
# the same in both partitions, consistent with the stratified split applied to
# the second dataset later in this script.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorize text data
# The vocabulary and IDF weights are fitted on the training messages only; the
# test messages are merely transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Define the models
# Every candidate is trained on the same TF-IDF features. The tree-based
# models are given a fixed random_state.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(kernel='linear'),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Train and evaluate each model
# results maps model name -> {"accuracy", "precision", "recall", "f1"} scored
# on the held-out test split.
results = {}
for model_name, model in models.items():
    model.fit(X_train_vect, y_train)
    y_pred = model.predict(X_test_vect)
    results[model_name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 reports 0 for an undefined ratio (e.g. a model that
        # predicts no positives) without emitting UndefinedMetricWarning.
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
    }

# Convert results to DataFrame for visualization
# Transpose so each model is a row, then move the model name out of the index
# into a regular 'Model' column.
metrics_df = (
    pd.DataFrame(results)
    .T
    .reset_index()
    .rename(columns={'index': 'Model'})
)

# Load the second dataset
scam_data = pd.read_csv('/content/scamdata.csv')
# Remove the instruction prefix from each input, then trim whitespace and
# lowercase. The prefix is fixed text, so it is matched literally
# (regex=False) rather than being interpreted as a regular expression.
scam_data['input'] = (
    scam_data['input']
    .str.replace("Classify this message:", "", regex=False)
    .str.strip()
    .str.lower()
)

# Split the data
# First hold out 20% of the rows, then halve that hold-out into validation and
# test sets. Both splits are stratified on the 'output' column.
train_data, temp_data = train_test_split(
    scam_data, test_size=0.2, random_state=42, stratify=scam_data['output']
)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=42, stratify=temp_data['output']
)

# Placeholder metrics for the scam dataset (replace with actual computations)
# NOTE: these scores are hard-coded constants, not computed by any code in
# this script; they are plotted alongside the measured scores of the models
# above.
scam_results = {
    "Proposed Model (T5)": {
        "accuracy": 0.99,
        "precision": 0.99,
        "recall": 0.99,
        "f1": 0.99,
    },
}

# Append scam dataset results to metrics DataFrame
# Build one row per entry of scam_results (same column order as metrics_df)
# and attach them all with a single concat.
scam_rows = pd.DataFrame([
    {
        "Model": model_label,
        "accuracy": scores["accuracy"],
        "precision": scores["precision"],
        "recall": scores["recall"],
        "f1": scores["f1"],
    }
    for model_label, scores in scam_results.items()
])
metrics_df = pd.concat([metrics_df, scam_rows], ignore_index=True)

# Plot metrics
# Reshape to long form: one row per (Model, Metric) pair with its Score, which
# is the layout the grouped bar plot below consumes.
metrics_melted = metrics_df.melt(
    id_vars="Model",
    var_name="Metric",
    value_name="Score",
)

plt.figure(figsize=(14, 8))
# One group of bars per model, one bar per metric within each group.
sns.barplot(
    data=metrics_melted,
    x="Model",
    y="Score",
    hue="Metric",
    palette="viridis",
)

# Titles and axis labels.
plt.title("Model Evaluation Metrics", fontsize=16)
plt.xlabel("Model", fontsize=12)
plt.ylabel("Score", fontsize=12)

# Fix the score axis to the [0, 1] range and tilt the model names.
plt.ylim(0, 1)
plt.xticks(rotation=45)

plt.legend(title="Metric")
plt.show()



FileNotFoundError: [Errno 2] No such file or directory: '/content/spam.csv'