In [8]:
!pip install spacy

Collecting spacy
  Obtaining dependency information for spacy from https://files.pythonhosted.org/packages/d6/9e/8afc618cfed4b5dc602b11754d4d9193a268439704defae301bffca7f04c/spacy-3.6.1-cp311-cp311-win_amd64.whl.metadata
  Downloading spacy-3.6.1-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.4-py3-none-any.whl (11 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.9-cp311-cp311-win_amd64.whl (18 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.7-cp311-cp311-win_amd64.whl (28 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.8-cp311-cp311-win_amd64.whl (91 kB)
     ---------------------------------------- 0.0/91.9 kB ? eta -:--:--
     ---------------------------------------- 91.9/91.9 kB 5.4 MB/s eta 0:00:00
Colle

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import spacy

# spaCy English pipeline used by the text preprocessing step
nlp = spacy.load("en_core_web_sm")

# Input files: one CSV of comments per YouTube channel
csv_files = [
    'data/Youtube01-Psy.csv',
    'data/Youtube02-KatyPerry.csv',
    'data/Youtube03-LMFAO.csv',
    'data/Youtube04-Eminem.csv',
    'data/Youtube05-Shakira.csv',
]

# Metric accumulators: one entry is appended per (CSV file, classifier) run
all_accuracies, all_precisions, all_recalls, all_f1_scores = [], [], [], []

def preprocess_text(text):
    """Lemmatize ``text`` with spaCy, dropping stop words and punctuation.

    Defined once ahead of the loop rather than being re-created for every CSV file.

    Args:
        text: Raw comment string.

    Returns:
        The lemmas of the remaining tokens, joined by single spaces.
    """
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc if not token.is_stop and not token.is_punct)


for csv_file in csv_files:
    # Load the comments, normalise the column names and keep only complete rows
    data = pd.read_csv(csv_file).rename(columns={'CONTENT': 'text', 'CLASS': 'label'})
    data = data[['text', 'label']].dropna()

    # Tokenize and lemmatize the text with spaCy
    data['text'] = data['text'].apply(preprocess_text)

    texts = data['text'].values
    y = data['label'].values

    # Split the *raw text* first so the vectorizer never sees the test comments.
    text_train, text_test, y_train, y_test = train_test_split(
        texts, y, test_size=0.2, random_state=42
    )

    # Fit TF-IDF on the training split only; fitting on the full corpus would leak
    # test-set vocabulary and document frequencies into training.
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    # Define classifiers and the ensemble. The tree-based models are seeded so a
    # re-run reproduces the same metrics (the split is already seeded with 42).
    random_forest = RandomForestClassifier(random_state=42)
    svm = SVC()
    gradient_boosting = GradientBoostingClassifier(random_state=42)
    ensemble_classifier = VotingClassifier(estimators=[
        ('Random Forest', random_forest),
        ('SVM', svm),
        ('Gradient Boosting', gradient_boosting)
    ], voting='hard')

    # Train and evaluate each classifier, then the ensemble
    classifiers = [random_forest, svm, gradient_boosting, ensemble_classifier]

    for classifier in classifiers:
        classifier.fit(X_train, y_train)

        # Make predictions on the held-out test set
        predictions = classifier.predict(X_test)

        # Evaluation metrics; label 1 is treated as the positive ("Spam") class
        accuracy = accuracy_score(y_test, predictions)
        precision = precision_score(y_test, predictions, pos_label=1)
        recall = recall_score(y_test, predictions, pos_label=1)
        f1 = f1_score(y_test, predictions, pos_label=1)

        # Store results in the shared accumulators
        all_accuracies.append(accuracy)
        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1_scores.append(f1)

        # Print results, naming the dataset so each run can be told apart
        classifier_name = classifier.__class__.__name__
        print(f"Dataset: {csv_file}")
        print(f"Classifier: {classifier_name}")
        print(f"Accuracy: {accuracy}")
        print(f"Precision (Spam): {precision}")
        print(f"Recall (Spam): {recall}")
        print(f"F1 Score (Spam): {f1}")

        # Pin labels to [0, 1] so the matrix is always 2x2 and lines up with the
        # hard-coded tick labels, even if a class is missing from this test split.
        # NOTE(review): assumes the label column is coded 0 = not spam, 1 = spam.
        cm = confusion_matrix(y_test, predictions, labels=[0, 1])
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Not Spam', 'Spam'], yticklabels=['Not Spam', 'Spam'], ax=ax)
        ax.set_xlabel('Predicted Labels')
        ax.set_ylabel('True Labels')
        ax.set_title(f'Confusion Matrix: {classifier_name} ({csv_file})')
        plt.show()
        # One figure per (file, classifier) run: release each once it is shown
        plt.close(fig)

        print("=" * 50)

# After processing all CSV files, you can analyze the combined results as needed.


ModuleNotFoundError: No module named 'spacy'

In [None]:
# Label code -> display name lookup. Left as a bare expression, so running the
# cell simply displays the dict.
{
    0: "Not Spam",
    1: "Spam Comment",
}