In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np

# Read the processed dataset from disk and preview its first rows
dataset = pd.read_csv('new_processed_dataset.csv')
print(dataset.head())

# Drop repeated rows first, then every row that still has a missing value
dataset = dataset.drop_duplicates().dropna()

# Model input is the 'tweet' column; the label to predict is the 'class' column
features, target = dataset['tweet'], dataset['class']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
(
    features_train,
    features_test,
    target_train,
    target_test,
) = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Helper that reports accuracy plus weighted precision, recall and F1
def display_metrics(actual, predicted):
    """Print four summary scores comparing predictions with the true labels.

    Args:
        actual: Ground-truth labels.
        predicted: Labels produced by a model, aligned with ``actual``.

    Precision, recall and F1 are computed with ``average='weighted'``.
    Nothing is returned; each score is printed on its own line.
    """
    accuracy = accuracy_score(actual, predicted)
    print("Accuracy:", accuracy)

    precision = precision_score(actual, predicted, average='weighted')
    print("Precision:", precision)

    recall = recall_score(actual, predicted, average='weighted')
    print("Recall:", recall)

    f1 = f1_score(actual, predicted, average='weighted')
    print("F1 Score:", f1)

# Baseline model: token counts (vocabulary capped at 5000 terms) fed to Multinomial Naive Bayes
nb_pipeline = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(max_features=5000)),
        ('classifier', MultinomialNB()),
    ]
)

# Learn the vocabulary and the classifier from the training split
nb_pipeline.fit(X=features_train, y=target_train)

# Label the held-out test split
predictions = nb_pipeline.predict(X=features_test)

# Report how the predictions compare with the true test labels
print("Evaluation Results:")
display_metrics(target_test, predictions)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Binary bag-of-words (term present / absent, vocabulary capped at 1000 terms) + Multinomial Naive Bayes
text_pipeline = Pipeline(
    steps=[
        ('count_vectorizer', CountVectorizer(binary=True, max_features=1000)),
        ('naive_bayes', MultinomialNB()),
    ]
)

# Fit vectorizer and classifier on the training split
text_pipeline.fit(X=features_train, y=target_train)

# Predict labels for the held-out test split
test_predictions = text_pipeline.predict(X=features_test)

# Per-class precision / recall / F1 table
print("Text Classification Evaluation with Naive Bayes:")
print(
    "Classification Report:\n",
    classification_report(y_true=target_test, y_pred=test_predictions),
)

# Helper that prints accuracy together with weighted precision, recall and F1
def print_performance_metrics(true_labels, predicted_labels):
    """Print accuracy and weighted-average precision, recall and F1.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Model predictions aligned with ``true_labels``.

    Returns nothing; one line is printed per score.
    """
    weighted = {'average': 'weighted'}
    # Each scorer is wrapped in a lambda so it is evaluated only when its row is printed.
    rows = (
        ("Accuracy Score:", lambda: accuracy_score(true_labels, predicted_labels)),
        ("Precision Score:", lambda: precision_score(true_labels, predicted_labels, **weighted)),
        ("Recall Score:", lambda: recall_score(true_labels, predicted_labels, **weighted)),
        ("F1 Score:", lambda: f1_score(true_labels, predicted_labels, **weighted)),
    )
    for label, compute in rows:
        print(label, compute())

# Summary scores for the binary bag-of-words pipeline
print_performance_metrics(target_test, test_predictions)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# TF-IDF weighted features (vocabulary capped at 1000 terms) + Multinomial Naive Bayes
tfidf_pipeline = Pipeline(
    steps=[
        ('tfidf_vectorizer', TfidfVectorizer(max_features=1000)),
        ('naive_bayes_classifier', MultinomialNB()),
    ]
)

# Fit vectorizer and classifier on the training split
tfidf_pipeline.fit(X=features_train, y=target_train)

# Predict labels for the held-out test split
predicted_labels_tfidf = tfidf_pipeline.predict(X=features_test)

# Per-class precision / recall / F1 table for the TF-IDF model
print("Evaluation of TF-IDF Encoding with Naive Bayes Classifier:")
print(
    "Classification Report:\n",
    classification_report(y_true=target_test, y_pred=predicted_labels_tfidf),
)

# Helper that prints accuracy together with weighted precision, recall and F1
def show_performance_metrics(true_values, predicted_values):
    """Print accuracy and weighted-average precision, recall and F1.

    Args:
        true_values: Ground-truth labels.
        predicted_values: Model predictions aligned with ``true_values``.

    Returns nothing; one line is printed per score.
    """
    def weighted(metric):
        # Apply a scorer to the two label sequences using weighted averaging.
        return metric(true_values, predicted_values, average='weighted')

    print("Accuracy:", accuracy_score(true_values, predicted_values))
    print("Precision:", weighted(precision_score))
    print("Recall:", weighted(recall_score))
    print("F1 Score:", weighted(f1_score))

# Summary scores for the TF-IDF pipeline predictions
show_performance_metrics(target_test, predicted_labels_tfidf)


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn.metrics import classification_report

# Custom Transformer for Word2Vec
class CustomWord2VecTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that embeds texts as averaged Word2Vec vectors.

    ``fit`` trains a gensim ``Word2Vec`` model on the whitespace-tokenized
    input texts. ``transform`` maps each text to the mean of the vectors of
    those tokens that are present in the trained model.

    Parameters
    ----------
    vector_size : int, default=100
        Forwarded to ``Word2Vec`` as ``vector_size``; also the length of the
        all-zero row emitted for texts with no known token.
    window : int, default=5
        Forwarded to ``Word2Vec`` as ``window``.
    min_count : int, default=1
        Forwarded to ``Word2Vec`` as ``min_count``.
    """

    def __init__(self, vector_size=100, window=5, min_count=1):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        # Set by fit(); None marks the transformer as not yet fitted.
        self.word2vec_model = None

    def fit(self, X, y=None):
        """Train the Word2Vec model on the texts in ``X``.

        Parameters
        ----------
        X : iterable of str
            Texts; each one is tokenized with ``str.split()``.
        y : ignored
            Accepted only for scikit-learn pipeline compatibility.

        Returns
        -------
        self
        """
        tokenized_texts = [text.split() for text in X]
        self.word2vec_model = Word2Vec(
            sentences=tokenized_texts,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
        )
        return self

    def transform(self, X):
        """Convert each text in ``X`` to one averaged word-vector row.

        Parameters
        ----------
        X : iterable of str
            Texts; each one is tokenized with ``str.split()``.

        Returns
        -------
        numpy.ndarray
            One row per input text. A text with no token known to the model
            becomes a row of zeros. Empty input yields an array of shape
            ``(0, vector_size)``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        if self.word2vec_model is None:
            # Local import keeps this class self-contained. NotFittedError subclasses
            # AttributeError, so code that caught the old failure still catches this one.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "CustomWord2VecTransformer is not fitted yet; call fit() before transform()."
            )

        # Look the keyed vectors up once instead of once per token.
        word_vectors = self.word2vec_model.wv

        def compute_feature_vector(text):
            vectors = [word_vectors[token] for token in text.split() if token in word_vectors]
            if not vectors:
                # No known token: fall back to a zero vector of the configured size.
                return np.zeros(self.vector_size)
            return np.mean(vectors, axis=0)

        feature_rows = [compute_feature_vector(text) for text in X]
        if not feature_rows:
            # Keep the output two-dimensional even when there are no samples.
            return np.empty((0, self.vector_size))
        return np.array(feature_rows)

# Define the pipeline with the Word2Vec transformer and Gaussian Naive Bayes
w2v_pipeline = Pipeline([
    ('word2vec_transformer', CustomWord2VecTransformer(vector_size=100)),  # 100-dimensional averaged word vectors
    ('naive_bayes_classifier', GaussianNB())  # Gaussian variant: the averaged embeddings are dense real-valued features
])

# Fit the pipeline on the training data.
# The split produced by train_test_split is named features_* / target_*;
# X_train / y_train / X_test / y_test are never defined in this notebook.
w2v_pipeline.fit(features_train, target_train)

# Predict the labels for the test set
predicted_labels_w2v = w2v_pipeline.predict(features_test)

# Print the classification report for the Word2Vec model
print("Word2Vec Encoding with Naive Bayes Classifier:")
print("Classification Report:\n", classification_report(target_test, predicted_labels_w2v))

# Function to display performance metrics
def display_metrics(true_labels, predicted_labels):
    """Print accuracy and weighted-average precision, recall and F1.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Model predictions aligned with ``true_labels``.

    Returns nothing; one line is printed per score.
    """
    print("Accuracy:", accuracy_score(true_labels, predicted_labels))
    print("Precision:", precision_score(true_labels, predicted_labels, average='weighted'))
    print("Recall:", recall_score(true_labels, predicted_labels, average='weighted'))
    print("F1 Score:", f1_score(true_labels, predicted_labels, average='weighted'))

# Display detailed performance metrics
display_metrics(target_test, predicted_labels_w2v)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Define the pipeline for Term Frequency encoding with Naive Bayes
tf_pipeline = Pipeline([
    ('tf_vectorizer', CountVectorizer(max_features=1000)),  # Convert text data into term frequency features
    ('naive_bayes', MultinomialNB())  # Classifier
])

# Fit the pipeline on the training data.
# The split produced by train_test_split is named features_* / target_*;
# X_train / y_train / X_test / y_test are never defined in this notebook.
tf_pipeline.fit(features_train, target_train)

# Predict the labels for the test data
predicted_labels_tf = tf_pipeline.predict(features_test)

# Print the classification report for the Term Frequency encoding model
print("Term Frequency Encoding with Naive Bayes Classifier:")
print("Classification Report:\n", classification_report(target_test, predicted_labels_tf))

# Function to display additional performance metrics
def display_performance_metrics(true_labels, predicted_labels):
    """Print accuracy and weighted-average precision, recall and F1.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Model predictions aligned with ``true_labels``.

    Returns nothing; one line is printed per score.
    """
    print("Accuracy:", accuracy_score(true_labels, predicted_labels))
    print("Precision:", precision_score(true_labels, predicted_labels, average='weighted'))
    print("Recall:", recall_score(true_labels, predicted_labels, average='weighted'))
    print("F1 Score:", f1_score(true_labels, predicted_labels, average='weighted'))

# Show detailed performance metrics
display_performance_metrics(target_test, predicted_labels_tf)
