In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Read the pre-processed tweets; the preview is printed before any rows are dropped
dataset = pd.read_csv('new_processed_dataset.csv')
print(dataset.head())

# Keep only the rows whose 'tweet' value is present (non-NaN)
dataset = dataset.loc[dataset['tweet'].notna()]

# Model input is the tweet text; the label to predict is the 'class' column
features, target = dataset['tweet'], dataset['class']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Shared reporting helper for comparing predictions against true labels
def display_metrics(actual, predicted):
    """Print accuracy plus weighted precision, recall and F1 for a set of predictions.

    Args:
        actual: Ground-truth labels.
        predicted: Labels produced by a model, aligned with ``actual``.

    Returns:
        None. The four scores are written to stdout, one per line.
    """
    print("Accuracy:", accuracy_score(actual, predicted))

    # The remaining three scores all use the same 'weighted' averaging across classes
    weighted_scorers = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, scorer in weighted_scorers:
        print(label, scorer(actual, predicted, average='weighted'))


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Binary bag-of-words features (vocabulary capped at 1000 terms) feeding a
# logistic-regression classifier
text_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=1000, binary=True)),
    ('classifier', LogisticRegression()),
])

# Train on the training split, then label the held-out tweets in one chained call
# (Pipeline.fit returns the fitted pipeline itself)
predictions = text_pipeline.fit(features_train, target_train).predict(features_test)

# Report overall accuracy followed by the per-class breakdown
print("Evaluation using Count Vectorization and Logistic Regression:")
print("Accuracy Score:", accuracy_score(target_test, predictions))
print("Classification Report:\n", classification_report(target_test, predictions))

# Function to display additional metrics
def show_metrics(actual, predicted):
    """Print accuracy and weighted precision, recall and F1 for a set of predictions.

    This used to be a line-for-line copy of ``display_metrics`` from the first
    cell. It now delegates to that helper so the reporting logic lives in one
    place; the printed output is unchanged.

    Args:
        actual: Ground-truth labels.
        predicted: Labels produced by a model, aligned with ``actual``.

    Returns:
        None. The scores are written to stdout by ``display_metrics``.
    """
    display_metrics(actual, predicted)

# Print accuracy and the weighted precision/recall/F1 for the count-vectorizer model
show_metrics(actual=target_test, predicted=predictions)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF encoding pipeline: TF-IDF weighted features (vocabulary capped at 1000
# terms) feeding a logistic-regression classifier
pipeline_tfidf = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=1000)),
    ('classifier', LogisticRegression())
])

# Train and evaluate on the split created in the first cell.
# The split variables are features_train / target_train / features_test /
# target_test and the metrics helper is display_metrics; the previous
# X_train / y_train / X_test / y_test / print_metrics names are never defined
# in this notebook and raised NameError on a fresh kernel.
pipeline_tfidf.fit(features_train, target_train)
y_pred_tfidf = pipeline_tfidf.predict(features_test)
print("TF-IDF Encoding")
print("Accuracy:", accuracy_score(target_test, y_pred_tfidf))
print("Classification Report:\n", classification_report(target_test, y_pred_tfidf))
display_metrics(target_test, y_pred_tfidf)

In [None]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that turns each text into one averaged Word2Vec vector.

    ``fit`` trains a gensim ``Word2Vec`` model on the whitespace-tokenized
    input texts. ``transform`` then represents every text as the mean of the
    vectors of its words that the trained model knows.
    """

    def __init__(self, vector_size=100, window=5, min_count=1):
        """Store the Word2Vec hyper-parameters; no model is trained yet.

        Args:
            vector_size: Passed to ``Word2Vec`` as ``vector_size``; also the
                length of the zero vector used for texts with no known words.
            window: Passed to ``Word2Vec`` as ``window``.
            min_count: Passed to ``Word2Vec`` as ``min_count``.
        """
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        # Replaced by the trained Word2Vec model in fit()
        self.model = None

    def fit(self, X, y=None):
        """Train a Word2Vec model on the texts in ``X``.

        Args:
            X: Iterable of strings; each one is split on whitespace into tokens.
            y: Ignored; accepted so the transformer can sit inside a Pipeline.

        Returns:
            This transformer, with ``self.model`` set to the trained model.
        """
        sentences = [text.split() for text in X]
        self.model = Word2Vec(
            sentences=sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
        )
        return self

    def transform(self, X):
        """Convert each text in ``X`` into the mean of its known word vectors.

        Args:
            X: Iterable of strings; each one is split on whitespace into tokens.

        Returns:
            A NumPy array built from one averaged vector per input text. A text
            with no token in the model's vocabulary is represented by a zero
            vector of length ``vector_size``.
        """
        rows = []
        for text in X:
            # Only tokens present in the trained vocabulary contribute to the average
            known_vectors = [
                self.model.wv[token]
                for token in text.split()
                if token in self.model.wv
            ]
            if not known_vectors:
                # Nothing recognised: fall back to a single all-zero vector
                known_vectors = [np.zeros(self.vector_size)]
            rows.append(np.mean(known_vectors, axis=0))

        return np.array(rows)

# Word2Vec encoding pipeline: averaged word vectors -> standardization ->
# logistic regression (max_iter raised to 1000 for this classifier)
pipeline_w2v = Pipeline([
    ('word2vec', Word2VecTransformer(vector_size=100)),  # We don't set max_features for Word2Vec
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train and evaluate on the split created in the first cell.
# The split variables are features_train / target_train / features_test /
# target_test and the metrics helper is display_metrics; the previous
# X_train / y_train / X_test / y_test / print_metrics names are never defined
# in this notebook and raised NameError on a fresh kernel.
pipeline_w2v.fit(features_train, target_train)
y_pred_w2v = pipeline_w2v.predict(features_test)
print("Word2Vec Encoding")
print("Accuracy:", accuracy_score(target_test, y_pred_w2v))
print("Classification Report:\n", classification_report(target_test, y_pred_w2v))
display_metrics(target_test, y_pred_w2v)

In [None]:
# Term-frequency encoding pipeline: raw term counts (vocabulary capped at 1000
# terms) feeding a logistic-regression classifier
pipeline_tf = Pipeline([
    ('vectorizer', CountVectorizer(max_features=1000)),
    ('classifier', LogisticRegression())
])

# Train and evaluate on the split created in the first cell.
# The split variables are features_train / target_train / features_test /
# target_test and the metrics helper is display_metrics; the previous
# X_train / y_train / X_test / y_test / print_metrics names are never defined
# in this notebook and raised NameError on a fresh kernel.
pipeline_tf.fit(features_train, target_train)
y_pred_tf = pipeline_tf.predict(features_test)
print("Term Frequency Encoding")
print("Accuracy:", accuracy_score(target_test, y_pred_tf))
print("Classification Report:\n", classification_report(target_test, y_pred_tf))
display_metrics(target_test, y_pred_tf)