In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
import re
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import numpy as np
from nltk.stem import WordNetLemmatizer
import nltk

In [None]:
# Download the NLTK 'wordnet' resource (presumably the data the
# WordNetLemmatizer created later in this notebook needs — confirm).
nltk.download('wordnet')

In [3]:
# Load the dataset from a CSV file one directory above the working directory.
# Later cells read its 'content' and 'label' columns.
df = pd.read_csv('../cyber-troll.csv')

In [None]:
# Show the (rows, columns) size of the loaded frame as the cell output.
df.shape

In [6]:
# One shared lemmatizer instance; preprocess_text reads it as a global.
lemmatizer = WordNetLemmatizer()

In [7]:
# Shuffle every row (frac=1) with a fixed seed, then renumber the index from 0
# and discard the old index instead of keeping it as a column.
df = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [8]:
def preprocess_text(tokens):
    """Lemmatize text and return it as one whitespace-normalized string.

    Parameters
    ----------
    tokens : str or iterable of str
        Either an already tokenized sequence, or a raw text string. A raw
        string is first split into (lower-cased) tokens with the notebook's
        ``tweet_tokenize`` helper; a token sequence is used as given.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces, with no leading or
        trailing whitespace.
    """
    if isinstance(tokens, str):
        # Iterating a bare string yields single characters, which would turn
        # "bad dogs" into "b a d d o g s". Tokenize it into words first.
        tokens = tweet_tokenize(tokens)

    lemmas = [lemmatizer.lemmatize(token) for token in tokens]

    # split() + join collapses whitespace runs and drops empty tokens.
    return " ".join(" ".join(lemmas).split())

In [9]:
def tweet_tokenize(texto):
    """Lower-case ``texto`` and split it into tokens with a TweetTokenizer.

    A new TweetTokenizer is created on each call. The tokens are returned
    as a fresh list.
    """
    lowered = texto.lower()
    return list(TweetTokenizer().tokenize(lowered))

In [10]:
def calculate_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for binary 0/1 labels.

    Label 1 is treated as the positive class and label 0 as the negative
    class. Positions where either label is neither 0 nor 1 are not counted.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted labels, aligned position by position with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``. A metric whose denominator
        is zero (including every metric for empty input) is reported as 0.
    """
    # Convert up front so plain lists work and pandas index alignment can
    # never reorder the comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    actual_pos, actual_neg = (y_true == 1), (y_true == 0)
    predicted_pos, predicted_neg = (y_pred == 1), (y_pred == 0)

    # count_nonzero counts in NumPy rather than looping over elements with
    # the builtin sum(); int() keeps the arithmetic below in plain Python.
    tp = int(np.count_nonzero(actual_pos & predicted_pos))
    tn = int(np.count_nonzero(actual_neg & predicted_neg))
    fp = int(np.count_nonzero(actual_neg & predicted_pos))
    fn = int(np.count_nonzero(actual_pos & predicted_neg))

    total = tp + tn + fp + fn
    # Guard the empty case the same way the other three metrics are guarded.
    accuracy = (tp + tn) / total if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, precision, recall, f1

In [11]:
# Run preprocess_text on every entry of the 'content' column.
# The column is overwritten, so re-running this cell processes the
# already processed text again.
df['content'] = df['content'].apply(preprocess_text)

In [12]:
# Per-fold metric accumulators (four separate lists) filled by the
# cross-validation loop.
precisions, recalls, f1_scores, accuracies = [], [], [], []

In [13]:
# Hold out a fixed-size test set of 2000 rows; the seed makes the split repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=2000,
    random_state=42,
)

In [None]:
# Labels for both splits come straight from the 'label' column.
y_train = train_data['label']
y_test = test_data['label']

# TF-IDF features built from tweet_tokenize tokens. The vectorizer is fitted
# on the training split only and then reused to transform the test split.
vectorizer = TfidfVectorizer(
    tokenizer=tweet_tokenize,
    sublinear_tf=True,
    smooth_idf=True,
)
X_train = vectorizer.fit_transform(train_data['content'])
X_test = vectorizer.transform(test_data['content'])

In [15]:
# Base learner for the bagging ensemble; the seed fixes its randomness.
base_classifier = ExtraTreeClassifier(random_state=42)

In [16]:
# Bagging ensemble of 10 estimators built from base_classifier, with a fixed seed.
bagging_classifier = BaggingClassifier(
    estimator=base_classifier,
    n_estimators=10,
    random_state=42,
)

In [17]:
# 10-fold stratified splitter; rows are shuffled with a fixed seed before splitting.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [None]:
# Start from empty metric lists every time this cell runs, so re-executing it
# cannot append a second set of fold results to those left by an earlier run.
accuracies, precisions, recalls, f1_scores = [], [], [], []

for train_index, val_index in skf.split(X_train, y_train):
    # The split yields positional indices, hence .iloc on the label Series.
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # The same estimator object is fitted again on every fold.
    bagging_classifier.fit(X_train_fold, y_train_fold)

    val_predictions = bagging_classifier.predict(X_val_fold)

    accuracy, precision, recall, f1 = calculate_metrics(y_val_fold.values, val_predictions)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f'Fold Accuracy: {accuracy}')
    print(f'Fold Precision: {precision}')
    print(f'Fold Recall: {recall}')
    print(f'Fold F1-score: {f1}')
    print('-------------------------------------')

In [None]:
# Average each metric over the collected folds first ...
mean_accuracy = np.mean(accuracies)
mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)
mean_f1 = np.mean(f1_scores)

# ... then report them in the same order as the per-fold output.
print(f'Mean Accuracy: {mean_accuracy}')
print(f'Mean Precision: {mean_precision}')
print(f'Mean Recall: {mean_recall}')
print(f'Mean F1-score: {mean_f1}')

In [20]:
# The cross-validation loop fits `bagging_classifier` on one fold's training
# subset at a time, so on its own it would only reflect the last such fit.
# Refit on the complete training split so the held-out evaluation uses a model
# trained on all training rows and does not depend on which cell ran last.
bagging_classifier.fit(X_train, y_train)

test_predictions = bagging_classifier.predict(X_test)

test_accuracy, test_precision, test_recall, test_f1 = calculate_metrics(y_test.values, test_predictions)

In [None]:
# Report the metrics computed on the held-out test split.
print(f'Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}')
print(f'Test Recall: {test_recall}')
print(f'Test F1-score: {test_f1}')