In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
import re
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import numpy as np
from nltk.stem import WordNetLemmatizer
import nltk

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Maricondi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df = pd.read_csv('../cyber-troll.csv')

In [5]:
df.shape

(20001, 2)

In [6]:
lemmatizer = WordNetLemmatizer()

In [7]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
def preprocess_text(tokens):
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    processed_text = " ".join(tokens) 
    processed_text = " ".join(processed_text.split())

    return processed_text

In [9]:
def tweet_tokenize(texto):
  tokenizer = TweetTokenizer()
  tokens = tokenizer.tokenize(texto.lower())
  tokens = [token for token in tokens]

  return tokens

In [10]:
def calculate_metrics(y_true, y_pred):
    tp = sum((y_true == 1) & (y_pred == 1))
    tn = sum((y_true == 0) & (y_pred == 0))
    fp = sum((y_true == 0) & (y_pred == 1))
    fn = sum((y_true == 1) & (y_pred == 0))

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, precision, recall, f1

In [11]:
df['content'] = df['content'].apply(preprocess_text)

In [12]:
precisions = []
recalls = []
f1_scores = []
accuracies = []

In [13]:
train_data, test_data = train_test_split(df, test_size=2000, random_state=42)

In [14]:
vectorizer = TfidfVectorizer(tokenizer=tweet_tokenize, sublinear_tf=True, smooth_idf=True)
X_train = vectorizer.fit_transform(train_data['content'])
y_train = train_data['label']
X_test = vectorizer.transform(test_data['content'])
y_test = test_data['label']



In [15]:
base_classifier = ExtraTreeClassifier(random_state=42)

In [16]:
bagging_classifier = BaggingClassifier(estimator=base_classifier, n_estimators=10, random_state=42)

In [17]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [18]:
for train_index, val_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    bagging_classifier.fit(X_train_fold, y_train_fold)

    val_predictions = bagging_classifier.predict(X_val_fold)

    accuracy, precision, recall, f1 = calculate_metrics(y_val_fold.values, val_predictions)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f'Acurácia do fold: {accuracy}')
    print(f'Precisão do fold: {precision}')
    print(f'Revocação do fold: {recall}')
    print(f'F1-score do fold: {f1}')
    print('-------------------------------------')

Acurácia do fold: 0.9250416435313714
Precisão do fold: 0.902127659574468
Revocação do fold: 0.905982905982906
F1-score do fold: 0.9040511727078892
-------------------------------------
Acurácia do fold: 0.9266666666666666
Precisão do fold: 0.9214814814814815
Revocação do fold: 0.8873038516405135
F1-score do fold: 0.9040697674418604
-------------------------------------
Acurácia do fold: 0.9316666666666666
Precisão do fold: 0.9200581395348837
Revocação do fold: 0.9029957203994294
F1-score do fold: 0.9114470842332614
-------------------------------------
Acurácia do fold: 0.9222222222222223
Precisão do fold: 0.914327917282127
Revocação do fold: 0.8830242510699001
F1-score do fold: 0.8984034833091438
-------------------------------------
Acurácia do fold: 0.9338888888888889
Precisão do fold: 0.924198250728863
Revocação do fold: 0.9044222539229672
F1-score do fold: 0.9142033165104543
-------------------------------------
Acurácia do fold: 0.9355555555555556
Precisão do fold: 0.917261055634

In [19]:
mean_accuracy = np.mean(accuracies)
print(f'Acurácia média: {mean_accuracy}')
mean_precision = np.mean(precisions)
print(f'precisão média: {mean_precision}')
mean_recall = np.mean(recalls)
print(f'revocação média: {mean_recall}')
mean_f1 = np.mean(f1_scores)
print(f'F1 média: {mean_f1}')

Acurácia média: 0.9295041643531372
precisão média: 0.9132531207490866
revocação média: 0.9051987189647674
F1 média: 0.9091161365598188


In [20]:
test_predictions = bagging_classifier.predict(X_test)

test_accuracy, test_precision, test_recall, test_f1 = calculate_metrics(y_test.values, test_predictions)

In [21]:
print(f'Acurácia no conjunto de teste: {test_accuracy}')
print(f'Precisão no conjunto de teste: {test_precision}')
print(f'Revocação no conjunto de teste: {test_recall}')
print(f'F1-score no conjunto de teste: {test_f1}')

Acurácia no conjunto de teste: 0.932
Precisão no conjunto de teste: 0.9178082191780822
Revocação no conjunto de teste: 0.9132589838909542
F1-score no conjunto de teste: 0.9155279503105591
