In [None]:
!pip install fasttext-wheel

In [None]:
!git clone https://github.com/facebookresearch/fastText.git
%cd fastText
!pip install .

In [5]:
import numpy as np, pandas as pd
import fasttext
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import os
import string
import nltk
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import re
from collections import defaultdict
from nltk.stem import WordNetLemmatizer

In [6]:
# Location of the raw dataset: <home>/Downloads/dataset.txt.
# os.path.expanduser("~") resolves the home directory on Windows as well as on Linux/macOS,
# whereas reading os.environ["USERPROFILE"] directly raises KeyError outside Windows.
dataset_path = os.path.join(os.path.expanduser("~"), "Downloads", "dataset.txt")

In [7]:
# Load the dataset into memory, one record per list element.
with open(dataset_path, 'r') as file:
    # Make sure every record ends with a newline. If the file's last line has none, it would
    # be glued to the following record once the list is shuffled and written back out with
    # writelines(), silently corrupting two examples.
    data = [line if line.endswith('\n') else line + '\n' for line in file]

In [None]:
# Download the 'wordnet' resource through NLTK's downloader
# (presumably required by the WordNetLemmatizer used for lemmatization — confirm).
nltk.download('wordnet')

In [None]:
# Preview only the first few records; printing the whole dataset floods the cell output
# and bloats the saved notebook.
print(data[:5])

In [10]:
# Parallel lists holding the label and preprocessed text of every training record.
labels, texts = [], []

# Overall precision / recall reported for each cross-validation fold.
precisions, recalls = [], []

# Flat lists of every per-label precision / recall value seen across all folds.
p, r = [], []

# Predicted vs. true labels collected on the held-out test split.
predicted_labels, real_labels = [], []

# Per-label history of precision / recall, keyed by label.
precision_dict, recall_dict = defaultdict(list), defaultdict(list)

# Text-processing helpers shared by preprocess_text().
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

In [12]:
def preprocess_text(text):
    """Normalise a raw text string for training / prediction.

    The text is lower-cased, split with the module-level ``tokenizer``, each token is passed
    through the module-level ``lemmatizer``, and the result is re-joined with single spaces.

    Args:
        text: The raw input string.

    Returns:
        The processed string, with tokens separated by exactly one space and no leading or
        trailing whitespace.
    """
    lemmas = (lemmatizer.lemmatize(tok) for tok in tokenizer.tokenize(text.lower()))
    joined = " ".join(lemmas)
    # Split and re-join so that any whitespace run left inside the joined string
    # collapses to a single space.
    return " ".join(joined.split())


In [13]:
# Seed NumPy's global RNG so the in-place shuffle of the records is reproducible.
SHUFFLE_SEED = 42
np.random.seed(SHUFFLE_SEED)
np.random.shuffle(data)  # reorders `data` in place

In [14]:
# The first TRAIN_SIZE shuffled records form the training split.
TRAIN_SIZE = 18001
train_data = data[:TRAIN_SIZE]

In [15]:
# Every record after the training slice is held out as the test split.
test_data = data[len(train_data):]

In [16]:
# Persist both splits to disk, one file per split, writing the records unchanged.
for split_path, split_lines in (('cyber.train', train_data), ('cyber.test', test_data)):
    with open(split_path, 'w') as file:
        file.writelines(split_lines)

In [17]:
# Sanity check: shape of the training split as a 1-tuple (number of records,).
(len(train_data),)

(18001,)

In [18]:
# Sanity check: number of records in the held-out test split.
len(test_data)

2000

In [19]:
# Parse the training split into the parallel `labels` / `texts` lists.
# Both lists are rebuilt from scratch here so that re-running this cell does not append the
# same examples a second time (which would silently duplicate the training data).
labels = []
texts = []
with open('cyber.train', 'r') as file:
    for line in file:
        # Each record is "<label> <text>": split on the first space only so the text
        # keeps its own spaces.
        label, text = line.strip().split(' ', 1)
        labels.append(label)
        texts.append(preprocess_text(text))

In [21]:
# 10-fold stratified splitter; shuffling is enabled with a fixed random_state so the folds
# are the same on every run.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [None]:
# Stratified cross-validation: for every fold, write the fold's train/test files, train a
# supervised fastText model on the train file and record overall and per-label metrics.
#
# The accumulators are re-created here so that re-running this cell starts from a clean
# slate instead of mixing in metrics left over from a previous run.
precisions = []
recalls = []
p = []
r = []
precision_dict = defaultdict(list)
recall_dict = defaultdict(list)

for i, (train_index, test_index) in enumerate(skf.split(texts, labels)):

    train_file = f"train_fold_{i}.txt"
    test_file = f"test_fold_{i}.txt"

    # `idx` (not `i`) is used inside the comprehensions so the fold number is never
    # visually confused with a sample index.
    train_texts = [texts[idx] for idx in train_index]
    train_labels = [labels[idx] for idx in train_index]
    test_texts = [texts[idx] for idx in test_index]
    test_labels = [labels[idx] for idx in test_index]

    # Write the fold in "<label> <text>" format, one example per line.
    with open(train_file, 'w', encoding="utf-8") as train_f:
        for label, text in zip(train_labels, train_texts):
            train_f.write(f"{label} {text}\n")

    with open(test_file, 'w', encoding="utf-8") as test_f:
        for label, text in zip(test_labels, test_texts):
            test_f.write(f"{label} {text}\n")

    model = fasttext.train_supervised(input=train_file, epoch=20, lr=0.4, wordNgrams=4, minCount=2, ws=7)

    # Overall metrics for this fold; elements 1 and 2 are reported as precision and recall.
    result = model.test(test_file)
    print(f"Precision and recall in fold {i}: {result[1]} - {result[2]}")

    precisions.append(result[1])
    recalls.append(result[2])

    # Per-label metrics for this fold, keyed by label.
    result2 = model.test_label(test_file)

    for label, metrics in result2.items():
        print(f'Label: {label} - Metrics: {metrics}')
        p.append(metrics['precision'])
        r.append(metrics['recall'])
        precision_dict[label].append(metrics['precision'])
        recall_dict[label].append(metrics['recall'])

In [None]:
# Average every label's precision and recall over the cross-validation folds.
#
# The iteration variables are named `fold_precisions` / `fold_recalls` on purpose: looping
# with `precisions` / `recalls` would rebind the notebook-level lists of per-fold overall
# metrics to the last label's values and corrupt any overall average computed afterwards.
avg_precision_per_label = {
    label: sum(fold_precisions) / len(fold_precisions)
    for label, fold_precisions in precision_dict.items()
}

avg_recall_per_label = {
    label: sum(fold_recalls) / len(fold_recalls)
    for label, fold_recalls in recall_dict.items()
}

for label, avg_precision in avg_precision_per_label.items():
    print(f'Average Precision for {label}: {avg_precision}')

for label, avg_recall in avg_recall_per_label.items():
    print(f'Average Recall for {label}: {avg_recall}')

In [None]:
# Display the flat list of per-label precision values collected during cross-validation.
p

In [None]:
# Unweighted mean over every per-label precision / recall value gathered across the folds.
mean_p = np.mean(p)
mean_r = np.mean(r)

print(f"Average precision: {mean_p}")
print(f"Average recall: {mean_r}")

In [None]:
# Mean of the values currently held in `precisions` and `recalls`.
mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)

print(f"Average precision: {mean_precision}")
print(f"Average recall: {mean_recall}")

In [29]:
# Predict a label for every record of the held-out test split and keep it next to the
# true label.
#
# The result lists are rebuilt here so that re-running this cell does not append a second
# copy of every prediction (which would double the reported counts).
#
# NOTE(review): `model` is whatever the cross-validation loop trained last, i.e. a model
# fit on a single fold's training portion rather than on the full training split —
# confirm this is intended.
real_labels = []
predicted_labels = []
with open('cyber.test', 'r') as file:
    for line in file:
        # Each record is "<label> <text>": split on the first space only.
        label_test, text_test = line.strip().split(' ', 1)
        real_labels.append(label_test)
        # Apply the same preprocessing used for the training texts before predicting.
        prediction = model.predict(preprocess_text(text_test))
        # The first element of the first component is taken as the predicted label.
        predicted_labels.append(prediction[0][0])

In [None]:
# Both counts should match: one prediction per test record.
n_real, n_pred = len(real_labels), len(predicted_labels)
print('Real labels: ' + str(n_real))
print('Predicted labels: ' + str(n_pred))

In [None]:
# Test-set metrics from scikit-learn; precision, recall and F1 use average='weighted'.
y_true, y_pred = real_labels, predicted_labels

accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    metric(y_true, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

In [None]:
# Display the confusion matrix of true vs. predicted labels on the test split.
confusion_matrix(real_labels, predicted_labels)

In [33]:
# Flatten the 2x2 confusion matrix into its four counts (this unpacking only works for a
# binary problem). scikit-learn lays a binary matrix out as [[tn, fp], [fn, tp]], so
# ravel() yields tn, fp, fn, tp — unpacking it as "tp, fp, fn, tn" swaps the true
# positives with the true negatives.
# NOTE(review): the "positive" class is the second label in sorted order — confirm that
# this is the class of interest.
tn, fp, fn, tp = confusion_matrix(real_labels, predicted_labels).ravel()

In [None]:
# Precision from the raw counts: tp over everything counted as tp or fp.
predicted_positive = tp + fp
precision = tp / predicted_positive
print(precision)

In [None]:
# Recall from the raw counts: tp over everything counted as tp or fn.
actual_positive = tp + fn
recall = tp / actual_positive
print(recall)

In [None]:
# Accuracy from the raw counts: (tp + tn) over the sum of all four counts.
correct = tp + tn
total = tp + tn + fp + fn
accuracy = correct / total
print(accuracy)

In [None]:
# F1 as the harmonic mean of the precision and recall computed from the raw counts.
# The result is stored in `f1_binary` rather than `f1_score`: assigning to `f1_score` would
# overwrite the function imported from sklearn.metrics, so re-running the cell that calls
# f1_score(...) would fail with "'float' object is not callable".
f1_binary = 2 * (precision * recall) / (precision + recall)
print(f1_binary)

In [38]:
# Reset every accumulator list to a fresh, independent empty list.
labels, texts = [], []
precisions, recalls = [], []
predicted_labels, real_labels = [], []
p, r = [], []