# Preparation

## Libraries

In [1]:
# import all the necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import re
import time
import datetime
import random

# import the necessary libraries for the machine learning models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# import the necessary libraries for the deep learning models using torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable

# Run on the GPU when torch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Load Model

In [16]:
# Load the checkpoint a single time and share it between the raw model objects
# and the high-level pipeline. Passing the checkpoint name to both `pipeline`
# and `from_pretrained` would instantiate the XL model twice.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

MODEL_NAME = "indonlp/cendol-mt5-xl-inst"

# Load model directly
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Use a pipeline as a high-level helper, built on the objects loaded above.
# `device=model.device` keeps the shared model on the device it already lives
# on, so code that calls `model` directly keeps working unchanged.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=model.device,
)

In [None]:

# Define class labels
class_labels = [
    "non_cyberbullying",
    "cyberbullying"
]


def _normalize_label(raw_label):
    """Reduce generated text to the spelling convention used in ``class_labels``.

    Trims surrounding whitespace, quotes and punctuation, lower-cases the text
    and collapses runs of spaces or hyphens into a single underscore.
    """
    trimmed = raw_label.strip().strip("\"'`.,:;!?").strip().lower()
    return re.sub(r"[\s\-]+", "_", trimmed)


# Function to classify text
def classify_text_with_prompt(text):
    """Classify one comment by prompting the seq2seq model with the label set.

    Parameters
    ----------
    text : str
        The comment to classify. ``None`` and NaN (e.g. an empty spreadsheet
        cell) are treated as missing and are not sent to the model.

    Returns
    -------
    tuple
        ``(class_index, label)`` where ``class_index`` is the position of the
        label in ``class_labels``; ``(None, "Unknown class")`` when the input
        is missing or the generated text does not match any label.
    """
    # A missing comment would otherwise be classified as the literal "None"/"nan"
    if text is None or (isinstance(text, float) and text != text):
        return None, "Unknown class"

    # Create a prompt with all possible labels
    prompt = f"anda seorang ahli bahasa, bayangkan anda dapat membedakan jenis-jenis teks, klasifikasikan teks berikut dari kategori berikut: {', '.join(class_labels)}.\nText: {text}\nLabel:"

    # Tokenize the prompt and place the tensors on the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate prediction; unpacking forwards the attention mask with the ids
    outputs = model.generate(**inputs, max_length=10, num_return_sequences=1)

    # Decode the model output and canonicalise it before comparing, so that
    # differences in case, punctuation or separator do not cause a miss
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted_label = _normalize_label(generated)

    # Whole-label comparison only: "cyberbullying" is a substring of
    # "non_cyberbullying", so substring matching would confuse the two classes
    if predicted_label in class_labels:
        return class_labels.index(predicted_label), predicted_label

    return None, "Unknown class"

## DataFrame Preparation

In [33]:
# Read the annotated comment workbook into `df` and show it as the cell output
df = pd.read_excel(io='cyberbullyingdata.xlsx')
df

Unnamed: 0,No.,Nama Instagram,Komentar,Kategori,Tanggal Posting,Nama Akun IG Artis/Selebgram,Unnamed: 6,Unnamed: 7
0,1,@delliananda,"""Kaka tidur yaa, udah pagi, gaboleh capek2""",Non-bullying,14 Oktober 2019,@isyanasarasvati,,
1,2,@fenninbl,"""makan nasi padang aja begini badannya""",Non-bullying,14 Oktober 2019,@isyanasarasvati,,
2,3,@abdurahmanshq,"""yang aku suka dari dia adalah selalu cukur je...",Bullying,14 Oktober 2019,@isyanasarasvati,,
3,4,@najla.yoo,"""Hai kak Isyana aku ngefans banget sama kak Is...",Non-bullying,14 Oktober 2019,@isyanasarasvati,,
4,5,@dessy_______,"""Manusia apa bidadari sih herann deh cantik te...",Non-bullying,14 Oktober 2019,@isyanasarasvati,,
...,...,...,...,...,...,...,...,...
645,646,@_sigesrek,"""aku memutuskan untuk menjadi fans isyana. gil...",Non-bullying,13 Februari 2021,@isyanasarasvati,,
646,647,@safronlux.id,"""AMZING ISYANAA!! Jujur aku amazed banget deng...",Non-bullying,13 Februari 2021,@isyanasarasvati,,
647,648,@rikzikmuktyana,"""paling ngiri liat orang keren maen alat musik...",Non-bullying,13 Februari 2021,@isyanasarasvati,,
648,649,@antoniusbennys,"""Sampe ga bisa berkata2 lagi buat isyana, sang...",Non-bullying,13 Februari 2021,@isyanasarasvati,,


In [None]:
# Apply classify_text_with_prompt to the Komentar column, using tqdm's pandas
# integration to show a progress bar
from tqdm import tqdm
tqdm.pandas()

# Each result is a (class_index, label) pair. Building the two columns from the
# result series (instead of unpacking zip(*...)) also works when `df` has no
# rows, where the unpacking form raises ValueError.
predictions = df['Komentar'].progress_apply(classify_text_with_prompt)
df['label_index'] = [class_index for class_index, _ in predictions]
df['predicted_label'] = [label for _, label in predictions]

100%|██████████| 650/650 [31:24<00:00,  2.90s/it]


In [35]:
# Show how many rows fall under each predicted label, then under each
# annotated category
for column_name in ('predicted_label', 'Kategori'):
    print(df[column_name].value_counts())

predicted_label
non_cyberbullying    619
Unknown class         30
cyberbullying          1
Name: count, dtype: int64
Kategori
Non-bullying    325
Bullying        325
Name: count, dtype: int64
