# Import Package

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

sw_indo = stopwords.words("indonesian") + list(punctuation)

# Import Data

In [2]:
df = pd.read_csv("Dataset/dataset_komentar_instagram_cyberbullying.csv", index_col="Id")
df.head()

Unnamed: 0_level_0,Sentiment,Instagram Comment Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,negative,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...
2,negative,Geblek lo tata...cowo bgt dibela2in balikan......
3,negative,Kmrn termewek2 skr lengket lg duhhh kok labil ...
4,negative,"Intinya kalau kesel dengan ATT nya, gausah ke ..."
5,negative,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


# Data Preparation

In [3]:
# Mengubah kalimat menjadi huruf kecil semua

def lower(text):
    result = text.lower()
    return result

In [4]:
df["Instagram Comment Text"] = df["Instagram Comment Text"].apply(lower)

In [5]:
df.head()

Unnamed: 0_level_0,Sentiment,Instagram Comment Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,negative,<username> tolol!! gak ada hubungan nya kegug...
2,negative,geblek lo tata...cowo bgt dibela2in balikan......
3,negative,kmrn termewek2 skr lengket lg duhhh kok labil ...
4,negative,"intinya kalau kesel dengan att nya, gausah ke ..."
5,negative,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


In [6]:
df.count()

Sentiment                 400
Instagram Comment Text    400
dtype: int64

In [7]:
# Mengubah nilai sentiment menjadi angka 0 dan 1

df["Sentiment"] = df["Sentiment"].replace("negative",0)
df["Sentiment"] = df["Sentiment"].replace("positive",1)

In [8]:
df.head()

Unnamed: 0_level_0,Sentiment,Instagram Comment Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,<username> tolol!! gak ada hubungan nya kegug...
2,0,geblek lo tata...cowo bgt dibela2in balikan......
3,0,kmrn termewek2 skr lengket lg duhhh kok labil ...
4,0,"intinya kalau kesel dengan att nya, gausah ke ..."
5,0,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


In [9]:
df.Sentiment.value_counts()

1    200
0    200
Name: Sentiment, dtype: int64

# Dataset Splitting

In [10]:
MODEL = []
TEST = []

def eval(model,test1):
    MODEL.append(model)
    TEST.append(round(test1,2))

In [11]:
X = df["Instagram Comment Text"]
y = df.Sentiment

# Membagi dataset menjadi dengan proporsi 80% data train dan 20% data test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Algorithm: Logistic Regresion

In [12]:
#Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo, ngram_range=(1,3))),
    ('algo',LogisticRegression())
])


#Hyperparameter Tuning
parameter = {
    "algo__fit_intercept" : [True,False],
    "algo__C" : range(1,5,1)
}

#Training
model_logistic = GridSearchCV(estimator=pipeline, param_grid=parameter, cv=5, n_jobs=-1, verbose=1)
model_logistic.fit(X_train,y_train)
model_logistic.score(X_train,y_train), model_logistic.best_score_, model_logistic.score(X_test,y_test)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   13.1s finished
  'stop_words.' % sorted(inconsistent))


(1.0, 0.871875, 0.9125)

### Evaluation: Logistic Regresion

In [13]:
y_pred = model_logistic.predict(X_test)

In [14]:
print(classification_report(y_test, y_pred))
eval("Logistic Regression",model_logistic.score(X_test,y_test))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92        44
           1       0.87      0.94      0.91        36

    accuracy                           0.91        80
   macro avg       0.91      0.92      0.91        80
weighted avg       0.92      0.91      0.91        80



In [15]:
X_predict = [
    ["kamu sangat jelek sampe - sampe mau muntah ngeliatnya!!!"], #negative Comment
    ["Kamu hari ini terlihat cantik banget"] #positive Comment
]

In [16]:
model_logistic.predict(X_predict[0]) # Predict Negative Comment

array([0], dtype=int64)

In [17]:
model_logistic.predict(X_predict[1]) # Predict positive Comment

array([1], dtype=int64)

# Algorithm: KNN

In [18]:
#Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo, ngram_range=(1,3))),
    ('algo',KNeighborsClassifier())
])

#Hyperparameter Tunning
parameter = {
    "algo__n_neighbors" : range(1,51,1),
    "algo__weights" : ["distance","uniform"],
    "algo__p" : [1,2]
}

#Training
model_knn = GridSearchCV(estimator=pipeline, param_grid=parameter, cv=3, n_jobs=-1, verbose=1)
model_knn.fit(X_train,y_train)
model_knn.score(X_train,y_train), model_knn.best_score_, model_knn.score(X_test,y_test)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   34.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  1.7min finished
  'stop_words.' % sorted(inconsistent))


(1.0, 0.8500264503614883, 0.9125)

### Evaluation: KNN

In [19]:
y_pred = model_knn.predict(X_test)

In [20]:
print(classification_report(y_test, y_pred))
eval(" K-Nearest Neighbour (KNN)",model_knn.score(X_test,y_test))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92        44
           1       0.87      0.94      0.91        36

    accuracy                           0.91        80
   macro avg       0.91      0.92      0.91        80
weighted avg       0.92      0.91      0.91        80



In [21]:
X_predict = [
    ["kamu sangat jelek sampe - sampe mau muntah ngeliatnya!!!"], #negative Comment
    ["Kamu hari ini terlihat cantik banget"] #positive comment
]

In [22]:
model_knn.predict(X_predict[0]) # Predict Negative Comment

array([0], dtype=int64)

In [23]:
model_knn.predict(X_predict[1]) # Predict positive Comment

array([1], dtype=int64)

# Algorithm: SVM

In [None]:
#Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo, ngram_range=(1,3))),
    ('algo',SVC(max_iter=500))
])

#Hyperparameter Tunning
parameter = {
    "algo__kernel" : ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
    "algo__C" : [0.001, 0.01, 0.1, 1, 10],
    "algo__gamma" : ['scale',10, 5, 1, 0.1]
}

#Training
model_svm = GridSearchCV(estimator=pipeline, param_grid=parameter, cv=3, n_jobs=-1, verbose=1)
model_svm.fit(X_train,y_train)
model_svm.score(X_train,y_train), model_svm.best_score_, model_svm.score(X_test,y_test)

Fitting 3 folds for each of 125 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   29.7s


### Evaluation: SVM

In [None]:
y_pred = model_svm.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))
eval("Support Vector Machine (SVM)",model_svm.score(X_test,y_test))

In [None]:
X_predict = [
    ["kamu sangat jelek sampe - sampe mau muntah ngeliatnya!!!"], #negative Comment
    ["Kamu hari ini terlihat cantik banget"] #positive Comment
]

In [None]:
model_svm.predict(X_predict[0]) # Predict Negative Comment

In [None]:
model_svm.predict(X_predict[1]) # Predict positive Comment

# Kesimpulan 

In [None]:
print("\t\t\tTest Accuracy")
results2 = pd.DataFrame({ 'Model': MODEL,
                         'Test Accuracy': TEST})

results2.sort_values(by='Test Accuracy',ascending=False)

Akurasi yang dihasilkan dengan menggunakan Algoritma Logistic Regresion, KNN, SVM tidak jauh berbeda, namun akurasi yang terbesar adalah SVM dengan score 92%, selain akurasi nilai recal menggunakan algoritma SVM juga yang terbesar yaitu berada di angka 0: 91% dan 1: 94%. itu artinya keberhasilan model memprediksi data yang berlabel 0 (Negative) sebesar 91% dan data yang berlabel 1 (Positive) sebesar 94%, jadi bisa di katakan model yang terbaik untuk kasus ini adalah `Support Vector Machine (SVM)`