# Import Package

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# Tokens to drop during vectorization: NLTK's Indonesian stop words
# followed by every ASCII punctuation character as its own entry.
sw_indo = [*stopwords.words("indonesian"), *punctuation]

# Import Data

In [2]:
# Load the comment dataset; the CSV's "Id" column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer="Dataset/dataset_komentar_instagram_cyberbullying.csv",
    index_col="Id",
)
df.head()

Unnamed: 0_level_0,Sentiment,Instagram Comment Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,negative,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...
2,negative,Geblek lo tata...cowo bgt dibela2in balikan......
3,negative,Kmrn termewek2 skr lengket lg duhhh kok labil ...
4,negative,"Intinya kalau kesel dengan ATT nya, gausah ke ..."
5,negative,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


In [3]:
# Non-null value count per column; equal counts mean no column has missing entries.
df.count()

Sentiment                 400
Instagram Comment Text    400
dtype: int64

# Dataset Splitting

In [4]:
# Features are the raw comment strings; the target is the sentiment label.
X, y = df["Instagram Comment Text"], df["Sentiment"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Algorithm: Logistic Regression

In [5]:
# Pipeline: TF-IDF features (NLTK tokenizer, Indonesian stop words) -> logistic regression
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo)),
    ("algo", LogisticRegression()),
])

# Hyperparameter grid: 2 intercept settings x 4 values of C = 8 candidates
parameter = {
    "algo__fit_intercept": [True, False],
    "algo__C": [1, 2, 3, 4],
}

# Training: exhaustive 5-fold grid search using all available cores
model = GridSearchCV(pipeline, parameter, cv=5, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

# Displayed as (train score, best mean cross-validation score, test score)
model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   12.4s finished
  'stop_words.' % sorted(inconsistent))


(1.0, 0.8857142857142858, 0.8583333333333333)

### Prediction: Logistic Regression

In [6]:
# Two hand-written sample comments, each wrapped in its own one-element list
# so that X_predict[i] can be passed straight to model.predict().
X_predict = [
    # index 0: negative comment
    ["Dari pada loe beban keluarga, ga guna, ga usah hidup aja sekalian"],
    # index 1: positive comment
    ["Kamu hari ini terlihat cantik banget"],
]

In [7]:
model.predict(X_predict[0]) # Classify the sample written as a negative comment (index 0)

array(['negative'], dtype=object)

In [8]:
model.predict(X_predict[1]) # Classify the sample written as a positive comment (index 1)

array(['positive'], dtype=object)

# Algorithm: KNN

In [9]:
#Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo)),
    ('algo',KNeighborsClassifier())
])

#Hyperparameter Tunning
parameter = {
    "algo__n_neighbors" : range(1,51,1),
    "algo__weights" : ["distance","uniform"],
    "algo__p" : [1,2]
}

#Training
model = GridSearchCV(estimator=pipeline, param_grid=parameter, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train,y_train)
model.score(X_train,y_train), model.best_score_, model.score(X_test,y_test)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  1.4min finished
  'stop_words.' % sorted(inconsistent))


(1.0, 0.8750476626248761, 0.85)

### Prediction: KNN

In [10]:
# Two hand-written sample comments, each wrapped in its own one-element list
# so that X_predict[i] can be passed straight to model.predict().
X_predict = [
    # index 0: negative comment
    ["Dari pada loe beban keluarga, ga guna, ga usah hidup aja sekalian"],
    # index 1: positive comment
    ["Kamu hari ini terlihat cantik banget"],
]

In [11]:
model.predict(X_predict[0]) # Classify the sample written as a negative comment (index 0)

array(['negative'], dtype=object)

In [12]:
model.predict(X_predict[1]) # Classify the sample written as a positive comment (index 1)

array(['positive'], dtype=object)

# Algorithm: SVM

In [13]:
#Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo)),
    ('algo',SVC(max_iter=500))
])

#Hyperparameter Tunning
parameter = {
    "algo__kernel" : ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
    "algo__C" : [0.001, 0.01, 0.1, 1, 10],
    "algo__gamma" : ['scale',10, 5, 1, 0.1]
}

#Training
model = GridSearchCV(estimator=pipeline, param_grid=parameter, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train,y_train)
model.score(X_train,y_train), model.best_score_, model.score(X_test,y_test)

Fitting 3 folds for each of 125 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   34.8s
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:  1.1min finished
  'stop_words.' % sorted(inconsistent))


(1.0, 0.8749332723251735, 0.8583333333333333)

### Prediction: SVM

In [14]:
# Two hand-written sample comments, each wrapped in its own one-element list
# so that X_predict[i] can be passed straight to model.predict().
X_predict = [
    # index 0: negative comment
    ["Dari pada loe beban keluarga, ga guna, ga usah hidup aja sekalian"],
    # index 1: positive comment
    ["Kamu hari ini terlihat cantik banget"],
]

In [15]:
model.predict(X_predict[0]) # Classify the sample written as a negative comment (index 0)

array(['negative'], dtype=object)

In [16]:
model.predict(X_predict[1]) # Classify the sample written as a positive comment (index 1)

array(['positive'], dtype=object)

# Kesimpulan

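Akurasi yang dihasilkan oleh algoritma Logistic Regression, KNN, dan SVM tidak jauh berbeda; rata-rata skor pada data uji berada di sekitar 0.85. Dari segi proses training, Logistic Regression selesai lebih cepat (12.4 detik) dibandingkan KNN (1.4 menit) dan SVM (1.1 menit). Perlu dicatat bahwa jumlah fit pada grid search berbeda untuk tiap algoritma (40 fit untuk Logistic Regression, 600 untuk KNN, dan 375 untuk SVM), sehingga selisih waktu tersebut mencerminkan total waktu tuning, bukan kecepatan training satu model.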