<h4>Langkah 1: Instalasi Library</h4>

In [None]:
!pip install google-api-python-client google-auth google-auth-oauthlib google-auth-httplib2
!pip install sastrawi textblob wordcloud nltk scikit-learn


<h4>Langkah 2: Import Library</h4>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string
import re
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import ConfusionMatrixDisplay
from textblob import TextBlob
from sklearn.metrics import confusion_matrix, accuracy_score

# Tokenizer data for word_tokenize. Recent NLTK releases load the Punkt model from
# the 'punkt_tab' resource while older ones use 'punkt', so fetch both; a resource
# that is not needed by the installed version is simply left unused.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists (the Indonesian list is used during preprocessing).
nltk.download('stopwords')


<h4>Langkah 3: Mengambil Komentar dari YouTube</h4>

In [None]:
from googleapiclient.discovery import build

from getpass import getpass

# Read the key with getpass instead of input() so it is not echoed into the cell
# output, which would otherwise be saved (and shared) together with the notebook.
api_key = getpass("API KEY: ").strip()
youtube = build("youtube", "v3", developerKey=api_key)
# Strip stray whitespace that tends to come along when a URL is pasted.
url = input("VIDEO URL: ").strip()

def _extract_video_id(url):
    """Return the video ID contained in a YouTube URL.

    Handles ``watch?v=<id>`` links regardless of query-parameter order, as well
    as ``youtu.be/<id>``, ``/shorts/<id>``, ``/embed/<id>`` and ``/live/<id>``.

    Raises:
        ValueError: if no video ID can be located in ``url``.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url.strip())

    # Standard watch links carry the ID in the "v" query parameter.
    ids_in_query = parse_qs(parsed.query).get("v")
    if ids_in_query:
        return ids_in_query[0]

    # Short/embedded links carry the ID as the last path segment.
    segments = [segment for segment in parsed.path.split("/") if segment]
    is_short_host = parsed.netloc.lower().endswith("youtu.be")
    has_id_prefix = len(segments) >= 2 and segments[-2] in (
        "shorts", "embed", "live", "youtu.be"
    )
    if segments and (is_short_host or has_id_prefix):
        return segments[-1]

    raise ValueError(f"Could not find a video ID in URL: {url!r}")


def get_comments(url):
    """Fetch all top-level comments of a YouTube video and save them to ``dataset.csv``.

    Uses the module-level ``youtube`` API client. Every page of the
    ``commentThreads`` listing is requested (50 threads per page); for each
    top-level comment the publish date, author name, author channel ID, like
    count and original text are collected. The first ten comment texts are
    printed as a quick sanity check and all rows are written to ``dataset.csv``.

    Args:
        url: URL of the YouTube video whose comments should be collected.

    Raises:
        ValueError: if no video ID can be parsed from ``url`` or the API
            returns no video for that ID.
    """
    single_video_id = _extract_video_id(url)

    # Look the video up first so a wrong ID fails with a clear message.
    video_list = youtube.videos().list(part="snippet", id=single_video_id).execute()
    if not video_list.get("items"):
        raise ValueError(f"No video found for ID {single_video_id!r}")

    nextPageToken_comments = None
    commentsone = []

    while True:
        # Only the "snippet" part is read below, so replies are not requested.
        pl_response_comment = youtube.commentThreads().list(
            part="snippet",
            videoId=single_video_id,
            maxResults=50,
            pageToken=nextPageToken_comments
        ).execute()

        for thread in pl_response_comment.get("items", []):
            top_comment = thread["snippet"]["topLevelComment"]["snippet"]
            commentsone.append({
                "comm_date": top_comment["publishedAt"],
                "author": top_comment["authorDisplayName"],
                # The author channel ID can be absent from the response; store
                # None instead of failing the whole scrape with a KeyError.
                "author_id": top_comment.get("authorChannelId", {}).get("value"),
                "likes": top_comment["likeCount"],
                "comment": top_comment["textOriginal"],
                "video_id": single_video_id
            })

        nextPageToken_comments = pl_response_comment.get("nextPageToken")
        if not nextPageToken_comments:
            break

    for row in commentsone[:10]:
        print(row["comment"])

    # Explicit columns keep the CSV header intact even when no comment was
    # returned; index=False avoids a spurious "Unnamed: 0" column on re-read.
    columns = ["comm_date", "author", "author_id", "likes", "comment", "video_id"]
    pd.DataFrame(commentsone, columns=columns).to_csv("dataset.csv", index=False)

# Scrape the comments of the video URL entered above; the result is written to dataset.csv.
get_comments(url)


<h4>Langkah 4: Menampilkan Hasil Scraping
</h4>

Menampilkan hasil scraping dari file dataset.csv.

In [ ]:
df = pd.read_csv('dataset.csv')
# A notebook cell only renders its last expression, so the preview has to be
# displayed explicitly or it is silently discarded.
display(df.head(500))
# Number of non-null values per column.
df.count()


<h4>Langkah 5: Membersihkan Data
</h4>

Menghapus kolom yang tidak diperlukan.

In [ ]:
data = pd.read_csv("dataset.csv")
# Only the comment text survives this step, so a row is discarded only when the
# comment itself is missing - not because of a gap in a column dropped below.
data = data.dropna(subset=["comment"])
print(data.head())

# Remove the metadata columns that are not used for sentiment analysis.
data_nw = data.drop(['comm_date', "author", 'author_id', "likes", 'video_id'], axis=1)
# index=False keeps the row index from being saved as an extra unnamed column.
data_nw.to_csv("dataset_drop.csv", index=False)


<h4>Langkah 6: Membuka Dataset yang Telah Dibersihkan</h4>

In [ ]:
# Load dataset_drop.csv into data_baru and preview its first rows.
data_baru = pd.read_csv("dataset_drop.csv")
data_baru.head()


<h4>Langkah 7: Preprocessing Data</h4>

In [ ]:
def caseFolding(comment):
    """Normalise a raw comment to lowercase ASCII letters, digits and single spaces.

    Args:
        comment: The comment text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as an empty comment.

    Returns:
        The lowercased comment with every character other than ``a-z``,
        ``0-9`` and space removed, whitespace runs collapsed to one space and
        leading/trailing spaces stripped.
    """
    if not isinstance(comment, str):
        return ""
    comment = comment.lower()
    # Turn newlines/tabs into spaces first; deleting them outright would glue
    # the last word of one line to the first word of the next.
    comment = re.sub(r'\s+', ' ', comment)
    # Drop punctuation, emoji and any other non-alphanumeric character.
    comment = re.sub(r'[^a-z0-9 ]', '', comment)
    # Removing characters can leave double or trailing spaces behind.
    return re.sub(r' +', ' ', comment).strip()

data_baru['comment'] = data_baru['comment'].apply(caseFolding)
# Comments made up only of emoji/punctuation are empty after cleaning. An empty
# cell is read back from CSV as NaN, so remove those rows before saving.
data_baru = data_baru[data_baru['comment'].str.strip() != ""].copy()
# index=False keeps the row index from being saved as an extra unnamed column.
data_baru.to_csv("dataset_bersih.csv", index=False)


<h4>Langkah 8: Memberi Label Sentimen</h4>

Secara manual, tambahkan kolom 'sentimen' pada dataset_bersih.csv dengan nilai 'positif', 'negatif', atau 'netral'.

<h4>Langkah 9: Klasifikasi Sentimen Menggunakan KNN</h4>

In [ ]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

nltk.download('stopwords')
# nltk.word_tokenize is used during preprocessing and needs the Punkt tokenizer
# data ('punkt_tab' on recent NLTK releases, 'punkt' on older ones).
nltk.download('punkt')
nltk.download('punkt_tab')

data = pd.read_csv('dataset_bersih.csv')
if 'sentimen' not in data.columns:
    raise KeyError(
        "dataset_bersih.csv has no 'sentimen' column; add the sentiment labels first"
    )
# Empty comments are read back from CSV as NaN and unlabelled rows cannot be used
# for training, so keep only rows that have both a comment and a label.
data = data.dropna(subset=['comment', 'sentimen'])
X = data['comment']
y = data['sentimen']

# Stop-word set and stemmer are built once on first use and then reused.
_PREPROCESSING_RESOURCES = {}


def _indonesian_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _PREPROCESSING_RESOURCES:
        from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

        stop_words = set(stopwords.words('indonesian'))
        stemmer = StemmerFactory().create_stemmer()
        _PREPROCESSING_RESOURCES.update(stop_words=stop_words, stemmer=stemmer)
    return _PREPROCESSING_RESOURCES["stop_words"], _PREPROCESSING_RESOURCES["stemmer"]


def preprocessing(text):
    """Clean, tokenize, stop-word-filter and stem one Indonesian comment.

    Args:
        text: The comment text. Non-string values (such as the NaN pandas uses
            for a missing cell) are treated as an empty comment.

    Returns:
        The stemmed tokens joined by single spaces, or an empty string when
        nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'\d+', '', text)      # drop digits
    text = re.sub(r'[^\w\s]', '', text)  # drop punctuation/symbols
    text = text.strip()
    if not text:
        return ''

    # The stop words are Indonesian, so the stemmer must be too: the Sastrawi
    # stemmer replaces the English-only Porter stemmer used previously.
    stop_words, stemmer = _indonesian_resources()
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return ' '.join(stemmed_tokens)

# Replace every comment in X with its preprocessed form.
X = X.apply(preprocessing)

In [ ]:

# Convert the preprocessed comments into a TF-IDF feature matrix.
# NOTE(review): the vectorizer is fitted on all comments before the train/test
# split in the next cell, so its vocabulary and IDF weights also see the test rows.
# NOTE(review): X is overwritten with the feature matrix, so this cell cannot be
# re-run on its own without first re-running the cell that builds the text in X.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [ ]:

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [ ]:

# Fit a k-nearest-neighbours classifier (k = 5) on the training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)


In [ ]:

# Predict a sentiment label for every row of the held-out test split.
y_pred = knn.predict(X_test)


In [ ]:

# Score the test-set predictions: overall accuracy, then precision, recall and F1
# averaged over the classes weighted by their support, then the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

# Print every metric next to its label, in the order they were computed.
for _metric_label, _metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
    ('Confusion Matrix:\n', cm),
):
    print(_metric_label, _metric_value)
