# Import Semua Library Yang Dibutuhkan

In [None]:
import pandas as pd 
import numpy as np
import sys
import codecs
import nltk
import re
import math
import string
from collections import Counter
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from skimage import io
from skimage import feature

import matplotlib.pyplot as plt

# Membaca Dataset Yang Sudah Dilakukan Text Processing

In [None]:
# Load the CSV of tweets that already went through text preprocessing.
DATASET_PATH = '../input/covid19-tweet-indonesia-positif-dan-negatif/tweet_covid_19_text_processing_new.csv'
dataset = pd.read_csv(DATASET_PATH)

# Preview the first rows to check the load.
dataset.head()

# Menghapus Beberapa Kolom Yang Tidak Dibutuhkan

In [None]:
# Remove the columns that are not needed for the rest of the notebook.
# Deleting one by one (in this order) mutates `dataset` in place and raises
# KeyError at the first column that is missing.
for column in (
    'Unnamed: 0',
    'Tweet',
    'text_remove_hashtag_and_mentions',
    'text_remove_url',
    'text_remove_punc',
    'text_remove_emojis',
    'text_remove_emoticons',
    'cleansing_tweets',
    'case_folding_tweets',
    'tweet_tokens',
    'tweet_normalized',
):
    del dataset[column]

PUNCT_TO_REMOVE = string.punctuation
# Build the deletion table once; rebuilding it inside the function would
# repeat the same work for every row the function is applied to.
_PUNCT_TRANSLATION = str.maketrans('', '', PUNCT_TO_REMOVE)


def remove_punctuation(text: str) -> str:
    """Return *text* with every character in ``PUNCT_TO_REMOVE`` deleted.

    Characters are removed, not replaced, so tokens joined only by
    punctuation are fused (``"covid-19"`` becomes ``"covid19"``).
    """
    return text.translate(_PUNCT_TRANSLATION)

# Strip punctuation from the stemmed tweets and store the result in a new
# "tweet_final" column.
stemmed_tweets = dataset["tweet_tokens_stemmed"]
dataset["tweet_final"] = stemmed_tweets.apply(remove_punctuation)
dataset.head(6)


# Menampilkan Hasil Eksekusi Sebelumnya (Kolom tweet_final)

In [None]:
# Show up to the first 1000 rows of the dataset.
dataset.head(1000)

# Melihat Seberapa Banyak Nilai Positif dan Negatif Tweet Pada Dataset

In [None]:
# Join all labels into one string, split it on whitespace, and count how often
# each resulting token occurs (most frequent first, capped at 10000 entries).
label_tokens = ' '.join(dataset['Label']).split()
label_counts = pd.Series(label_tokens).value_counts()
freq = label_counts.iloc[:10000]
freq

# Fungsi Unigram, Bigram dan Trigram
1. Unigram
2. Bigram
3. Trigram

In [None]:
# Unigram: count how often each single token occurs across all tweets.

# NOTE(review): the stopwords corpus is downloaded and imported here but not
# referenced in this cell; kept in case other cells rely on it.
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer


# A token is a maximal run of ASCII letters/digits; anything else separates.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(ngram_range=(1, 1), tokenizer=token.tokenize)
text_count = cv.fit_transform(dataset['tweet_final'])

# Sum every vocabulary column in one sparse operation instead of adding the
# rows together one at a time with the builtin sum().
frequencies = np.asarray(text_count.sum(axis=0)).ravel()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

df = pd.DataFrame(frequencies, index=feature_names, columns=['frequency'])

# Most frequent tokens first.
df.sort_values(by='frequency', ascending=False)

In [None]:
# #Bigram

# nltk.download('stopwords')
# from nltk.corpus import stopwords
# from sklearn.feature_extraction.text import CountVectorizer
# from nltk.tokenize import RegexpTokenizer


# token= RegexpTokenizer(r'[a-zA-Z0-9]+')
# cv = CountVectorizer(ngram_range=(2,2), tokenizer = token.tokenize)
# text_count = cv.fit_transform(dataset['tweet_final'])
# frequencies = sum(text_count).toarray()[0]
# df = pd.DataFrame(frequencies, index=cv.get_feature_names(), columns=['frequency'])
# df = df.reindex(sorted(df.columns), axis=1)

# df.sort_values(by='frequency', ascending=False)

In [None]:
# #Trigram

# nltk.download('stopwords')
# from nltk.corpus import stopwords
# from sklearn.feature_extraction.text import CountVectorizer
# from nltk.tokenize import RegexpTokenizer


# token= RegexpTokenizer(r'[a-zA-Z0-9]+')
# cv = CountVectorizer(ngram_range=(3,3), tokenizer = token.tokenize)
# text_count = cv.fit_transform(dataset['tweet_final'])
# frequencies = sum(text_count).toarray()[0]
# df = pd.DataFrame(frequencies, index=cv.get_feature_names(), columns=['frequency'])
# df = df.reindex(sorted(df.columns), axis=1)

# df.sort_values(by='frequency', ascending=False)

# Membuat Data Training dan Data Testing (Validation Data)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, x_test, Y_train, y_test = train_test_split(
    text_count,
    dataset['Label'],
    test_size=0.3,
    random_state=5,
)

# Print the shape of each part as a quick sanity check.
print(*(part.shape for part in (X_train, x_test, Y_train, y_test)))

# Penerapan Dengan Metode Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Fit a multinomial Naive Bayes model on the training counts and predict the
# held-out test rows.
MNB = MultinomialNB()
MNB = MNB.fit(X_train, Y_train)
MNB_prediction = MNB.predict(x_test)

print('Akurasi = ', accuracy_score(y_test, MNB_prediction))
print(classification_report(y_test, MNB_prediction))

# scikit-learn's confusion_matrix puts the TRUE labels on the rows and the
# PREDICTED labels on the columns. Take the class order from the fitted model
# and pass it explicitly so the axis titles always match the matrix layout,
# instead of assuming a fixed "positive first" 2x2 ordering.
class_labels = MNB.classes_
cm_matrix = pd.DataFrame(
    data=confusion_matrix(y_test, MNB_prediction, labels=class_labels),
    index=[f'Actual {label}' for label in class_labels],
    columns=[f'Predicted {label}' for label in class_labels],
)
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu');

# Penerapan Cross Validation Dengan Metode Naive Bayes Classifier

In [None]:
from sklearn.model_selection import cross_validate

# Shuffled 5-fold split with a fixed seed so every metric sees the same folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

# Evaluate all four metrics in a single cross-validation pass. Separate
# cross_val_score calls would refit the model on the same folds once per
# metric; cross_validate fits once per fold and scores each metric on it.
cv_scores = cross_validate(
    MNB,
    X_train,
    Y_train,
    cv=kfold,
    scoring={
        'accuracy': 'accuracy',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
        'f1': 'f1_weighted',
    },
)

# One score per fold for each metric (arrays of length n_splits).
accuracy = cv_scores['test_accuracy']
precision = cv_scores['test_precision']
recall = cv_scores['test_recall']
f1 = cv_scores['test_f1']

print('accuracy',  accuracy.mean())
print('precision' , precision.mean())
print('recall' ,recall.mean())
print('F1-Measure' , f1.mean())

# Visualisasi Box Plot

In [None]:
# Box plot of the per-fold cross-validation scores, one box per metric.

metric_names = ['accuracy', 'precision', 'recall', 'f1_score']
all_data = [accuracy, precision, recall, f1]

# Outliers are drawn as red squares.
red_square = {'markerfacecolor': 'r', 'marker': 's'}

# Boxes are drawn at x = 1..len(all_data).
tick_positions = list(range(1, len(all_data) + 1))

fig1, ax1 = plt.subplots(figsize=(10, 5))
ax1.set_title('performance - boxplot')
ax1.boxplot(all_data, notch=False, flierprops=red_square)

# Horizontal grid lines and axis titles.
ax1.yaxis.grid(True)
ax1.set_xticks(tick_positions)
ax1.set_xlabel('performa')
ax1.set_ylabel('score')

# Name each box after its metric.
plt.setp(ax1, xticks=tick_positions, xticklabels=metric_names)
plt.show()

# Visualisasi Cross Validation dan Performa

In [None]:
# Line plot of every metric across the cross-validation folds.
plt.figure(figsize=(10, 7))

# Derive the fold labels from the number of scores so the plot keeps working
# when the number of folds changes.
xx = [f"cv{fold}" for fold in range(1, len(accuracy) + 1)]

# Plot order must match the legend order below.
for fold_scores in (accuracy, precision, recall, f1):
    plt.plot(xx, fold_scores, '--')

plt.title("comparison of each crossvalidation - NBC")
plt.xlabel("Crossvalidation")
plt.ylabel("score")
plt.legend(["accuracy", "precision", "recall", "f1-score"])
plt.grid()
plt.show()