In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import re
import pickle

In [None]:
data = pd.read_csv('/kaggle/input/tweeter-covid-surabaya/dataset_twitter-scraper_2023-12-05_16-03-34-228.csv')

In [None]:
data.head()

In [None]:
# Keep only the tweets whose `lang` field is 'in' (presumably Twitter's code
# for Indonesian — the later cells use Indonesian stopwords and lexicons).
# .copy() makes the filtered frame independent of the frame it was cut from,
# so adding columns to it later does not raise SettingWithCopyWarning.
data = data[data['lang'] == 'in'].copy()

In [None]:
data.head()

In [None]:
data.info()

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

nltk_stw_id = stopwords.words('indonesian')

In [None]:
!unzip -o /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
import re


def preprocess(textdata, stopwords_id, emojis):
    """Clean raw tweets into space-separated, stopword-free token strings.

    Each tweet is lower-cased, then URLs, emoticons, @mentions and digit runs
    are replaced by the placeholder tokens ``URL``, ``EMOJI<name>``, ``USER``
    and ``NUM``. Every other non-alphanumeric character is dropped and runs of
    three or more identical characters are shortened to two. Stopwords and
    one-character tokens are removed, and the remaining tokens are passed
    through ``WordNetLemmatizer``.

    Args:
        textdata: Iterable of tweets. Items that are not strings (for example
            None or NaN) are treated as empty text so the output stays aligned
            with the input.
        stopwords_id: Iterable of stopwords to remove.
        emojis: Mapping from emoticon text to its name. Matching is
            case-insensitive (tweets are lower-cased before the lookup) and a
            longer emoticon wins over a shorter one contained in it.

    Returns:
        list[str]: One cleaned string per input item, in input order. Every
        kept token is followed by a single space, so a non-empty result ends
        with a trailing space.
    """
    processedText = []

    # Create Lemmatizer
    wordLemm = WordNetLemmatizer()

    # Regex patterns, compiled once. Raw strings avoid the invalid "\s"
    # escape that a plain string literal would contain.
    urlPattern = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    userPattern = re.compile(r"@[^\s]+")
    alphaPattern = re.compile(r"[^a-zA-Z0-9\s]")
    sequencePattern = re.compile(r"(.)\1\1+")
    seqReplacePattern = r"\1\1"
    spacePattern = re.compile(r"\s+")
    digitPattern = re.compile(r"\d+")

    # Set lookup instead of scanning the stopword list for every token.
    stopword_set = set(stopwords_id)

    # The tweet is lower-cased before emoticons are looked up, so the keys are
    # lower-cased too; otherwise keys such as ':P' or ':-D' could never match.
    emoji_tokens = {}
    for emoji, name in emojis.items():
        if emoji:
            emoji_tokens.setdefault(emoji.lower(), "EMOJI" + name)

    # One alternation, longest emoticon first, so 'o:-)' is not consumed by
    # the shorter ':-)' it contains.
    emojiPattern = None
    if emoji_tokens:
        ordered = sorted(emoji_tokens, key=len, reverse=True)
        emojiPattern = re.compile("|".join(re.escape(e) for e in ordered))

    def _emoji_token(match):
        return emoji_tokens[match.group(0)]

    for tweet in textdata:
        if not isinstance(tweet, str):
            tweet = ""

        tweet = tweet.lower()
        tweet = spacePattern.sub(' ', tweet).strip()

        # Replace all URLs with 'URL'
        tweet = urlPattern.sub(' URL', tweet)
        # Replace all emoticons with 'EMOJI<name>'.
        if emojiPattern is not None:
            tweet = emojiPattern.sub(_emoji_token, tweet)
        # Replace @USERNAME with 'USER'.
        tweet = userPattern.sub(' USER', tweet)
        # Replace every non-alphanumeric character with a space. After this
        # step no markup characters are left, so no separate tag-stripping
        # pass is needed.
        tweet = alphaPattern.sub(' ', tweet)
        # Replace 3 or more consecutive identical characters by 2.
        tweet = sequencePattern.sub(seqReplacePattern, tweet)
        # Collapse runs of whitespace into a single space.
        tweet = spacePattern.sub(' ', tweet).strip()
        # Replace digit runs with 'NUM'.
        tweet = digitPattern.sub(' NUM ', tweet)

        # Drop stopwords and one-character tokens, then lemmatize the rest.
        kept = [
            wordLemm.lemmatize(word)
            for word in tweet.split()
            if word not in stopword_set and len(word) > 1
        ]
        processedText.append(''.join(word + ' ' for word in kept))

    return processedText

In [None]:
# Emoticon text -> emoticon name; passed to preprocess() as its `emojis`
# argument, where a matched emoticon is replaced by "EMOJI" + name.
# NOTE(review): preprocess() lower-cases each tweet before looking these keys
# up, so keys containing uppercase letters (e.g. ':P', ':-D', 'O:-)') can only
# match if that lookup is case-insensitive.
emojis_dict = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [None]:
data['processed_text'] = preprocess(data['full_text'], nltk_stw_id, emojis_dict)
print(data[['full_text', 'processed_text']].head(10))

In [None]:
from spacy.lang.id import Indonesian

# spaCy's Indonesian language object, used here to split text into tokens.
spacy_id = Indonesian()

# Tokenise every cleaned tweet and keep only the token strings.
tokenized_texts = [
    [token.text for token in spacy_id(text)]
    for text in data['processed_text']
]

# Add the token lists to the DataFrame as a new 'word_tokens' column.
data['word_tokens'] = tokenized_texts

# Show the first ten rows: tokens next to the text they came from.
preview = data.head(10)
for index, tokens, text in zip(preview.index,
                               preview['word_tokens'],
                               preview['processed_text']):
    print(f"Index: {index}")
    print(f"\nWord Tokens: {tokens}\n")
    print(f"Processed Text: {text}")
    print("\n" + "-"*50 + "\n")

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=5)
# X = tfidf_vectorizer.fit_transform(data['processed_text'])
# print(X.shape)
# print(X)

In [None]:
lexicon_negative = pd.read_csv('/kaggle/input/indo-sentiment-lexicon/negative.tsv', sep='\t')
lexicon_positive= pd.read_csv('/kaggle/input/indo-sentiment-lexicon/positive.tsv', sep='\t')

lexicon = pd.concat([lexicon_negative, lexicon_positive], ignore_index=True)

lexicon.head()

In [None]:
data.head()

In [None]:
data.info()

In [None]:
search_lex_word = lexicon['word'].values
search_lex_word[:100]

In [None]:
print("katanya" in search_lex_word)

In [None]:
word_tokens = data['word_tokens']

# Set membership is a hash lookup; testing against the NumPy array would
# rescan the whole lexicon for every single token.
lexicon_vocab = set(search_lex_word)

# Tweets with at least one lexicon word. any() stops at the first hit.
contain = sum(
    1 for words in word_tokens
    if any(word in lexicon_vocab for word in words)
)
# Tweets without any lexicon word.
notcontain = len(word_tokens) - contain

print(contain, notcontain)

In [None]:
# Work on an independent copy holding only the cleaned text and its tokens.
bow = data[['processed_text', 'word_tokens']].copy()
# NOTE(review): the result of reset_index() is only displayed here, not
# assigned, so `bow` itself still carries the index labels inherited from
# `data` after this cell.
bow.reset_index(drop=True)

In [None]:
bow.tail()

In [None]:
bow.loc[2206]

In [None]:

# Lexicon weight per word. When a word is listed more than once the first
# entry wins, which is what an `.iloc[0]` lookup on the lexicon would return.
lexicon_weight = (
    lexicon.drop_duplicates(subset='word')
    .set_index('word')['weight']
    .to_dict()
)

column_of = {}  # lexicon word -> column index, in order of first appearance
entries = []    # (row, column, weight) for every lexicon word found in a tweet

for row_idx, token in enumerate(bow["word_tokens"]):
    for word in token:
        if word not in lexicon_weight:
            continue
        col_idx = column_of.setdefault(word, len(column_of))
        entries.append((row_idx, col_idx, lexicon_weight[word]))

# Column labels and the matrix are allocated once, instead of growing the
# arrays with np.append / np.hstack every time a new word is seen.
col_bow = np.array(list(column_of))
bow_arr = np.zeros((len(bow["word_tokens"]), len(column_of)))

# A cell holds the word's weight (assigned, not accumulated), so a word that
# is repeated inside one tweet still contributes its weight only once.
for row_idx, col_idx, weight in entries:
    bow_arr[row_idx, col_idx] = weight

print(bow_arr)

In [None]:
bow_arr.shape

In [None]:
# Sum of the weights in each row, kept as an (n, 1) column so that it can be
# placed next to the matrix.
row_sums = bow_arr.sum(axis=1).reshape(-1, 1)

# Append the row sums as an extra last column.
bow_arr_with_sums = np.concatenate((bow_arr, row_sums), axis=1)

# Show the matrix including the new column.
print(bow_arr_with_sums)

In [None]:
bow_arr_with_sums.shape

In [None]:
# Label for the extra row-sum column that was stacked onto the matrix.
# NOTE(review): col_bow is rebound to an extended array, so running this cell
# a second time appends "sentiment" again and the labels no longer line up
# with the columns of bow_arr_with_sums.
col_bow = np.append(col_bow, "sentiment")

col_bow.shape

In [None]:
col_bow

In [None]:
bow_arr_df = pd.DataFrame(bow_arr_with_sums, columns=col_bow)
bow_arr_df.tail()

In [None]:
bow = bow.reset_index(drop=True)
bow

In [None]:
bow_arr_df = bow_arr_df.reset_index(drop=True)
bow_arr_df

In [None]:
bow.tail()

In [None]:
ccat_df = pd.concat([bow, bow_arr_df], axis=1)
ccat_df.tail()

In [None]:
ccat_df[ccat_df["sentiment"] > 0]

In [None]:
sentiment_df = ccat_df[['processed_text','word_tokens', 'sentiment']].copy()
sentiment_df

In [None]:
def label_sentiment(score):
    """Turn a numeric sentiment score into a class label.

    Returns 'positive' for scores above zero, 'negative' for scores below
    zero, and 'neutral' for everything else.
    """
    if score > 0:
        return 'positive'
    return 'negative' if score < 0 else 'neutral'
    
sentiment_df['sentiment_label'] = sentiment_df['sentiment'].apply(label_sentiment)

sentiment_df.tail()

In [None]:
# Tie each colour to a label. value_counts() orders by frequency, so passing
# a bare colour list would give 'red' to whichever class is most common.
sentiment_colors = {'negative': 'red', 'neutral': 'blue', 'positive': 'green'}

# Fixed label order; a label that never occurs is shown with a count of 0.
sentiment_counts = (
    sentiment_df['sentiment_label']
    .value_counts()
    .reindex(list(sentiment_colors), fill_value=0)
)

plt.figure(figsize=(8, 6))
sentiment_counts.plot(kind='bar', color=list(sentiment_colors.values()))
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Cleaned text of every tweet labelled negative.
data_neg = sentiment_df[sentiment_df["sentiment_label"] == "negative"]["processed_text"]

plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 1000 , width = 1600 , height = 800,
               collocations=False).generate(" ".join(data_neg))
plt.imshow(wc)
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.title('Word cloud - negative tweets')
plt.show()       # also keeps the AxesImage repr out of the cell output

In [None]:
# Cleaned text of every tweet labelled positive.
data_pos = sentiment_df[sentiment_df["sentiment_label"] == "positive"]["processed_text"]

plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 1000 , width = 1600 , height = 800,
               collocations=False).generate(" ".join(data_pos))
plt.imshow(wc)
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.title('Word cloud - positive tweets')
plt.show()       # also keeps the AxesImage repr out of the cell output

In [None]:
# Cleaned text of every tweet labelled neutral.
data_neu = sentiment_df[sentiment_df["sentiment_label"] == "neutral"]["processed_text"]

plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 1000 , width = 1600 , height = 800,
               collocations=False).generate(" ".join(data_neu))
plt.imshow(wc)
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.title('Word cloud - neutral tweets')
plt.show()       # also keeps the AxesImage repr out of the cell output

In [None]:
# Vector representation with VSM / TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

processedtext = sentiment_df["processed_text"]
sentiment = sentiment_df["sentiment_label"]

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
X = tfidf_vectorizer.fit_transform(processedtext)
print(X.shape)

In [None]:
from sklearn import cluster

k = 3      # number of clusters
seed = 99  # fixed seed so the clustering is repeatable

# n_init is given explicitly because its default is not the same in every
# scikit-learn release; pinning it keeps the number of restarts (and hence
# the result) independent of the installed version and avoids the
# FutureWarning some releases emit when it is left out.
km = cluster.KMeans(n_clusters=k, init='random', n_init=10, max_iter=300, random_state = seed)
km.fit(X)
'Done'

In [None]:
# The clustering result
C_km = km.predict(X)
C_km[:10]

In [None]:
# Same clustering with k-means++ seeding. n_init is given explicitly because
# its default is not the same in every scikit-learn release; pinning it keeps
# the number of restarts independent of the installed version.
kmPP = cluster.KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300, tol=0.0001, random_state = seed)
kmPP.fit(X)
C_kmpp = kmPP.predict(X)
C_kmpp[:10]

In [None]:
from sklearn.metrics import silhouette_score as siluet
C = [C_km, C_kmpp]

for res in C:
    print(siluet(X,res), end=', ')

In [None]:
from sklearn.metrics import normalized_mutual_info_score as NMI

for res in C:
    print(NMI(sentiment,res), end=', ')

In [None]:
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE

# def plot_tsne_pca(data, labels):
#     max_label = max(labels)
    
#     pca = PCA(n_components=2).fit_transform(data.todense())
#     tsne = TSNE().fit_transform(PCA(n_components=50).fit_transform(data[max_items,:].todense()))
    
    
#     idx = np.random.choice(range(pca.shape[0]), size=300, replace=False)
#     label_subset = labels[max_items]
#     label_subset = [cm.hsv(i/max_label) for i in label_subset[idx]]
    
#     f, ax = plt.subplots(1, 2, figsize=(14, 6))
    
#     ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset)
#     ax[0].set_title('PCA Cluster Plot')
    
#     ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset)
#     ax[1].set_title('TSNE Cluster Plot')

# plot_tsne_pca(X, C_kmpp)

In [None]:
label = sentiment
print(label)

print(len(label))
print(len(processedtext))
print(len(X.getnnz(1)))
print(X.shape)

In [None]:
# True for every document whose TF-IDF row has at least one non-zero entry.
# Computed once and reused for the labels, the matrix and the texts.
nonzero_mask = X.getnnz(1) > 0

Y = label[nonzero_mask]
X_clear = X[nonzero_mask]

# Drop the documents whose row is all zeros. The boolean mask selects by
# position, so this does not depend on the Series having a 0..n-1 index
# (processedtext[i] would be a label lookup).
docs_nonzero = processedtext[nonzero_mask].tolist()

print(X_clear.shape, len(Y), len(docs_nonzero))

In [None]:
# Split the data into two parts: training data and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_clear, Y, test_size=.3, random_state=8989)
print(X_train.shape, X_test.shape)

In [None]:
# kNN using the scikit-learn library
# http://scikit-learn.org/stable/modules/neighbors.html
from sklearn import neighbors

K = 3
weights = 'distance'
kNN = neighbors.KNeighborsClassifier(K, weights=weights)
kNN.fit(X_train, Y_train)
print('Done!')

In [None]:
# Predict with kNN
Y_knn = kNN.predict(X_test)

In [None]:
# Evaluate with accuracy
from sklearn.metrics import accuracy_score

accuracy_score(Y_test, Y_knn)

In [None]:
# Second train/test split, this time on the cleaned text itself rather than
# on a TF-IDF matrix; 5% of the rows are held out for testing.
# NOTE(review): this rebinds X_train / X_test, which earlier cells bound to
# the sparse matrices used by the kNN model, so the kNN cells cannot be
# re-run after this point without re-running their own split first.
X_train, X_test, y_train, y_test = train_test_split(processedtext, sentiment,
                                                    test_size = 0.05, random_state = 0)
print(f'Data Split done.')

In [None]:
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
# print('No. of feature_words: ', len(vectoriser.get_feature_names()))

In [None]:
# Replace the text splits with their TF-IDF matrices from the fitted
# vectoriser.
# NOTE(review): X_train / X_test are rebound in place, so this cell is not
# idempotent — running it again would feed the already transformed matrices
# back into transform(). Re-run the split cell before re-running this one.
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)
print(f'Data Transformed.')

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

def model_Evaluate(model):
    """Print a classification report and draw the confusion matrix of `model`.

    The model is evaluated on the notebook-level `X_test` / `y_test`.

    Args:
        model: Fitted classifier exposing `predict`.

    Returns:
        None. The report is printed and the heatmap is drawn on the current
        matplotlib figure.
    """
    # Predict values for Test dataset
    y_pred = model.predict(X_test)

    # Print the evaluation metrics for the dataset.
    print(classification_report(y_test, y_pred))

    # Row/column order of the matrix. It is passed to confusion_matrix()
    # explicitly: left to itself it sorts the labels (negative, neutral,
    # positive), which would not match the tick labels and annotations below.
    # Fixing the labels also keeps the matrix 3x3 when a class is missing
    # from the test split.
    categories = ['negative', 'positive', 'neutral']
    short_names = {'negative': 'Neg', 'positive': 'Pos', 'neutral': 'Neu'}

    # Compute and plot the Confusion matrix
    cf_matrix = confusion_matrix(y_test, y_pred, labels=categories)

    # Rows are actual classes and columns are predicted classes, so a cell is
    # "True <class>" on the diagonal and "False <predicted class>" elsewhere.
    group_names = [
        ('True ' if actual == predicted else 'False ') + short_names[predicted]
        for actual in categories
        for predicted in categories
    ]
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]

    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names, group_percentages)]
    labels = np.asarray(labels).reshape(len(categories), len(categories))

    sns.heatmap(cf_matrix, annot=labels, cmap='Blues', fmt='',
                xticklabels=categories, yticklabels=categories)

    plt.xlabel("Predicted values", fontdict={'size': 14}, labelpad=10)
    plt.ylabel("Actual values", fontdict={'size': 14}, labelpad=10)
    plt.title("Confusion Matrix", fontdict={'size': 18}, pad=20)


In [None]:
from sklearn.naive_bayes import BernoulliNB
BNBmodel = BernoulliNB(alpha = 2)
BNBmodel.fit(X_train, y_train)
model_Evaluate(BNBmodel)

In [None]:
import pickle

# Persist the fitted vectoriser. The context manager closes the file even if
# pickling raises.
with open('vectoriser-ngram-(1,2).pickle', 'wb') as file:
    pickle.dump(vectoriser, file)