# Analisis Sentimen Menggunakan RoBERTa+GRU-CNN

## Import Modul dan Install Package yang Diperlukan

In [None]:
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# GET the data
## Memuat Data
import pandas as pd
import io
import tensorflow as tf
from tensorflow import keras
from transformers import AutoTokenizer, TFAutoModel
import IPython
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense
from keras_tuner.tuners import BayesianOptimization

## Data Preparation

### Data Train

In [None]:
# Load the labelled training tweets and print the column / non-null summary.
TRAIN_CSV_PATH = "/content/train.csv"
dataset_train = pd.read_csv(TRAIN_CSV_PATH)
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219294 entries, 0 to 219293
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  219294 non-null  int64 
 1   tweets      219294 non-null  object
 2   labels      219294 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.0+ MB


In [None]:
# The CSV carries a saved index as 'Unnamed: 0'; it is not a feature, so drop it.
dataset_train = dataset_train.drop(columns=['Unnamed: 0'])

In [None]:
## Check for class imbalance: number of rows per label
dataset_train['labels'].value_counts()

bad        107796
good        56011
neutral     55487
Name: labels, dtype: int64

In [None]:
# Binary task: keep every row whose label is not 'neutral'.
is_polar = dataset_train['labels'] != 'neutral'
dataset_train = dataset_train[is_polar]

In [None]:
# Down-sample to 10,000 rows; the fixed random_state makes the draw repeatable.
dataset_train = dataset_train.sample(n=10000,random_state=7)

In [None]:
# sample() keeps the original row labels; replace them with a fresh 0..n-1 index.
dataset_train=dataset_train.reset_index(drop=True)

In [None]:
dataset_train['labels'].value_counts()

bad     6555
good    3445
Name: labels, dtype: int64

### Data Prediksi

In [None]:
# Load the unlabelled tweets to be classified and print the column / non-null summary.
PREDICTION_CSV_PATH = "/content/prediksi.csv"
dataset_prediksi = pd.read_csv(PREDICTION_CSV_PATH)
dataset_prediksi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500036 entries, 0 to 500035
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   date           500036 non-null  object 
 1   id             500030 non-null  object 
 2   content        500030 non-null  object 
 3   username       500002 non-null  object 
 4   like_count     499974 non-null  float64
 5   retweet_count  499974 non-null  float64
dtypes: float64(2), object(4)
memory usage: 22.9+ MB


In [None]:
# Only the tweet text is needed downstream; discard the metadata columns.
metadata_columns = ['date', 'id', 'username', 'like_count', 'retweet_count']
dataset_prediksi = dataset_prediksi.drop(columns=metadata_columns)

In [None]:
# Use the same text column name as the training frame ('tweets').
dataset_prediksi = dataset_prediksi.rename(columns={'content': 'tweets'})

In [None]:
# Down-sample to 10,000 rows; the fixed random_state makes the draw repeatable.
# NOTE(review): info() above reports 6 null `content` values, so a sampled row
# could carry NaN text into the cleaning step — confirm that is handled.
dataset_prediksi = dataset_prediksi.sample(n=10000,random_state=7)

In [None]:
# sample() keeps the original row labels; replace them with a fresh 0..n-1 index.
dataset_prediksi=dataset_prediksi.reset_index(drop=True)

In [None]:
dataset_prediksi.head()

Unnamed: 0,tweets
0,3 Mind-Blowing Chat GPT Hacks for Small YouTub...
1,Experts exploring how systems that leverage la...
2,Digital marketing is the practice of promoting...
3,(New York Post):#BuzzFeed stock surges on plan...
4,"Hey @LinusTech , look what #ChatGPT and I made..."


## Data Cleaning

In [None]:
# MODEL the data
## Pra Pengolahan - Cleaning
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch only the NLTK resources this notebook uses instead of the whole 'all'
# collection (which pulls every corpus, grammar and tagger):
#   stopwords          -> stopwords.words('english')
#   wordnet, omw-1.4   -> WordNetLemmatizer
#   punkt, punkt_tab   -> nltk.word_tokenize (newer NLTK releases look for punkt_tab)
# download() reports an unknown id as an error message rather than raising.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

def clean_text(tweet):
    """Normalise one raw tweet into lower-case ASCII text.

    Steps, in order: lower-case, drop non-ASCII characters, remove URLs
    (``www.*`` / ``http(s)://*``), remove ``@mentions``, collapse whitespace
    runs to a single space, turn ``#word`` into ``word``, and strip leading /
    trailing quote characters.

    Args:
        tweet: the tweet text. Anything that is not a ``str`` (e.g. the float
            NaN pandas uses for a missing value, or ``None``) is treated as
            an empty tweet.

    Returns:
        The cleaned string; ``''`` for non-string input.
    """
    # Missing values reach this function as float NaN / None; calling .lower()
    # on them would raise, so map them to an empty tweet instead.
    if not isinstance(tweet, str):
        return ''

    # Convert to lower case
    tweet = tweet.lower()
    # Remove every character that cannot be encoded as ASCII (emoji, accents, ...)
    tweet = tweet.encode('ascii', 'ignore').decode()
    # Raw strings keep the regex escapes (\. and \s) out of Python's own
    # string-escape processing, which warns on unknown escapes.
    # Clean www.* or https?://*
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
    # Clean @username
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Collapse any run of whitespace into one space
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Trim surrounding single/double quotes (whitespace is left as is)
    tweet = tweet.strip('\'"')

    return tweet

lemmatizer = WordNetLemmatizer()

# Topic-specific terms that are filtered out in addition to NLTK's English stop words.
_EXTRA_STOPWORDS = ('chatgpt', 'gpt', 'openai', 'ai')
# Filled on first use so the stop-word corpus is read once, not once per tweet.
_STOPWORD_CACHE = {}

def lemmatize(sentence):
    """Tokenise `sentence`, drop stop words and lemmatise the remaining tokens.

    Args:
        sentence: text to process (passed to ``nltk.word_tokenize``).

    Returns:
        The surviving lemmas joined by single spaces ('' when nothing survives).
    """
    if 'english' not in _STOPWORD_CACHE:
        # A frozenset gives constant-time membership tests in the filter below.
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english')).union(_EXTRA_STOPWORDS)
    sw = _STOPWORD_CACHE['english']

    words = nltk.word_tokenize(sentence)
    # Stop words are matched on the raw token, i.e. before lemmatisation.
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in sw)

# Clean and lemmatise the tweet text of both frames, then drop rows whose text
# became empty (no whitespace-separated token left).
# .copy() makes each filtered result an independent frame, so columns can be
# added to it later without pandas' SettingWithCopyWarning.
dataset_train['tweets'] = dataset_train['tweets'].map(clean_text).map(lemmatize)
dataset_train = dataset_train[dataset_train['tweets'].apply(lambda x: len(x.split()) >= 1)].copy()

dataset_prediksi['tweets'] = dataset_prediksi['tweets'].map(clean_text).map(lemmatize)
dataset_prediksi = dataset_prediksi[dataset_prediksi['tweets'].apply(lambda x: len(x.split()) >= 1)].copy()

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

## One-Hot Encoding pada Labels Data Train

In [None]:
## One-hot encode the labels
# dtype is pinned to uint8 because the default dtype of get_dummies differs
# between pandas versions (bool in pandas >= 2.0); this keeps the array numeric.
# NOTE(review): presumably column 0 = 'bad' and column 1 = 'good' (sorted label
# order) — verify with pd.get_dummies(dataset_train['labels']).columns.
label = pd.get_dummies(dataset_train['labels'], dtype=np.uint8).to_numpy()
label

array([[0, 1],
       [1, 0],
       [0, 1],
       ...,
       [0, 1],
       [0, 1],
       [1, 0]], dtype=uint8)

## Pemisahan Data Train-Val-Test pada Data Train

In [None]:
## Pra Pengolahan - Splitting
from sklearn.model_selection import train_test_split

# First cut: 70 % train / 30 % hold-out. Second cut: the hold-out is halved
# into validation and test. Both cuts use the same fixed seed.
train_data, holdout_data, train_labels, holdout_labels = train_test_split(
    dataset_train['tweets'], label, random_state=7, test_size=0.3)
val_data, test_data, val_labels, test_labels = train_test_split(
    holdout_data, holdout_labels, random_state=7, test_size=.5)
train_data.shape, train_labels.shape, test_data.shape, test_labels.shape, val_data.shape, val_labels.shape

((6945,), (6945, 2), (1489,), (1489, 2), (1488,), (1488, 2))

## Memanggil Model dan Tokenizer RoBERTa

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
ROBERTA_CHECKPOINT = 'roberta-base'
roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
# NOTE(review): trainable=False is presumably forwarded to the Keras model so
# the encoder weights stay frozen — confirm via roberta_model.trainable.
roberta_model = TFAutoModel.from_pretrained(ROBERTA_CHECKPOINT, trainable=False)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


## Definisikan Fungsi untuk Tokenisasi pada Satu Data

In [None]:
def tokenisasi(teks):
    """Tokenise `teks` with the RoBERTa tokenizer at a fixed length of 128.

    Special tokens are added, longer inputs are truncated and shorter ones
    are padded to ``max_length``; TensorFlow tensors are requested.

    Returns:
        tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encoded = roberta_tokenizer(
        teks,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )
    return encoded['input_ids'], encoded['attention_mask']

## Definisikan Fungsi untuk Mengambil Tokenisasi pada Semua Data

In [None]:
def create_input(data):
    """Tokenise every text in `data` and return the model inputs.

    The texts are tokenised in one batched call to `tokenisasi` instead of
    one call per text; with fixed-length padding the per-row result is the
    same, but the Python-level loop over thousands of tweets is avoided.

    Args:
        data: iterable of strings (e.g. a pandas Series of tweets).

    Returns:
        list ``[token_ids, attention_mask]`` of two int32 numpy arrays, each
        of shape (number of texts, 128).
    """
    texts = list(data)
    if not texts:
        # Nothing to tokenise: return correctly shaped empty inputs.
        return [np.empty((0, 128), dtype=np.int32),
                np.empty((0, 128), dtype=np.int32)]

    token_ids, attention_mask = tokenisasi(texts)

    return [np.asarray(token_ids, dtype=np.int32).reshape(-1, 128),
            np.asarray(attention_mask, dtype=np.int32).reshape(-1, 128)]

## Definisikan Fungsi untuk Model RoBERTa+GRU-CNN

In [None]:
def roberta(hp):
    """Build and compile the RoBERTa -> GRU -> Conv1D classifier for one tuner trial.

    Args:
        hp: keras-tuner hyperparameter container. The search space is
            ``units`` (GRU width, 100-200 step 50), ``filters`` (200-300 step
            50), ``kernel_size`` (3-5) and the L2 strengths
            ``kernel_regularizer``, ``rec_regularizer``, ``kernel_cnn`` and
            ``kernel_dense`` (each 0.01 or 0.001).

    Returns:
        A compiled keras Model with inputs ``[input_token, input_mask]`` (each
        of length 128) and a 2-way softmax output.
    """
    # Input layers: token ids and attention mask, both of fixed length 128
    input_token = keras.layers.Input(shape=(128,), dtype=np.int32,
                                     name="input_token")
    input_mask = keras.layers.Input(shape=(128,), dtype=np.int32,
                                    name="input_mask")

    # Embedding: element [0] of the encoder output is fed to the GRU as a sequence
    roberta_embedding = roberta_model([input_token, input_mask])[0]

    # GRU layer; return_sequences=True keeps one vector per position for the Conv1D
    gru = keras.layers.GRU(
        units=hp.Int('units', min_value=100, max_value=200, step=50),
        kernel_regularizer=keras.regularizers.l2(
            hp.Choice('kernel_regularizer', values=[0.01, 0.001])),
        recurrent_regularizer=keras.regularizers.l2(
            hp.Choice('rec_regularizer', values=[0.01, 0.001])),
        return_sequences=True)(roberta_embedding)

    # Convolution layer
    cnn = keras.layers.Conv1D(
        filters=hp.Int('filters', min_value=200, max_value=300, step=50),
        kernel_size=hp.Int('kernel_size', min_value=3, max_value=5, step=1),
        activation='relu',
        kernel_regularizer=keras.regularizers.l2(
            hp.Choice('kernel_cnn', values=[0.01, 0.001])))(gru)

    # Max pooling over the sequence axis
    maxpool = keras.layers.GlobalMaxPooling1D()(cnn)

    # Dropout layer
    cnn_out = keras.layers.Dropout(0.5)(maxpool)

    # Output layer. The labels are one-hot over two mutually exclusive classes
    # and the loss is categorical cross-entropy, so the output must be a
    # probability distribution: softmax, not two independent sigmoids.
    output = keras.layers.Dense(
        2, activation='softmax',
        kernel_regularizer=keras.regularizers.l2(
            hp.Choice('kernel_dense', values=[0.01, 0.001])))(cnn_out)

    model = keras.models.Model(inputs=[input_token, input_mask], outputs=output)
    model.compile(optimizer=keras.optimizers.Adam(1e-3),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

class ClearTrainingOutput(keras.callbacks.Callback):
    """Keras callback that clears the notebook cell output when training ends."""

    def on_train_end(self, logs=None):
        # Signature follows keras' Callback.on_train_end(self, logs=None).
        # wait=True postpones the clear until new output is ready to replace it.
        IPython.display.clear_output(wait = True)

# Stop training after 5 epochs without a val_accuracy improvement and restore
# the weights of the best epoch.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

## Membuat tokenID untuk X_train, X_test, dan X_val

In [None]:
# Tokenise each split into the [token ids, attention mask] pair the model expects.
roberta_train_data, roberta_val_data, roberta_test_data = [
    create_input(split) for split in (train_data, val_data, test_data)
]

## Hyperparameter Tuning

In [None]:
# Bayesian search over the hyperparameters declared in roberta(hp); trials are
# ranked by validation accuracy and results are stored under /content/Hasil.
tuner = BayesianOptimization(
    roberta,
    objective='val_accuracy',
    max_trials=5,
    directory='/content/Hasil',
    project_name='Sentiment-RoBERTa',
    overwrite=False,
)

# Each trial trains for up to 50 epochs, cut short by early stopping.
search_callbacks = [early_stop, ClearTrainingOutput()]
tuner.search(
    roberta_train_data,
    train_labels,
    epochs=50,
    batch_size=32,
    validation_data=(roberta_val_data, val_labels),
    callbacks=search_callbacks,
)

Trial 2 Complete [00h 19m 46s]
val_accuracy: 0.8655914068222046

Best val_accuracy So Far: 0.8655914068222046
Total elapsed time: 00h 35m 23s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
100               |150               |units
0.001             |0.001             |kernel_regularizer
0.01              |0.01              |rec_regularizer
200               |250               |filters
5                 |3                 |kernel_size
0.01              |0.001             |kernel_cnn
0.001             |0.01              |kernel_dense

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
 19/218 [=>............................] - ETA: 55s - loss: 0.4402 - accuracy: 0.8470

## Mendapatkan Hyperparameter yang Optimal

In [None]:
# Fetch the best hyperparameter set found by the search and print each value.
best_hps = tuner.get_best_hyperparameters()[0]

report = []
for caption, hp_name in (
    ('\nThe hyperparameter search is complete.' '\nunits:', 'units'),
    ('\nkernel regularizer:', 'kernel_regularizer'),
    ('\nrec regularizer:', 'rec_regularizer'),
    ('\nfilters:', 'filters'),
    ('\nkernel size:', 'kernel_size'),
    ('\nkernel cnn:', 'kernel_cnn'),
    ('\nkernel dense:', 'kernel_dense'),
):
    # Caption and value are separate print arguments, so print() puts a space between them.
    report.extend((caption, best_hps.get(hp_name)))
print(*report)

## Mendapatkan Model Terbaik untuk Prediksi

In [None]:
# Take the top-ranked model from the tuner (first element of get_best_models()).
model = tuner.get_best_models()[0]

## Mendapatkan Kinerja Model

In [None]:
## Evaluate the selected model on the held-out test split
test_loss, test_acc = model.evaluate(roberta_test_data, test_labels)
print('Test accuracy:', test_acc)

## Melakukan Prediksi pada Data Prediksi

In [None]:
# Tokenise the unlabelled tweets into the [token ids, attention mask] model inputs.
roberta_pred_data=create_input(dataset_prediksi['tweets'])

In [None]:
# Predicted class = index of the larger of the two output scores per tweet.
# NOTE(review): presumably 0 = 'bad' and 1 = 'good', following the one-hot
# column order of the training labels — verify before interpreting.
y_pred = np.argmax(model.predict(roberta_pred_data), axis=1)

In [None]:
# Attach the predicted class index to each tweet (row order matches the model inputs).
dataset_prediksi['Sentiment'] = y_pred

In [None]:
dataset_prediksi

In [None]:
from google.colab import files

# Write the predictions and download that same file. A single name is used for
# both calls: the download previously asked for
# 'dataset_prediksi_with_sentiment_.csv' (stray underscore), a file that is
# never written.
OUTPUT_CSV = 'dataset_prediksi_with_sentiment.csv'
dataset_prediksi.to_csv(OUTPUT_CSV, encoding = 'utf-8-sig')
files.download(OUTPUT_CSV)

# Deteksi Topik Menggunakan RoBERTa-EFCM

### Dataset

In [None]:
# Reload the tweets (with predicted sentiment) that the sentiment section exported.
dataset_prediksi= pd.read_csv(r"/content/dataset_prediksi_with_sentiment.csv")

In [None]:
# The export wrote the DataFrame index as an unnamed first column; remove it.
del dataset_prediksi['Unnamed: 0']

In [None]:
# Work on an independent copy: a plain assignment would only create a second
# name for the same DataFrame, so column edits made through `dataset` would
# also change `dataset_prediksi`.
dataset = dataset_prediksi.copy()

In [None]:
# Topic detection uses the tweet text only, so remove the sentiment column.
del dataset['Sentiment']

### Representasi Topik dengan RoBERTa

In [None]:
import numpy as np

In [None]:
## Pra Pengolahan - Representasi Data

def tokenisasi(teks):
    """Tokenise `teks` with the RoBERTa tokenizer at a fixed length of 128.

    Special tokens are added, longer inputs are truncated and shorter ones
    are padded to ``max_length``; TensorFlow tensors are requested.

    Returns:
        tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encode_dict = roberta_tokenizer(teks,
                                   add_special_tokens = True,
                                   max_length = 128,
                                   padding = 'max_length',
                                   truncation = True,
                                   return_attention_mask = True,
                                   return_tensors = 'tf',)

    tokenID = encode_dict['input_ids']
    attention_mask = encode_dict['attention_mask']

    # Single exit point; the unreachable second `return tokenID` was removed.
    return tokenID, attention_mask

def roberta_(data):
    """Embed every item of `data` with the RoBERTa encoder, one item at a time.

    Each item is converted with ``str()``, tokenised with `tokenisasi`, and
    passed through `roberta_model`; element ``[1]`` of the model output is
    kept (NOTE(review): presumably the pooled sentence vector — confirm).

    Returns:
        list with one embedding (a plain Python list of floats) per item.
    """
    def _embed(teks):
        token, mask = tokenisasi(str(teks))
        sentence_output = roberta_model([token, mask])[1]
        # The batch holds a single text, so take row 0 of the converted array.
        return sentence_output.numpy().tolist()[0]

    return [_embed(teks) for teks in data]

### RoBERTa embedding
# Iterate over the tweet strings themselves. Iterating dataset.to_numpy()
# yields one 1-element array per row, and str() of such an array is
# "['tweet text']" — the brackets and quotes would be tokenised and embedded
# as part of every tweet.
roberta_emb = np.array(roberta_(dataset['tweets']))

print(roberta_emb.shape)

(9993, 768)


### Pendefinisian fungsi-fungsi

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import sys
sys.path.insert(0, "/content/drive/MyDrive/Deep Learning/FCMeans")
from fcmeans import fcmeans

In [None]:
### Topic Interpretation
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

#### Class Based TFIDF (C-TFIDF) Functions
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores for a set of per-class documents.

    Args:
        documents: one text per class (all documents of a class joined together).
        m: numerator of the IDF term; term t is weighted by log(m / total count of t).
        ngram_range: forwarded to CountVectorizer.

    Returns:
        tuple ``(tf_idf, count)``: `tf_idf` has one row per vocabulary term and
        one column per input document; `count` is the fitted CountVectorizer.
    """
    count = CountVectorizer(ngram_range=ngram_range).fit(documents)
    term_counts = count.transform(documents).toarray()

    # Term frequency: counts of each term divided by the total count of its document
    tokens_per_document = term_counts.sum(axis=1)
    tf = term_counts.T / tokens_per_document

    # Inverse frequency: one value per term, shaped as a column so it broadcasts over documents
    count_per_term = term_counts.sum(axis=0)
    idf = np.log(m / count_per_term).reshape(-1, 1)

    return tf * idf, count

def extract_top_words_per_topic(tf_idf, count, docs_per_topic, n):
    """Return the `n` highest-scoring words of every topic, best word first.

    Args:
        tf_idf: 2-D array indexed [word, topic]; it is transposed before ranking.
        count: fitted vectorizer; only ``get_feature_names_out()`` is used.
        docs_per_topic: object whose ``Topic`` attribute has one entry per
            topic; only the number of entries is used.
        n: number of words to keep per topic.

    Returns:
        list of lists: element i holds the top-`n` words of topic row i in
        descending score order.
    """
    words = count.get_feature_names_out()
    n_topics = len(list(docs_per_topic.Topic))
    # argsort is ascending, so the last n columns are the n largest scores.
    indices = tf_idf.T.argsort()[:, -n:]
    # Reverse each row so the best word comes first. (The (word, score)
    # dictionary that used to be built here was never used and is gone.)
    return [[words[j] for j in indices[i]][::-1] for i in range(n_topics)]

In [None]:
### Topic Coherence
import gensim
from itertools import combinations
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Load the pretrained Word2Vec model used to score topic coherence
# NOTE(review): the path points to an "idwiki-berita" model while the tweets are
# filtered with English stop words — presumably many top words will be missing
# from this vocabulary; confirm the model matches the tweet language.
w2v_model = gensim.models.Word2Vec.load("/content/drive/MyDrive/Deep Learning/Data/word2vec/idwiki-berita/w2v-model.bin")

### Fungsi Menghitung Coherence
def calculate_coherence( w2v_model, term_rankings ):
    """Mean Word2Vec coherence over a list of topics.

    For every topic, the absolute similarity of each pair of its terms is
    averaged; pairs with a term missing from the model vocabulary are skipped
    (not scored as 0). A topic without any scorable pair contributes 0. The
    result is the sum of topic scores divided by the number of topics.

    Args:
        w2v_model: model exposing ``wv.key_to_index`` and ``wv.similarity``.
        term_rankings: list of topics, each a list of terms.

    Returns:
        float coherence; 0.0 when `term_rankings` is empty.
    """
    if len(term_rankings) == 0:
        # No topics: avoid dividing by zero below.
        return 0.0

    # Dict lookup is O(1); testing membership in the index_to_key list scans
    # the whole vocabulary for every term of every pair.
    vocab = w2v_model.wv.key_to_index

    overall_coherence = 0.0
    for topic_terms in term_rankings:
        # check each pair of terms
        pair_scores = [
            abs(w2v_model.wv.similarity(first, second))
            for first, second in combinations(topic_terms, 2)
            if first in vocab and second in vocab
        ]

        # get the mean for all pairs in this topic
        if pair_scores:
            overall_coherence += sum(pair_scores) / len(pair_scores)

    # get the mean score across all topics
    return overall_coherence / len(term_rankings)

In [None]:
def extract_topic_sizes(df):
    """Count the documents of each topic, largest topic first.

    Args:
        df: DataFrame with a ``Topic`` and a ``Doc`` column.

    Returns:
        DataFrame with columns ``Topic`` and ``Size`` (number of non-null
        ``Doc`` values), sorted by ``Size`` in descending order.
    """
    docs_per_topic = df.groupby(['Topic'])['Doc'].count()
    size_table = docs_per_topic.reset_index().rename(columns={"Doc": "Size"})
    return size_table.sort_values(by="Size", ascending=False)

### Tuning Hyperparameter 1

In [None]:
## Model EFCM - Reduksi Dimensi
from sklearn.decomposition import TruncatedSVD

In [None]:
## Parameter Tuning
### Jumlah Topik Optimal
import pandas as pd

# Grid search over (n_top_words, n_topics, n_components, m). Each run reduces
# the embeddings with truncated SVD, clusters them with `fcmeans` (initialised
# from k-means centres) and scores the resulting topics by Word2Vec coherence.
# All five bookkeeping lists are appended together in the innermost loop, so
# entry i of every list describes the same run and they can be combined into
# one results table.
# NOTE(review): TruncatedSVD and KMeans get no random_state, so the score of a
# given setting can differ between runs — confirm this is intended.
val_m=[]
num_components=[]
num_topics = []
num_top_words=[]
coherences = []
for n_top_words in [10,15,20]:
  for n_topics in [5,10,15,20]:
    for n_components in [3,5]:
      for m in [1.1,1.3]:
        num_top_words.append(n_top_words)
        num_topics.append(n_topics)
        num_components.append(n_components)
        val_m.append(m)

        #topic detection
        svd = TruncatedSVD(n_components)
        data = svd.fit_transform(roberta_emb)
        initkm = KMeans(n_clusters=n_topics, n_init=1).fit(data)
        cntr, u = fcmeans(data.T, n_topics, m, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
        # Hard assignment: index of the largest value of u along axis 0
        # (assumes u is clusters x documents — TODO confirm against fcmeans)
        cluster_membership = np.argmax(u, axis=0)

        docs = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
        docs['Topic'] = cluster_membership
        docs['Doc_ID'] = range(len(docs))
        # One concatenated text per topic, the input format of c_tf_idf
        docs_per_topic = docs.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

        tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs))
        top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, n_top_words)

        # Score once and reuse the value for both the record and the log line
        coherence = calculate_coherence(w2v_model, top_words)
        coherences.append(coherence)
        print('Number of Top Words : ',n_top_words)
        print('Number of Topics : ',n_topics)
        print('Number of n components: ',n_components)
        print('Value of m : ',m)
        print('Coherence value : ',coherence)
        print('--------------------------------------------------------------')

Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.3
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.1
Coherence value :  0.266711268145591
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.3
Coherence value :  0.2524867153002156
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  10
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26520570978744046
--------------------------------------------------------------
Number of Top Words :  10
N

### Tuning Hyperparameter 2

In [None]:
## Model EFCM - Reduksi Dimensi
from sklearn.decomposition import TruncatedSVD

In [None]:
## Parameter Tuning
### Jumlah Topik Optimal
import pandas as pd

# Grid search over (n_top_words, n_topics, n_components, m). Each run reduces
# the embeddings with truncated SVD, clusters them with `fcmeans` (initialised
# from k-means centres) and scores the resulting topics by Word2Vec coherence.
# All five bookkeeping lists are appended together in the innermost loop, so
# entry i of every list describes the same run and they can be combined into
# one results table.
# NOTE(review): TruncatedSVD and KMeans get no random_state, so the score of a
# given setting can differ between runs — confirm this is intended.
val_m=[]
num_components=[]
num_topics = []
num_top_words=[]
coherences = []
for n_top_words in [10,15,20]:
  for n_topics in [5,10,15,20]:
    for n_components in [3,5]:
      for m in [1.1,1.3]:
        num_top_words.append(n_top_words)
        num_topics.append(n_topics)
        num_components.append(n_components)
        val_m.append(m)

        #topic detection
        svd = TruncatedSVD(n_components)
        data = svd.fit_transform(roberta_emb)
        initkm = KMeans(n_clusters=n_topics, n_init=1).fit(data)
        cntr, u = fcmeans(data.T, n_topics, m, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
        # Hard assignment: index of the largest value of u along axis 0
        # (assumes u is clusters x documents — TODO confirm against fcmeans)
        cluster_membership = np.argmax(u, axis=0)

        docs = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
        docs['Topic'] = cluster_membership
        docs['Doc_ID'] = range(len(docs))
        # One concatenated text per topic, the input format of c_tf_idf
        docs_per_topic = docs.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

        tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs))
        top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, n_top_words)

        # Score once and reuse the value for both the record and the log line
        coherence = calculate_coherence(w2v_model, top_words)
        coherences.append(coherence)
        print('Number of Top Words : ',n_top_words)
        print('Number of Topics : ',n_topics)
        print('Number of n components: ',n_components)
        print('Value of m : ',m)
        print('Coherence value : ',coherence)
        print('--------------------------------------------------------------')

Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.3
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.1
Coherence value :  0.2557682666348086
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.3
Coherence value :  0.2637263418154584
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  10
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26520570978744046
--------------------------------------------------------------
Number of Top Words :  10


### Tuning Hyperparameter 3

In [None]:
## Model EFCM - Reduksi Dimensi
from sklearn.decomposition import TruncatedSVD

In [None]:
## Parameter Tuning
### Jumlah Topik Optimal
import pandas as pd

# Grid search over (n_top_words, n_topics, n_components, m). Each run reduces
# the embeddings with truncated SVD, clusters them with `fcmeans` (initialised
# from k-means centres) and scores the resulting topics by Word2Vec coherence.
# All five bookkeeping lists are appended together in the innermost loop, so
# entry i of every list describes the same run and they can be combined into
# one results table.
# NOTE(review): TruncatedSVD and KMeans get no random_state, so the score of a
# given setting can differ between runs — confirm this is intended.
val_m=[]
num_components=[]
num_topics = []
num_top_words=[]
coherences = []
for n_top_words in [10,15,20]:
  for n_topics in [5,10,15,20]:
    for n_components in [3,5]:
      for m in [1.1,1.3]:
        num_top_words.append(n_top_words)
        num_topics.append(n_topics)
        num_components.append(n_components)
        val_m.append(m)

        #topic detection
        svd = TruncatedSVD(n_components)
        data = svd.fit_transform(roberta_emb)
        initkm = KMeans(n_clusters=n_topics, n_init=1).fit(data)
        cntr, u = fcmeans(data.T, n_topics, m, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
        # Hard assignment: index of the largest value of u along axis 0
        # (assumes u is clusters x documents — TODO confirm against fcmeans)
        cluster_membership = np.argmax(u, axis=0)

        docs = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
        docs['Topic'] = cluster_membership
        docs['Doc_ID'] = range(len(docs))
        # One concatenated text per topic, the input format of c_tf_idf
        docs_per_topic = docs.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

        tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs))
        top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, n_top_words)

        # Score once and reuse the value for both the record and the log line
        coherence = calculate_coherence(w2v_model, top_words)
        coherences.append(coherence)
        print('Number of Top Words : ',n_top_words)
        print('Number of Topics : ',n_topics)
        print('Number of n components: ',n_components)
        print('Value of m : ',m)
        print('Coherence value : ',coherence)
        print('--------------------------------------------------------------')

Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.3
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.1
Coherence value :  0.25576826663480867
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.3
Coherence value :  0.2524867153002156
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  10
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26520570978744046
--------------------------------------------------------------
Number of Top Words :  10

### Tuning Hyperparameter 4

In [None]:
## Model EFCM - Reduksi Dimensi
from sklearn.decomposition import TruncatedSVD

In [None]:
## Parameter Tuning
### Jumlah Topik Optimal
import pandas as pd

# Grid search over (n_top_words, n_topics, n_components, m). Each run reduces
# the embeddings with truncated SVD, clusters them with `fcmeans` (initialised
# from k-means centres) and scores the resulting topics by Word2Vec coherence.
# All five bookkeeping lists are appended together in the innermost loop, so
# entry i of every list describes the same run and they can be combined into
# one results table.
# NOTE(review): TruncatedSVD and KMeans get no random_state, so the score of a
# given setting can differ between runs — confirm this is intended.
val_m=[]
num_components=[]
num_topics = []
num_top_words=[]
coherences = []
for n_top_words in [10,15,20]:
  for n_topics in [5,10,15,20]:
    for n_components in [3,5]:
      for m in [1.1,1.3]:
        num_top_words.append(n_top_words)
        num_topics.append(n_topics)
        num_components.append(n_components)
        val_m.append(m)

        #topic detection
        svd = TruncatedSVD(n_components)
        data = svd.fit_transform(roberta_emb)
        initkm = KMeans(n_clusters=n_topics, n_init=1).fit(data)
        cntr, u = fcmeans(data.T, n_topics, m, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
        # Hard assignment: index of the largest value of u along axis 0
        # (assumes u is clusters x documents — TODO confirm against fcmeans)
        cluster_membership = np.argmax(u, axis=0)

        docs = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
        docs['Topic'] = cluster_membership
        docs['Doc_ID'] = range(len(docs))
        # One concatenated text per topic, the input format of c_tf_idf
        docs_per_topic = docs.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

        tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs))
        top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, n_top_words)

        # Score once and reuse the value for both the record and the log line
        coherence = calculate_coherence(w2v_model, top_words)
        coherences.append(coherence)
        print('Number of Top Words : ',n_top_words)
        print('Number of Topics : ',n_topics)
        print('Number of n components: ',n_components)
        print('Value of m : ',m)
        print('Coherence value : ',coherence)
        print('--------------------------------------------------------------')

Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.3
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.1
Coherence value :  0.26671126814559104
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.3
Coherence value :  0.2524867153002156
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  10
Number of n components:  3
Value of m :  1.1
Coherence value :  0.2652057097874404
--------------------------------------------------------------
Number of Top Words :  10


### Tuning Hyperparameter 5

In [None]:
## Model EFCM - Reduksi Dimensi
from sklearn.decomposition import TruncatedSVD

In [None]:
## Parameter Tuning
### Jumlah Topik Optimal
import pandas as pd

# Grid search over (n_top_words, n_topics, n_components, m). Each run reduces
# the embeddings with truncated SVD, clusters them with `fcmeans` (initialised
# from k-means centres) and scores the resulting topics by Word2Vec coherence.
# All five bookkeeping lists are appended together in the innermost loop, so
# entry i of every list describes the same run and they can be combined into
# one results table.
# NOTE(review): TruncatedSVD and KMeans get no random_state, so the score of a
# given setting can differ between runs — confirm this is intended.
val_m=[]
num_components=[]
num_topics = []
num_top_words=[]
coherences = []
for n_top_words in [10,15,20]:
  for n_topics in [5,10,15,20]:
    for n_components in [3,5]:
      for m in [1.1,1.3]:
        num_top_words.append(n_top_words)
        num_topics.append(n_topics)
        num_components.append(n_components)
        val_m.append(m)

        #topic detection
        svd = TruncatedSVD(n_components)
        data = svd.fit_transform(roberta_emb)
        initkm = KMeans(n_clusters=n_topics, n_init=1).fit(data)
        cntr, u = fcmeans(data.T, n_topics, m, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
        # Hard assignment: index of the largest value of u along axis 0
        # (assumes u is clusters x documents — TODO confirm against fcmeans)
        cluster_membership = np.argmax(u, axis=0)

        docs = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
        docs['Topic'] = cluster_membership
        docs['Doc_ID'] = range(len(docs))
        # One concatenated text per topic, the input format of c_tf_idf
        docs_per_topic = docs.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

        tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs))
        top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, n_top_words)

        # Score once and reuse the value for both the record and the log line
        coherence = calculate_coherence(w2v_model, top_words)
        coherences.append(coherence)
        print('Number of Top Words : ',n_top_words)
        print('Number of Topics : ',n_topics)
        print('Number of n components: ',n_components)
        print('Value of m : ',m)
        print('Coherence value : ',coherence)
        print('--------------------------------------------------------------')

Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.3
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.1
Coherence value :  0.25576826663480867
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.3
Coherence value :  0.2524867153002156
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  10
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26520570978744046
--------------------------------------------------------------
Number of Top Words :  10

### Tuning Hyperparameter 6

In [None]:
## Model EFCM - Reduksi Dimensi
from sklearn.decomposition import TruncatedSVD

In [None]:
## Parameter Tuning
### Optimal number of topics
import pandas as pd

# Grid search over the topic-model settings. Each list below receives exactly
# one entry per evaluated combination, so all of them stay aligned row-for-row
# with `coherences` and can be zipped or turned into a DataFrame afterwards.
val_m = []
num_components = []
num_topics = []
num_top_words = []
coherences = []
for n_top_words in [10, 15, 20]:
  for n_topics in [5, 10, 15, 20]:
    for n_components in [3, 5]:
      for m in [1.1, 1.3]:
        # Record the full parameter combination of this run.
        num_top_words.append(n_top_words)
        num_topics.append(n_topics)
        num_components.append(n_components)
        val_m.append(m)

        # Topic detection: reduce the embeddings with truncated SVD, use the
        # centres of a single KMeans fit as the initial centres for fcmeans,
        # then assign every document to the cluster with the largest value in u.
        # NOTE(review): no random_state is passed to TruncatedSVD/KMeans, so
        # results may vary between runs.
        svd = TruncatedSVD(n_components)
        data = svd.fit_transform(roberta_emb)
        initkm = KMeans(n_clusters=n_topics, n_init=1).fit(data)
        cntr, u = fcmeans(data.T, n_topics, m, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
        cluster_membership = np.argmax(u, axis=0)

        # Concatenate all tweets of a topic into one string per topic.
        docs = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
        docs['Topic'] = cluster_membership
        docs['Doc_ID'] = range(len(docs))
        docs_per_topic = docs.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

        # Here `m=` is c_tf_idf's keyword argument (the document count), not
        # the loop variable m above.
        tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs))
        top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, n_top_words)

        # Score once and reuse the value for both the record and the log line.
        coherence = calculate_coherence(w2v_model, top_words)
        coherences.append(coherence)
        print('Number of Top Words : ',n_top_words)
        print('Number of Topics : ',n_topics)
        print('Number of n components: ',n_components)
        print('Value of m : ',m)
        print('Coherence value : ',coherence)
        print('--------------------------------------------------------------')

Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.3
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.1
Coherence value :  0.25576826663480867
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.3
Coherence value :  0.2524867153002156
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  10
Number of n components:  3
Value of m :  1.1
Coherence value :  0.2479309798584926
--------------------------------------------------------------
Number of Top Words :  10


### Tuning Hyperparameter 7

In [None]:
## Model EFCM - Reduksi Dimensi
from sklearn.decomposition import TruncatedSVD

In [None]:
## Parameter Tuning
### Optimal number of topics
import pandas as pd

# Grid search over the topic-model settings. Each list below receives exactly
# one entry per evaluated combination, so all of them stay aligned row-for-row
# with `coherences` and can be zipped or turned into a DataFrame afterwards.
val_m = []
num_components = []
num_topics = []
num_top_words = []
coherences = []
for n_top_words in [10, 15, 20]:
  for n_topics in [5, 10, 15, 20]:
    for n_components in [3, 5]:
      for m in [1.1, 1.3]:
        # Record the full parameter combination of this run.
        num_top_words.append(n_top_words)
        num_topics.append(n_topics)
        num_components.append(n_components)
        val_m.append(m)

        # Topic detection: reduce the embeddings with truncated SVD, use the
        # centres of a single KMeans fit as the initial centres for fcmeans,
        # then assign every document to the cluster with the largest value in u.
        # NOTE(review): no random_state is passed to TruncatedSVD/KMeans, so
        # results may vary between runs.
        svd = TruncatedSVD(n_components)
        data = svd.fit_transform(roberta_emb)
        initkm = KMeans(n_clusters=n_topics, n_init=1).fit(data)
        cntr, u = fcmeans(data.T, n_topics, m, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
        cluster_membership = np.argmax(u, axis=0)

        # Concatenate all tweets of a topic into one string per topic.
        docs = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
        docs['Topic'] = cluster_membership
        docs['Doc_ID'] = range(len(docs))
        docs_per_topic = docs.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

        # Here `m=` is c_tf_idf's keyword argument (the document count), not
        # the loop variable m above.
        tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs))
        top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, n_top_words)

        # Score once and reuse the value for both the record and the log line.
        coherence = calculate_coherence(w2v_model, top_words)
        coherences.append(coherence)
        print('Number of Top Words : ',n_top_words)
        print('Number of Topics : ',n_topics)
        print('Number of n components: ',n_components)
        print('Value of m : ',m)
        print('Coherence value : ',coherence)
        print('--------------------------------------------------------------')

Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.3
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.1
Coherence value :  0.2557682666348086
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.3
Coherence value :  0.2637263418154584
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  10
Number of n components:  3
Value of m :  1.1
Coherence value :  0.2652057097874404
--------------------------------------------------------------
Number of Top Words :  10
N

### Tuning Hyperparameter 8

In [None]:
## Model EFCM - Reduksi Dimensi
from sklearn.decomposition import TruncatedSVD

In [None]:
## Parameter Tuning
### Optimal number of topics
import pandas as pd

# Grid search over the topic-model settings. Each list below receives exactly
# one entry per evaluated combination, so all of them stay aligned row-for-row
# with `coherences` and can be zipped or turned into a DataFrame afterwards.
val_m = []
num_components = []
num_topics = []
num_top_words = []
coherences = []
for n_top_words in [10, 15, 20]:
  for n_topics in [5, 10, 15, 20]:
    for n_components in [3, 5]:
      for m in [1.1, 1.3]:
        # Record the full parameter combination of this run.
        num_top_words.append(n_top_words)
        num_topics.append(n_topics)
        num_components.append(n_components)
        val_m.append(m)

        # Topic detection: reduce the embeddings with truncated SVD, use the
        # centres of a single KMeans fit as the initial centres for fcmeans,
        # then assign every document to the cluster with the largest value in u.
        # NOTE(review): no random_state is passed to TruncatedSVD/KMeans, so
        # results may vary between runs.
        svd = TruncatedSVD(n_components)
        data = svd.fit_transform(roberta_emb)
        initkm = KMeans(n_clusters=n_topics, n_init=1).fit(data)
        cntr, u = fcmeans(data.T, n_topics, m, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
        cluster_membership = np.argmax(u, axis=0)

        # Concatenate all tweets of a topic into one string per topic.
        docs = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
        docs['Topic'] = cluster_membership
        docs['Doc_ID'] = range(len(docs))
        docs_per_topic = docs.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

        # Here `m=` is c_tf_idf's keyword argument (the document count), not
        # the loop variable m above.
        tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs))
        top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, n_top_words)

        # Score once and reuse the value for both the record and the log line.
        coherence = calculate_coherence(w2v_model, top_words)
        coherences.append(coherence)
        print('Number of Top Words : ',n_top_words)
        print('Number of Topics : ',n_topics)
        print('Number of n components: ',n_components)
        print('Value of m : ',m)
        print('Coherence value : ',coherence)
        print('--------------------------------------------------------------')

Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.3
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.1
Coherence value :  0.266711268145591
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.3
Coherence value :  0.2637263418154584
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  10
Number of n components:  3
Value of m :  1.1
Coherence value :  0.25519764366958825
--------------------------------------------------------------
Number of Top Words :  10
N

### Tuning Hyperparameter 9

In [None]:
## Model EFCM - Reduksi Dimensi
from sklearn.decomposition import TruncatedSVD

In [None]:
## Parameter Tuning
### Optimal number of topics
import pandas as pd

# Grid search over the topic-model settings. Each list below receives exactly
# one entry per evaluated combination, so all of them stay aligned row-for-row
# with `coherences` and can be zipped or turned into a DataFrame afterwards.
val_m = []
num_components = []
num_topics = []
num_top_words = []
coherences = []
for n_top_words in [10, 15, 20]:
  for n_topics in [5, 10, 15, 20]:
    for n_components in [3, 5]:
      for m in [1.1, 1.3]:
        # Record the full parameter combination of this run.
        num_top_words.append(n_top_words)
        num_topics.append(n_topics)
        num_components.append(n_components)
        val_m.append(m)

        # Topic detection: reduce the embeddings with truncated SVD, use the
        # centres of a single KMeans fit as the initial centres for fcmeans,
        # then assign every document to the cluster with the largest value in u.
        # NOTE(review): no random_state is passed to TruncatedSVD/KMeans, so
        # results may vary between runs.
        svd = TruncatedSVD(n_components)
        data = svd.fit_transform(roberta_emb)
        initkm = KMeans(n_clusters=n_topics, n_init=1).fit(data)
        cntr, u = fcmeans(data.T, n_topics, m, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
        cluster_membership = np.argmax(u, axis=0)

        # Concatenate all tweets of a topic into one string per topic.
        docs = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
        docs['Topic'] = cluster_membership
        docs['Doc_ID'] = range(len(docs))
        docs_per_topic = docs.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

        # Here `m=` is c_tf_idf's keyword argument (the document count), not
        # the loop variable m above.
        tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs))
        top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, n_top_words)

        # Score once and reuse the value for both the record and the log line.
        coherence = calculate_coherence(w2v_model, top_words)
        coherences.append(coherence)
        print('Number of Top Words : ',n_top_words)
        print('Number of Topics : ',n_topics)
        print('Number of n components: ',n_components)
        print('Value of m : ',m)
        print('Coherence value : ',coherence)
        print('--------------------------------------------------------------')

Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.3
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.1
Coherence value :  0.266711268145591
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.3
Coherence value :  0.2637263418154584
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  10
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26520570978744046
--------------------------------------------------------------
Number of Top Words :  10
N

### Tuning Hyperparameter 10

In [None]:
## Model EFCM - Reduksi Dimensi
from sklearn.decomposition import TruncatedSVD

In [None]:
## Parameter Tuning
### Optimal number of topics
import pandas as pd

# Grid search over the topic-model settings. Each list below receives exactly
# one entry per evaluated combination, so all of them stay aligned row-for-row
# with `coherences` and can be zipped or turned into a DataFrame afterwards.
val_m = []
num_components = []
num_topics = []
num_top_words = []
coherences = []
for n_top_words in [10, 15, 20]:
  for n_topics in [5, 10, 15, 20]:
    for n_components in [3, 5]:
      for m in [1.1, 1.3]:
        # Record the full parameter combination of this run.
        num_top_words.append(n_top_words)
        num_topics.append(n_topics)
        num_components.append(n_components)
        val_m.append(m)

        # Topic detection: reduce the embeddings with truncated SVD, use the
        # centres of a single KMeans fit as the initial centres for fcmeans,
        # then assign every document to the cluster with the largest value in u.
        # NOTE(review): no random_state is passed to TruncatedSVD/KMeans, so
        # results may vary between runs.
        svd = TruncatedSVD(n_components)
        data = svd.fit_transform(roberta_emb)
        initkm = KMeans(n_clusters=n_topics, n_init=1).fit(data)
        cntr, u = fcmeans(data.T, n_topics, m, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
        cluster_membership = np.argmax(u, axis=0)

        # Concatenate all tweets of a topic into one string per topic.
        docs = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
        docs['Topic'] = cluster_membership
        docs['Doc_ID'] = range(len(docs))
        docs_per_topic = docs.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

        # Here `m=` is c_tf_idf's keyword argument (the document count), not
        # the loop variable m above.
        tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs))
        top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, n_top_words)

        # Score once and reuse the value for both the record and the log line.
        coherence = calculate_coherence(w2v_model, top_words)
        coherences.append(coherence)
        print('Number of Top Words : ',n_top_words)
        print('Number of Topics : ',n_topics)
        print('Number of n components: ',n_components)
        print('Value of m : ',m)
        print('Coherence value : ',coherence)
        print('--------------------------------------------------------------')

Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26960401886275837
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  3
Value of m :  1.3
Coherence value :  0.2696040188627583
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.1
Coherence value :  0.25576826663480867
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  5
Number of n components:  5
Value of m :  1.3
Coherence value :  0.2637263418154584
--------------------------------------------------------------
Number of Top Words :  10
Number of Topics :  10
Number of n components:  3
Value of m :  1.1
Coherence value :  0.26520570978744046
--------------------------------------------------------------
Number of Top Words :  10


### Clustering

In [None]:
# Final clustering with one fixed configuration.
# NOTE(review): these values are presumably the ones selected from the tuning
# runs above -- confirm against the tuning results.
FINAL_N_COMPONENTS = 5    # SVD output dimensions
FINAL_N_TOPICS = 20       # number of clusters / topics
FINAL_M = 1.1             # third positional argument of fcmeans (loop variable `m` in the tuning cells)
FINAL_N_TOP_WORDS = 15    # words reported per topic

svd = TruncatedSVD(FINAL_N_COMPONENTS)
data = svd.fit_transform(roberta_emb)
# Use the centres of a single KMeans fit as the initial centres for fcmeans.
initkm = KMeans(n_clusters=FINAL_N_TOPICS, n_init=1).fit(data)
cntr, u = fcmeans(data.T, FINAL_N_TOPICS, FINAL_M, error=0.0001, maxiter=200, init=initkm.cluster_centers_.T)
# Hard assignment: each document goes to the cluster with the largest value in u.
cluster_membership = np.argmax(u, axis=0)

docs1 = pd.DataFrame(list(dataset["tweets"]), columns=["Doc"])
docs1['Topic'] = cluster_membership
# Size everything from docs1 itself rather than from `docs`, which only exists
# as a leftover of the tuning loops.
docs1['Doc_ID'] = range(len(docs1))
docs_per_topic = docs1.dropna(subset=['Doc']).groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs1))
top_words = extract_top_words_per_topic(tf_idf, count, docs_per_topic, FINAL_N_TOP_WORDS)
topic_sizes = extract_topic_sizes(docs1)

print(top_words)
print(topic_sizes)

[['chat', 've', 'let', 'know', 're', 'valentine', 'google', 'via', 'bing', 'asked', 'day', 'like', 'use', 'write', 'prompt'], ['chat', 'asked', 'like', 'question', 'think', 'know', 'make', 'answer', 'write', 'one', 'would', 'use', 'people', 'could', 'need'], ['chat', 'artificialintelligence', 'asked', 'write', 'midjourney', 'like', 'know', 'could', 'google', 'via', 'say', 'people', 'aiart', 'think', 'become'], ['artificialintelligence', 'nft', 'technology', 'machinelearning', 'microsoft', 'tech', 'chatbot', 'airdrop', 'google', 'gpt', 'crypto', 'fintech', 'web3', 'cybersecurity', 'via'], ['amp', 'google', 'new', 'tech', 'bard', 're', 'chat', 'future', 'nft', 'world', 'miss', 'technology', 'artificialintelligence', 'microsoft', 'one'], ['chat', 'using', 'use', 'gpt', 'ask', 'make', 'google', 'check', 'gpt4', 'new', 'write', 'get', 'via', 'tool', 'question'], ['chat', 'gt', 'use', 'ask', 'love', 'mem', 'thought', 'gpt', 'try', 'using', 'written', 'save', 'friend', 'best', 'thread'], ['am

In [None]:
### Compute the coherence value of the final topics
final_coherence = calculate_coherence(w2v_model, top_words)
print(final_coherence)

0.297558943404296


In [None]:
# Load the prediction tweets that already carry a Sentiment column.
dataset_prediksi_topik = pd.read_csv(
    r"/content/dataset_prediksi_with_sentiment.csv"
)

In [None]:
# Attach each tweet's hard topic label. Column assignment aligns on the index,
# so a row-count mismatch would silently produce NaN topics or drop labels.
# Fail loudly instead of exporting mis-joined data.
if len(dataset_prediksi_topik) != len(docs1):
    raise ValueError(
        f"Row count mismatch: {len(dataset_prediksi_topik)} prediction rows "
        f"vs {len(docs1)} clustered documents"
    )
dataset_prediksi_topik['Topic']=docs1['Topic']

In [None]:
# Display the prediction frame, which now includes the Topic column.
dataset_prediksi_topik

Unnamed: 0.1,Unnamed: 0,tweets,Sentiment,Topic
0,0,3 mind-blowing chat hack small youtubers ! ! t...,0,3
1,1,expert exploring system leverage large languag...,0,7
2,2,digital marketing practice promoting product s...,0,3
3,3,( new york post ) : buzzfeed stock surge plan ...,0,11
4,4,"hey , look made",0,5
...,...,...,...,...
9988,9995,"anonymizing user conscious choice , case legit...",1,16
9989,9996,great video kamikazecash ugly truth,1,19
9990,9997,chatbots transform teaching ? super interestin...,1,1
9991,9998,rather trying answer every question itselfsome...,0,7


In [None]:
from google.colab import files

# Export the frame that actually carries the Topic column
# (dataset_prediksi_topik); the original wrote dataset_prediksi, which lacks it.
# index=False avoids adding yet another unnamed index column on top of the
# 'Unnamed: 0' / 'Unnamed: 0.1' columns already present in this frame.
dataset_prediksi_topik.to_csv('dataset_prediksi_topik.csv', encoding = 'utf-8-sig', index = False)
files.download('dataset_prediksi_topik.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>