# 1. Load Data

In [1]:
import pandas as pd

df = pd.read_csv("Twitter_Emotion_Dataset.csv")
df.head()

Unnamed: 0,label,tweet
0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu..."
1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi..."
2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...
3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng..."
4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata..."


# 2. Preprocessing

a. Casefolding

b. Hapus username, hashtag, url, dll

c. Hapus tandabaca, ganti spasi

d. Normalisasi kata -> kamus normalisasi, bikinan sendiri

e. [optional] hapus stop words

f. hapus sesuai konteks

In [2]:
import re,string
import pandas as pd
from stopwords_id import stop_words

# membaca file normalisasi
df_norm = pd.read_csv("normalisasi.txt")
# membuat kamus normalisasi (dictionary)
df_kamus = {}
for dt in df_norm.itertuples():
  df_kamus[dt[1]] = dt[2]

# kata-kata yang harus dihapus
word_to_remove = ['username','url']

def preprocess(row):
  # casefolding
  row['tweet'] = row['tweet'].lower()

  # hapus menghapus
  row['tweet'] = re.sub(r"(?:\@|#|\d)\S+","",row['tweet'])

  # ganti tanda baca jadi spasi
  row['tweet'] = row['tweet'].translate(str.maketrans(string.punctuation,' '*len(string.punctuation)))

  # normalisasi kata
  row['tweet'] = ' '.join([df_kamus[a] if a in df_kamus else a for a in row['tweet'].split()])

  # hapus stop words
  row['tweet'] = ' '.join([a for a in row['tweet'].split() if a not in stop_words()])

  # hapus kata tertentu
  row['tweet'] = ' '.join([a for a in row['tweet'].split() if a not in word_to_remove])


  return row

df_preprocess = df.apply(preprocess,axis=1)
df_preprocess.head()

Unnamed: 0,label,tweet
0,anger,jalan jatibaru polisi gertak gubernur emangny ...
1,anger,cewe lho kayaknya rasain sibuk jaga rasain sak...
2,happy,kepingin gudeg mbarek bu hj amad foto google s...
3,anger,jalan jatibaru wilayah tn abang pengaturan wil...
4,happy,sharing pengalaman aja kemarin jam batalin tik...


# 3. Most frequent words

In [3]:
from collections import Counter

total_kata = []
for dt in df_preprocess['tweet']:
  total_kata.extend(dt.split())

Counter(total_kata).most_common()

[('aja', 633),
 ('sayang', 464),
 ('cinta', 423),
 ('takut', 406),
 ('banget', 344),
 ('sih', 339),
 ('gitu', 276),
 ('tau', 268),
 ('gw', 268),
 ('ku', 259),
 ('pas', 257),
 ('suka', 248),
 ('bikin', 228),
 ('anak', 224),
 ('hati', 213),
 ('gua', 213),
 ('salah', 207),
 ('bgt', 201),
 ('si', 197),
 ('sampe', 196),
 ('semoga', 183),
 ('allah', 181),
 ('kasih', 168),
 ('kau', 167),
 ('pake', 163),
 ('waktu', 156),
 ('tuh', 154),
 ('emang', 153),
 ('jd', 152),
 ('biar', 150),
 ('bilang', 148),
 ('makan', 145),
 ('hidup', 143),
 ('jam', 141),
 ('temen', 141),
 ('gimana', 139),
 ('jalan', 138),
 ('jg', 138),
 ('udh', 138),
 ('org', 137),
 ('karna', 134),
 ('deh', 131),
 ('rumah', 129),
 ('lu', 128),
 ('sakit', 128),
 ('krn', 128),
 ('gini', 127),
 ('liat', 126),
 ('sm', 120),
 ('klo', 120),
 ('pengen', 118),
 ('tahun', 117),
 ('nih', 117),
 ('kayak', 114),
 ('lg', 111),
 ('iya', 111),
 ('pulang', 110),
 ('selamat', 110),
 ('indonesia', 109),
 ('dgn', 108),
 ('jatuh', 107),
 ('dr', 104),
 (

# 4. Topic Extraction

LSA,PLSA,LDA,CTM, dll

menggunakan LDA

In [6]:
pip install -U gensim

Collecting gensim
  Using cached gensim-3.8.3-cp38-cp38-win_amd64.whl (24.2 MB)
Collecting Cython==0.29.14
  Using cached Cython-0.29.14-cp38-cp38-win_amd64.whl (1.7 MB)
Installing collected packages: Cython, gensim
  Attempting uninstall: Cython
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an EnvironmentError: [WinError 5] Access is denied: 'c:\\programdata\\anaconda3\\lib\\site-packages\\__pycache__\\cython.cpython-38.pyc'
Consider using the `--user` option or check the permissions.



    Found existing installation: Cython 0.29.21
    Uninstalling Cython-0.29.21:


In [7]:
from gensim import corpora,models

total_kata = []
for dt in df_preprocess['tweet']:
  total_kata.append(dt.split())

# membuat dictionary kata (membuat nomor per kata)
dictionary = corpora.Dictionary(total_kata)

# membuat term frequency list
tf = [dictionary.doc2bow(a) for a in total_kata]

# membuat model LDA
lda = models.ldamodel.LdaModel(tf,num_topics=3,id2word=dictionary,random_state=123)

# tampilkan hasilnya
lda.top_topics(tf,topn=5)

[([(0.0088605, 'gua'),
   (0.007966899, 'aja'),
   (0.0069951024, 'banget'),
   (0.004868351, 'sih'),
   (0.0045054546, 'tau')],
  -2.250871998028218),
 ([(0.014940374, 'takut'),
   (0.008761115, 'aja'),
   (0.0063382178, 'cinta'),
   (0.005823795, 'sayang'),
   (0.005471642, 'gitu')],
  -2.9361122777166613),
 ([(0.01662363, 'cinta'),
   (0.014535222, 'sayang'),
   (0.0047626914, 'ku'),
   (0.0044132466, 'kasih'),
   (0.004263324, 'takut')],
  -3.0323482244529787)]

# 5. Classification

## 5.1. Split the data

training 
testing

In [8]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(df_preprocess['tweet'],df_preprocess['label'],test_size=0.25,random_state=123)

## 5.2. feature extraction (vectorization)

menggunakan TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(x_train)
x_train_vec

<3300x14845 sparse matrix of type '<class 'numpy.float64'>'
	with 48876 stored elements in Compressed Sparse Row format>

## 5.3. Training

In [10]:
from sklearn.naive_bayes import MultinomialNB

cls = MultinomialNB()
cls.fit(x_train_vec,y_train)

MultinomialNB()

## 5.4. Testing

In [11]:
x_test_vec = vectorizer.transform(x_test)
labelpred = cls.predict(x_test_vec)

## 5.5. Model Evaluation

In [12]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

acc = accuracy_score(y_test,labelpred)
rep = classification_report(y_test,labelpred)
conf = confusion_matrix(y_test,labelpred)

print(acc)
print(rep)
print(conf)

0.5885558583106267
              precision    recall  f1-score   support

       anger       0.55      0.84      0.66       274
        fear       0.90      0.35      0.50       150
       happy       0.64      0.60      0.62       274
        love       0.86      0.49      0.62       154
     sadness       0.46      0.51      0.48       249

    accuracy                           0.59      1101
   macro avg       0.68      0.56      0.58      1101
weighted avg       0.64      0.59      0.58      1101

[[229   2  16   0  27]
 [ 46  52  15   4  33]
 [ 53   1 165   2  53]
 [ 10   1  29  75  39]
 [ 82   2  32   6 127]]


## 5.6. Predicting

In [13]:
text = "saya marah sekali jika tidak bisa makan"

text_vec = vectorizer.transform([text])
cls.predict(text_vec)

array(['anger'], dtype='<U7')