In [44]:
import numpy as np
import pandas as pd

In [45]:
df = pd.read_csv('data/tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


## CaseFolding And Filtering

In [46]:
# lowercase all text
df['content'] = df['content'].str.lower()

#remove @usernames
df['content'] = df['content'].str.replace('@[^\s]+',repl='',regex=True)

#remove urls
df['content'] = df['content'].str.replace('http\S+|www.\S+',repl='',regex=True)

#remove non-ascii characters
df['content'] = df['content'].str.replace('[^\x00-\x7F]+',repl='',regex=True)

#remove numbers
df['content'] = df['content'].str.replace('\d+',repl='',regex=True)

#remove punctuation
df['content'] = df['content'].str.replace('[^\w\s]','',regex=True)

#save to csv
df.to_csv('data/tweet_emotions_clean.csv',index=False)

## Tokenizing

In [47]:
#tokenize library
from nltk.tokenize import word_tokenize

df = pd.read_csv('data/tweet_emotions_clean.csv')

#change to string
df['content'] = df['content'].astype(str)

#tokenize
df['content'] = df['content'].apply(word_tokenize)

#save to csv
df.to_csv('data/tweet_emotions_tokens.csv',index=False)

## Stemming

In [48]:
from nltk.stem import PorterStemmer

df = pd.read_csv('data/tweet_emotions_tokens.csv')

stemmer=PorterStemmer()

sentences = df['content'].tolist()

for i in range(len(sentences)):
    words=word_tokenize(sentences[i])
    #List comprehension
    words=[stemmer.stem(word) for word in words]
    sentences[i]=' '.join(words)

#merge back to dataframe
df['content'] = sentences

#save to csv
df.to_csv('data/tweet_emotions_stemmed.csv',index=False)

# Feature Extraction

In [49]:
df = pd.read_csv('data/tweet_emotions_stemmed.csv')

#INSPEKSI DATA 

# Cek Kelengkapan Data
print(df.info())
print('\n')

# Cek Statistik Deskriptif
print (df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB
None


           tweet_id
count  4.000000e+04
mean   1.845184e+09
std    1.188579e+08
min    1.693956e+09
25%    1.751431e+09
50%    1.855443e+09
75%    1.962781e+09
max    1.966441e+09


In [50]:
#mengubah data kategorikal menjadi numerik
new_labels = {
    'anger': 0,
    'boredom': 1,
    'empty': 2,
    'enthusiasm': 3,
    'fun': 4,
    'happiness': 5,
    'hate': 6,
    'love': 7,
    'neutral': 8,
    'relief': 9,
    'sadness': 10,
    'surprise': 11,
    'worry': 12
}

#Encode labels
df['sentiment'] = df['sentiment'].map(new_labels)

#Cek kembali data
print(df.head())

     tweet_id  sentiment                                            content
0  1956967341          2  [ ' i ' , 'know ' , ' i ' , 'wa ' , 'listenin ...
1  1956967666         10  [ 'layin ' , 'n ' , 'bed ' , 'with ' , ' a ' ,...
2  1956967696         10       [ 'funer ' , 'ceremonygloomi ' , 'friday ' ]
3  1956967789          3  [ 'want ' , 'to ' , 'hang ' , 'out ' , 'with '...
4  1956968416          8  [ 'we ' , 'want ' , 'to ' , 'trade ' , 'with '...


In [51]:
#Split data
x = df['content']
y = df['sentiment']

In [56]:
#Ekstraksi fitur
from sklearn.feature_extraction.text import CountVectorizer

#Split Data Training dan Data Testing
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)

#Mengubah data menjadi vektor
cv = CountVectorizer()
x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

In [57]:
#Training And Evaluating Model
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score,classification_report

#Multinomial Naive Bayes
model = MultinomialNB()

#Training
model.fit(x_train, y_train)

#prediksi data testing
y_pred_test = model.predict(x_test)

#prediksi data training
y_pred_train = model.predict(x_train)  

# Evaluasi akurasi data training
acc_train = accuracy_score(y_train, y_pred_train)

# Evaluasi akurasi data training
acc_test = accuracy_score(y_test, y_pred_test)

# Menghitung Precission
precission = precision_score(y_test, y_pred_test, average='weighted',zero_division=0)

print(classification_report(y_test, y_pred_test, target_names=new_labels.keys(),zero_division=0))

# Print hasil evaluasi
print(f'Hasil akurasi data train: {acc_train}')
print(f'Hasil akurasi data test: {acc_test}')
print(f'Hasil presisi data test: {precission}')

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        17
     boredom       0.00      0.00      0.00        30
       empty       0.00      0.00      0.00       174
  enthusiasm       0.00      0.00      0.00       153
         fun       0.00      0.00      0.00       347
   happiness       0.35      0.32      0.33      1019
        hate       0.60      0.01      0.02       259
        love       0.50      0.31      0.38       732
     neutral       0.35      0.42      0.38      1774
      relief       0.00      0.00      0.00       306
     sadness       0.31      0.14      0.20       995
    surprise       0.17      0.00      0.00       468
       worry       0.30      0.68      0.41      1726

    accuracy                           0.33      8000
   macro avg       0.20      0.15      0.13      8000
weighted avg       0.30      0.33      0.28      8000

Hasil akurasi data train: 0.512
Hasil akurasi data test: 0.3285
Hasil presisi d