In [1]:
import pandas as pd
import numpy as np
import json
import nltk
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.utils import compute_class_weight
from tensorflow.keras.models import Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
import random
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

In [2]:
# Read all three sheets in a single pass so the workbook is opened and parsed
# once instead of three times. Passing a list to `sheet_name` returns a dict of
# DataFrames keyed by sheet name.
DATASET_PATH = 'dataset.xlsx'
dataset_sheets = pd.read_excel(DATASET_PATH, sheet_name=['main', 'test', 'Doni'])

df_train = dataset_sheets['main']          # labelled training utterances
df_test = dataset_sheets['test']           # labelled hold-out utterances
df_user_response = dataset_sheets['Doni']  # canned bot replies per intent

In [3]:
# Preview the first rows of the canned-response sheet (columns: response, intent).
df_user_response.head()

Unnamed: 0,response,intent
0,"Hi, senang bertemu denganmu",CasualGreeting
1,halo juga,CasualGreeting
2,Hai,CasualGreeting
3,halo,CasualGreeting
4,Hai juga,CasualGreeting


In [4]:
# Build a lookup from each intent to every canned reply recorded for it,
# keeping intents in their order of first appearance in the sheet.
list_intents = df_user_response['intent'].unique().tolist()
dict_response = {
    intent: df_user_response.loc[df_user_response['intent'] == intent, 'response'].values.tolist()
    for intent in list_intents
}
    

## Preprocessing

In [5]:
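# Preprocessing steps, in the order applied by preprocessing():
# 1. lowercase
# 2. remove digits
# 3. remove punctuation / symbols
# 4. replace slang words with their formal form
# 5. remove stopwords
# 6. stemming
# (tokenization is currently disabled; the function returns a single string)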

In [6]:
# Lazily-populated cache of (stemmer, stopwords, slang->formal map). Building
# the stemmer and parsing both JSON files is loop-invariant work, so it is done
# once instead of once per row / per chat message. Re-running this cell resets
# the cache, which picks up edited word lists.
_PREPROCESSING_RESOURCES = None


def _load_preprocessing_resources():
    """Create the stemmer and load the word lists on first use, then reuse them.

    Returns:
        tuple: ``(stemmer, stopwords, slang_to_formal)`` where ``stopwords`` is
        a frozenset built from ``stopwords.json`` and ``slang_to_formal`` maps
        each value of ``slangwords.json['slang']`` to the value at the same
        position in ``slangwords.json['formal']``.
    """
    global _PREPROCESSING_RESOURCES
    if _PREPROCESSING_RESOURCES is None:
        stemmer = StemmerFactory().create_stemmer()

        # Explicit UTF-8 so the result does not depend on the platform's
        # default text encoding.
        with open('stopwords.json', encoding='utf-8') as f:
            stopwords = frozenset(json.load(f))

        with open('slangwords.json', encoding='utf-8') as f:
            slangwords = json.load(f)

        # Pair slang and formal entries by position. setdefault keeps the first
        # mapping when a slang word is listed more than once, matching the
        # semantics of a first-match list lookup.
        slang_to_formal = {}
        for slang, formal in zip(slangwords['slang'].values(),
                                 slangwords['formal'].values()):
            slang_to_formal.setdefault(slang, formal)

        _PREPROCESSING_RESOURCES = (stemmer, stopwords, slang_to_formal)
    return _PREPROCESSING_RESOURCES


def preprocessing(text):
    """Normalize one utterance into a cleaned, stemmed, space-separated string.

    Steps, in order: lowercase, replace digits and punctuation with spaces,
    replace slang tokens with their formal form, drop stopwords, stem.

    Args:
        text: The raw utterance. ``None`` and NaN (e.g. an empty spreadsheet
            cell) are treated as an empty string; any other non-string value
            is converted with ``str()``.

    Returns:
        str: The processed text as returned by the stemmer.
    """
    stemmer, stopwords, slang_to_formal = _load_preprocessing_resources()

    # Guard against missing / non-text cells instead of failing on .lower().
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    # lowercase
    text = text.lower()

    # replace digits and punctuation (including the typographic apostrophe)
    # with spaces so adjacent words stay separated
    removable = string.digits + string.punctuation + '’'
    text = text.translate(str.maketrans(removable, ' ' * len(removable)))

    # repair slang words
    tokens = [slang_to_formal.get(token, token) for token in text.split()]

    # A formal replacement may consist of several words, so re-split before
    # filtering to let the stopword check see each word individually.
    tokens = [word for word in " ".join(tokens).split() if word not in stopwords]

    # stemming
    return stemmer.stem(" ".join(tokens))

In [7]:
# Run the shared cleaning pipeline over the raw text of both splits and keep
# the result next to the original column.
for split_frame in (df_train, df_test):
    split_frame['preprocessed'] = split_frame['text'].apply(preprocessing)

In [8]:
# Learn the vocabulary and IDF weights from the training utterances only.
tfidf_vectorizer = TfidfVectorizer().fit(df_train['preprocessed'])

# Persist the fitted vectorizer so it can be reloaded outside this notebook.
with open('tfidf.pickle', 'wb') as tfidf:
    pickle.dump(tfidf_vectorizer, tfidf)

# Project both splits into the same TF-IDF feature space.
doc_vec_train, doc_vec_test = (
    tfidf_vectorizer.transform(frame['preprocessed'])
    for frame in (df_train, df_test)
)

In [9]:
# Term-by-document view of each TF-IDF matrix: one row per vocabulary term,
# one column per utterance.
feature_names = tfidf_vectorizer.get_feature_names_out()

df2_train = pd.DataFrame(doc_vec_train.toarray().T, index=feature_names)
df2_test = pd.DataFrame(doc_vec_test.toarray().T, index=feature_names)

In [10]:
# Encoder used to turn intent names into integer class ids; it is fitted on
# the training intents in the following cell.
le = LabelEncoder()

In [11]:
# Fit on the training intents only (fit() returns the encoder itself, so no
# re-assignment is needed).
le.fit(df_train['intent'])

# Persist the fitted encoder alongside the other artifacts.
with open('label_encoder.pickle', 'wb') as label_encoder:
    pickle.dump(le, label_encoder)

# Integer targets for both splits.
y_train = le.transform(df_train['intent'])
y_test = le.transform(df_test['intent'])

In [12]:
# Flip back to one row per utterance and one column per vocabulary term,
# which is the layout the classifier is trained on.
X_train = df2_train.T
X_test = df2_test.T

In [13]:
# Preview the first test rows of the TF-IDF feature matrix (one column per term).
X_test.head()

Unnamed: 0,assalamualaikum,bye,domisili,favorite,halo,hello,hi,hobi,jumpa,kenal,makan,malam,mana,nama,pagi,rumah,selamat,siang,sih,sore,suka,tau,temu,there,tinggal,wabarakatuh,warahmatullahi,wei,wey,woi,woy,you
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.660211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.63804,0.0,0.0,0.770003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Report the dimensions of both feature matrices.
train_rows, train_cols = X_train.shape
test_rows, test_cols = X_test.shape

print(f'the number of rows of training data: {train_rows}, the number of columns of training data: {train_cols}')
print(f'the number of rows of testing data: {test_rows}, the number of columns of testing data: {test_cols}')

the number of rows of training data: 41, the number of columns of training data: 32
the number of rows of testing data: 11, the number of columns of testing data: 32


In [15]:
# Class-weighted random forest with a fixed seed; fit() returns the estimator,
# so construction and training are chained.
modelrf = RandomForestClassifier(
    random_state=10,
    class_weight='balanced',
).fit(X_train, y_train)

# Persist the trained model.
with open('model_rf.pickle', 'wb') as model:
    pickle.dump(modelrf, model)

In [16]:
# Predicted (integer-encoded) intent for every test utterance.
pred = modelrf.predict(X_test)

In [17]:
# Pin both the label ids and their display names to the encoder's classes.
# Without `labels`, the report only covers ids that occur in y_test/pred, so a
# test split that lacks an intent would leave `target_names` with a different
# length than the reported labels. `le.classes_` is the sorted set of training
# intents, i.e. the same names the previous expression recomputed from y_train.
print(classification_report(
    y_test,
    pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

                   precision    recall  f1-score   support

AfternoonGreeting       1.00      1.00      1.00         1
   CasualGreeting       1.00      1.00      1.00         1
  EveningGreeting       1.00      1.00      1.00         1
     FavoriteFood       1.00      1.00      1.00         1
          GoodBye       1.00      1.00      1.00         1
            Hobby       1.00      1.00      1.00         1
     Introduction       1.00      1.00      1.00         1
  IslamicGreeting       1.00      1.00      1.00         1
  MorningGreeting       1.00      1.00      1.00         1
    NightGreeting       1.00      1.00      1.00         1
        Residence       1.00      1.00      1.00         1

         accuracy                           1.00        11
        macro avg       1.00      1.00      1.00        11
     weighted avg       1.00      1.00      1.00        11



In [23]:
# Minimum predicted probability the top intent needs before the bot answers.
CONFIDENCE_THRESHOLD = 0.63
# Reply used when the bot cannot produce an answer.
FALLBACK_MESSAGE = 'Bot : Maaf saya tidak mengerti apa maksud kamu'

# Interactive chat: read a message, classify its intent, answer with a random
# canned reply for that intent. Ends on a "GoodBye" intent, on a low-confidence
# prediction, or when input is closed/interrupted.
while True:
    try:
        prediction_input = input('You : ')
    except (EOFError, KeyboardInterrupt):
        # stdin closed or the kernel was interrupted: leave the loop cleanly
        # instead of surfacing a traceback.
        break

    # Same cleaning pipeline as the training data.
    texts_p = [preprocessing(prediction_input)]
    tf_idf = tfidf_vectorizer.transform(texts_p)

    # The model was fitted on a DataFrame with named columns, so pass the
    # features with the same column names rather than a bare array.
    features = pd.DataFrame(
        tf_idf.toarray(),
        columns=tfidf_vectorizer.get_feature_names_out(),
    )
    output = modelrf.predict_proba(features)[0]
    output_index = np.argmax(output)

    if output[output_index] < CONFIDENCE_THRESHOLD:
        print(FALLBACK_MESSAGE)
        # NOTE(review): a low-confidence message ends the whole session; if the
        # bot should keep listening instead, this should be `continue`.
        break

    # predict_proba columns follow modelrf.classes_, so map the winning column
    # through the model's own class ids before decoding it to an intent name.
    response_tag = le.inverse_transform([modelrf.classes_[output_index]])[0]

    # An intent present in the training data may have no canned replies in the
    # responses sheet; fall back instead of raising KeyError/IndexError.
    responses = dict_response.get(response_tag)
    if responses:
        print("Bot : ", random.choice(responses))
    else:
        print(FALLBACK_MESSAGE)

    if response_tag == "GoodBye":
        break

You : kenalan dong
Bot :  kamu bisa panggil aku Doni
You : suka makan apa
Bot :  aku suka ayam goreng
You : ikan suka 
Bot : Maaf saya tidak mengerti apa maksud kamu
