In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
pd.options.mode.chained_assignment = None



In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV paths used
# by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load (at most) the first 5000 rows of the labelled dataset.
full_df = pd.read_csv("/content/drive/MyDrive/Text Classification 2023/DataKratomFinal1.csv", nrows=5000)
# Work on an explicit copy of the text column so the assignments in the
# following cells never write into (or warn about) a slice of full_df.
df = full_df[["abstrak"]].copy()
# fillna first: astype(str) alone would turn a missing abstract into the
# literal text "nan", which would then be tokenised and stemmed as a word.
df["abstrak"] = df["abstrak"].fillna("").astype(str)
full_df.head()

Unnamed: 0,judul,abstrak,label
0,Evaluation of toxicity profile of kratom (Mitr...,"Mitragyna speciosa Korth also known as kratom,...",dampak
1,Biosynthesis of kratom opioids,"Mitragynine, an analgesic alkaloid from the pl...",manfaat
2,Examining the Psychoactive Differences between...,Kratom (Mitragyna speciosa) is a Southeast Asi...,dampak
3,Description of Kratom Exposure Events in Wisco...,BACKGROUND: Consumption of kratom (Mitragyna s...,dampak
4,"Kratom Alkaloids, Cannabinoids, and Chronic Pa...",Introduction: Chronic neuropathic pain is as a...,manfaat


In [None]:
# Lower-case every abstract so the later stopword matching is case-insensitive.
abstrak_lower = df["abstrak"].str.lower()
df["abstrak"] = abstrak_lower
df.head()

Unnamed: 0,abstrak
0,"mitragyna speciosa korth also known as kratom,..."
1,"mitragynine, an analgesic alkaloid from the pl..."
2,kratom (mitragyna speciosa) is a southeast asi...
3,background: consumption of kratom (mitragyna s...
4,introduction: chronic neuropathic pain is as a...


In [None]:
# drop the new column created in last cell
#df.drop(["text_lower"], axis=1, inplace=True)

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """Return ``text`` with every character in ``PUNCT_TO_REMOVE`` deleted.

    Characters are dropped, not replaced by a space, so a hyphenated word
    such as "up-to-date" becomes "uptodate".
    """
    # Mapping a character to None tells str.translate to delete it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

# Strip punctuation from every abstract.
df["abstrak"] = df["abstrak"].apply(remove_punctuation)
df.head()

Unnamed: 0,abstrak
0,mitragyna speciosa korth also known as kratom ...
1,mitragynine an analgesic alkaloid from the pla...
2,kratom mitragyna speciosa is a southeast asian...
3,background consumption of kratom mitragyna spe...
4,introduction chronic neuropathic pain is as a ...


In [None]:
# ------ Special-character cleanup (runs before tokenizing) ---------

def remove_special(text):
    """Strip escape residue, non-ASCII characters, mentions, hashtags and URLs.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Whitespace runs are collapsed to single spaces
        before the final "http://" / "https://" replacement.
    """
    # Literal two-character sequences (a backslash followed by t, n or u)
    # become spaces; any backslash still left is dropped.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Drop non-ASCII characters (emoji, CJK, accented letters, ...).
    # 'ignore' deletes them; 'replace' would insert a '?' for each one.
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove @mentions, #hashtags and scheme://... links, then collapse whitespace.
    # Raw string: \w, \/ and \S are regex escapes, not Python string escapes.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)", " ", text).split())
    # Remove URL prefixes that had nothing after them (not matched above).
    return text.replace("http://", " ").replace("https://", " ")

#KRATOM_DATA['abstrak'] = KRATOM_DATA['abstrak'].apply(remove_tweet_special)

# Clean special characters, mentions and links out of every abstract.
df["abstrak"] = df["abstrak"].apply(remove_special)
df.head()

Unnamed: 0,abstrak
0,mitragyna speciosa korth also known as kratom ...
1,mitragynine an analgesic alkaloid from the pla...
2,kratom mitragyna speciosa is a southeast asian...
3,background consumption of kratom mitragyna spe...
4,introduction chronic neuropathic pain is as a ...


In [None]:
from nltk.corpus import stopwords
# Fetch the NLTK resources used by the cells below.
nltk.download('stopwords')  # word lists read through stopwords.words('english')
nltk.download('wordnet')  # NOTE(review): no visible cell uses WordNet - confirm this download is still needed
nltk.download('punkt')  # presumably the tokenizer model behind word_tokenize - TODO confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated word of ``text`` that is in ``STOPWORDS``.

    The surviving words are re-joined with single spaces.
    """
    kept = filter(lambda word: word not in STOPWORDS, str(text).split())
    return " ".join(kept)

df["text_wo_stop"] = df["abstrak"].apply(remove_stopwords)
df.head()

Unnamed: 0,abstrak,text_wo_stop
0,mitragyna speciosa korth also known as kratom ...,mitragyna speciosa korth also known kratom her...
1,mitragynine an analgesic alkaloid from the pla...,mitragynine analgesic alkaloid plant mitragyna...
2,kratom mitragyna speciosa is a southeast asian...,kratom mitragyna speciosa southeast asian plan...
3,background consumption of kratom mitragyna spe...,background consumption kratom mitragyna specio...
4,introduction chronic neuropathic pain is as a ...,introduction chronic neuropathic pain severe d...


In [None]:
from collections import Counter
# Tally how often each remaining word occurs across all abstracts.
cnt = Counter()
for abstract in df["text_wo_stop"].values:
    cnt.update(abstract.split())

cnt.most_common(10)

[('kratom', 1860),
 ('use', 743),
 ('mitragynine', 719),
 ('effects', 530),
 ('speciosa', 433),
 ('opioid', 424),
 ('mitragyna', 337),
 ('used', 307),
 ('withdrawal', 286),
 ('study', 260)]

In [None]:
from nltk.tokenize import word_tokenize
def word_tokenize_wrapper(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens

df["text_wrapped"] = df["text_wo_stop"].apply(word_tokenize_wrapper)
df.head()

Unnamed: 0,abstrak,text_wo_stop,text_wrapped
0,mitragyna speciosa korth also known as kratom ...,mitragyna speciosa korth also known kratom her...,"[mitragyna, speciosa, korth, also, known, krat..."
1,mitragynine an analgesic alkaloid from the pla...,mitragynine analgesic alkaloid plant mitragyna...,"[mitragynine, analgesic, alkaloid, plant, mitr..."
2,kratom mitragyna speciosa is a southeast asian...,kratom mitragyna speciosa southeast asian plan...,"[kratom, mitragyna, speciosa, southeast, asian..."
3,background consumption of kratom mitragyna spe...,background consumption kratom mitragyna specio...,"[background, consumption, kratom, mitragyna, s..."
4,introduction chronic neuropathic pain is as a ...,introduction chronic neuropathic pain severe d...,"[introduction, chronic, neuropathic, pain, sev..."


In [None]:
from nltk.corpus import stopwords

list_stopwords = stopwords.words('english')

# Domain-specific words to drop, written the way they appear in the source
# abstracts (mixed case, hyphens, and one pasted sentence of species names).
custom_stopwords = ['mitragyna','speciosa','use','used','use','method','results','plant',
                    'discussion','conclusion','background','methods','introduction',
                    'southeast','asian','PubMed','electronic','database','aims','kratom',
                    'main','findings','key','ETHNOPHARMACOLOGICAL','RELEVANCE','materials',
                    'botanical','natural','product','coffee','plants','medicinal','purpose',
                    'review','products','recent','summary','up-to-date','worldwide','paper',
                    'thai','native','breed','evergreen','native','case','representation',
                    'year','old','thai','thailand','man','friend','coca-cola','psychoactive',
                    'tree','genus','tea','objectives','FDA','food','drug','association',
                    'individual','family','society','powder','capsule','US','United','States',
                    'logistic','regression','model','study','Rubiaceae','ketum','Peninsular',
                    'Malaysia','data','sources','Areca catechu L., Argemone Mexicana L., Citrus aurantium L., Eurycoma longifolia Jack., Lepidium meyenii Walp., Mitragyna speciosa Korth., Panax ginseng C. A. Mey, Panax quinquefolius L., Pausinystalia johimbe (K. Schum.) Pierre ex Beille, Piper methysticum G. Forst., Ptychopetalum olacoides Benth., Sceletium tortuosum (L.) N. E. Brown, Turnera diffusa Willd. ex. Schult., Voacanga africana Stapf ex Scott-Elliot, and Withania somnifera (L.) Dunal']

# The tokens these words are compared against come from text that was already
# lower-cased and had its punctuation deleted, so each entry gets the same
# treatment here. Otherwise 'PubMed', 'up-to-date' or the multi-word species
# sentence could never equal any token. Multi-word entries are split so every
# word becomes its own stopword.
_strip_punct = str.maketrans('', '', string.punctuation)
for entry in custom_stopwords:
    list_stopwords.extend(entry.lower().translate(_strip_punct).split())

# convert list to a set: removes duplicates and gives fast membership tests
list_stopwords = set(list_stopwords)


# Remove stopwords from a list of tokens
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    The order of the surviving tokens is preserved.
    """
    kept = []
    for token in words:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

df["text_stopword"] = df["text_wrapped"].apply(stopwords_removal)
df.head()

Unnamed: 0,abstrak,text_wo_stop,text_wrapped,text_stopword
0,mitragyna speciosa korth also known as kratom ...,mitragyna speciosa korth also known kratom her...,"[mitragyna, speciosa, korth, also, known, krat...","[korth, also, known, herbal, preparation, ther..."
1,mitragynine an analgesic alkaloid from the pla...,mitragynine analgesic alkaloid plant mitragyna...,"[mitragynine, analgesic, alkaloid, plant, mitr...","[mitragynine, analgesic, alkaloid, offers, saf..."
2,kratom mitragyna speciosa is a southeast asian...,kratom mitragyna speciosa southeast asian plan...,"[kratom, mitragyna, speciosa, southeast, asian...","[containing, various, alkaloids, induce, pharm..."
3,background consumption of kratom mitragyna spe...,background consumption kratom mitragyna specio...,"[background, consumption, kratom, mitragyna, s...","[consumption, herbal, substance, result, adver..."
4,introduction chronic neuropathic pain is as a ...,introduction chronic neuropathic pain severe d...,"[introduction, chronic, neuropathic, pain, sev...","[chronic, neuropathic, pain, severe, detriment..."


In [None]:
def untokenize(document):
    """Join a list of tokens back into one string and tidy the spacing.

    Tokens are joined with single spaces, then spacing around quote marks,
    parentheses, punctuation runs and contractions ("n't", "can not") is
    tightened. The result is stripped of leading/trailing whitespace.
    """
    joined = ' '.join(document)

    # Quote marks, ellipses and parentheses (literal replacements, in order).
    for old, new in (
        ("`` ", '"'),
        (" ''", '"'),
        ('. . .', '...'),
        (" ( ", " ("),
        (" ) ", ") "),
    ):
        joined = joined.replace(old, new)

    # Pull punctuation runs back onto the preceding word, mid-text and at the end.
    joined = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", joined)
    joined = re.sub(r' ([.,:;?!%]+)$', r"\1", joined)

    # Apostrophes, contractions and stray backticks (literal replacements, in order).
    for old, new in (
        (" '", "'"),
        (" n't", "n't"),
        ("can not", "cannot"),
        (" ` ", " '"),
    ):
        joined = joined.replace(old, new)

    return joined.strip()

# Turn each filtered token list back into a single string.
untokenized = df["text_stopword"].apply(untokenize)
df["text_untokenize"] = untokenized
df.head()



Unnamed: 0,abstrak,text_wo_stop,text_wrapped,text_stopword,text_untokenize
0,mitragyna speciosa korth also known as kratom ...,mitragyna speciosa korth also known kratom her...,"[mitragyna, speciosa, korth, also, known, krat...","[korth, also, known, herbal, preparation, ther...",korth also known herbal preparation therapeuti...
1,mitragynine an analgesic alkaloid from the pla...,mitragynine analgesic alkaloid plant mitragyna...,"[mitragynine, analgesic, alkaloid, plant, mitr...","[mitragynine, analgesic, alkaloid, offers, saf...",mitragynine analgesic alkaloid offers safer al...
2,kratom mitragyna speciosa is a southeast asian...,kratom mitragyna speciosa southeast asian plan...,"[kratom, mitragyna, speciosa, southeast, asian...","[containing, various, alkaloids, induce, pharm...",containing various alkaloids induce pharmacolo...
3,background consumption of kratom mitragyna spe...,background consumption kratom mitragyna specio...,"[background, consumption, kratom, mitragyna, s...","[consumption, herbal, substance, result, adver...",consumption herbal substance result adverse he...
4,introduction chronic neuropathic pain is as a ...,introduction chronic neuropathic pain severe d...,"[introduction, chronic, neuropathic, pain, sev...","[chronic, neuropathic, pain, severe, detriment...",chronic neuropathic pain severe detriment over...


In [None]:
#Stemming

from nltk.stem.porter import PorterStemmer

# Drop the two columns
#df.drop(["text_wo_stopfreq", "text_wo_stopfreqrare"], axis=1, inplace=True)

stemmer = PorterStemmer()
def stem_words(text):
    """Stem every whitespace-separated word of ``text`` with ``stemmer``.

    The stems are re-joined with single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

df["text_normal"] = df["text_untokenize"].apply(stem_words)
df.head()

Unnamed: 0,abstrak,text_wo_stop,text_wrapped,text_stopword,text_untokenize,text_stemmed,text_normal
0,mitragyna speciosa korth also known as kratom ...,mitragyna speciosa korth also known kratom her...,"[mitragyna, speciosa, korth, also, known, krat...","[korth, also, known, herbal, preparation, ther...",korth also known herbal preparation therapeuti...,korth also known herbal prepar therapeut prope...,korth also known herbal prepar therapeut prope...
1,mitragynine an analgesic alkaloid from the pla...,mitragynine analgesic alkaloid plant mitragyna...,"[mitragynine, analgesic, alkaloid, plant, mitr...","[mitragynine, analgesic, alkaloid, offers, saf...",mitragynine analgesic alkaloid offers safer al...,mitragynin analges alkaloid offer safer altern...,mitragynin analges alkaloid offer safer altern...
2,kratom mitragyna speciosa is a southeast asian...,kratom mitragyna speciosa southeast asian plan...,"[kratom, mitragyna, speciosa, southeast, asian...","[containing, various, alkaloids, induce, pharm...",containing various alkaloids induce pharmacolo...,contain variou alkaloid induc pharmacolog effe...,contain variou alkaloid induc pharmacolog effe...
3,background consumption of kratom mitragyna spe...,background consumption kratom mitragyna specio...,"[background, consumption, kratom, mitragyna, s...","[consumption, herbal, substance, result, adver...",consumption herbal substance result adverse he...,consumpt herbal substanc result advers health ...,consumpt herbal substanc result advers health ...
4,introduction chronic neuropathic pain is as a ...,introduction chronic neuropathic pain severe d...,"[introduction, chronic, neuropathic, pain, sev...","[chronic, neuropathic, pain, severe, detriment...",chronic neuropathic pain severe detriment over...,chronic neuropath pain sever detriment overal ...,chronic neuropath pain sever detriment overal ...


In [None]:
import shutil

# Save the stemmed column under its own file name.
# Do NOT reuse "DataKratomFinal1.csv" here: that is the input file this
# notebook reads from the same Drive folder. Copying a file with only the
# text_normal column over it wipes out judul/abstrak/label and makes the
# notebook fail on the next run.
namaFile = "DataKratomFinal1_text_normal.csv"
df['text_normal'].to_csv(namaFile)
original = namaFile
target = r'/content/drive/MyDrive/Text Classification 2023/'+namaFile
# copyfile returns the destination path, which is displayed as the cell output.
shutil.copyfile(original, target)

'/content/drive/MyDrive/Text Classification 2023/DataKratomFinal1.csv'

In [None]:
namaFile = "DataKratomFinal4.csv"
# Attach the stemmed text to the frame loaded at the top of the notebook
# instead of re-reading the CSV from Drive. The in-memory frame still holds
# the original columns (judul, abstrak, label) whatever has since happened to
# the file on Drive, and df shares its index, so rows line up.
full_df = full_df.assign(text_normal=df['text_normal'])
full_df.to_csv(namaFile)
original = namaFile
target = r'/content/drive/MyDrive/Text Classification 2023/'+namaFile
# copyfile returns the destination path, which is displayed as the cell output.
shutil.copyfile(original, target)

'/content/drive/MyDrive/Text Classification 2023/DataKratomFinal4.csv'