In [11]:
import re
import emoji
import string
import unicodedata
import nltk
from underthesea import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from collections import defaultdict
from vncorenlp import VnCoreNLP


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/quytien/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/quytien/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/quytien/nltk_data...


True

### 1. Lo·∫°i b·ªè k√Ω t·ª± ƒë·∫∑c bi·ªát, d·∫•u c√¢u, emoji 
#### *- Lo·∫°i b·ªè k√Ω t·ª± ƒë·∫∑c bi·ªát kh√¥ng xo√° c√°c k√Ω t·ª± ti·∫øng Vi·ªát (Unicode property): **r"[^\w\s√Ä-·ªπ]"*** <br /> *- Lo·∫°i b·ªè to√†n b·ªô k√Ω t·ª± ƒë·∫∑c bi·ªát, ch·ªâ gi·ªØ l·∫°i c√°c k√Ω t·ª± ASCII (ti·∫øng Anh kh√¥ng d·∫•u): **r"[^a-zA-Z0-9]"***

In [7]:
txt = "Ch·ªâ c·∫ß$n ki√™n nh@·∫´n r·ªìi m·ªçi t.h·ª© s·∫Ω ƒë·∫øn."

new_string = re.sub(r"[^\w\s√Ä-·ªπ]","",txt)
print(new_string)

Ch·ªâ c·∫ßn ki√™n nh·∫´n r·ªìi m·ªçi th·ª© s·∫Ω ƒë·∫øn


#### Lo·∫°i b·ªè d·∫•u c√¢u

In [15]:
text = "Cu·ªôc, s·ªëng kh√¥ng? ph·∫£i l√∫c n√†o; c≈©ng nh∆∞ m√¨nh: mu·ªën!"

new_string = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
print(new_string)

Cu·ªôc s·ªëng kh√¥ng ph·∫£i l√∫c n√†o c≈©ng nh∆∞ m√¨nh mu·ªën


#### Lo·∫°i b·ªè emoji

In [20]:
text = "H√¥m nay tr·ªùi ƒë·∫πp qu√°üòçüåû! ƒêi ch∆°i th√¥i üèñÔ∏èüöó"

new_string = emoji.replace_emoji(text, replace='')
print(new_string)

H√¥m nay tr·ªùi ƒë·∫πp qu√°! ƒêi ch∆°i th√¥i 


### 2.	Chu·∫©n h√≥a v√† t√°ch t·ª´ ti·∫øng Vi·ªát
#### T√°ch t·ª´ s·ª≠ d·ª•ng th∆∞ vi·ªán underthesea (h·ªó tr·ª£ version 10, 11)

In [2]:
txt = "V√† em lu√¥n tin sau c∆°n m∆∞a c·∫ßu v·ªìng s·∫Ω l·∫•p l√°nh."

tokens = word_tokenize(txt, format="text")
print(tokens)

V√† em lu√¥n tin sau c∆°n m∆∞a c·∫ßu v·ªìng s·∫Ω l·∫•p_l√°nh .


#### T√°ch t·ª´ s·ª≠ d·ª•ng th∆∞ vi·ªán VnCoreNLP

In [8]:
vncorenlp = VnCoreNLP(
    "/Users/quytien/VnCoreNLP/VnCoreNLP-1.2.jar", 
    annotators="wseg", 
    max_heap_size='-Xmx2g'
)
res = vncorenlp.tokenize(txt)
print(res)

[['V√†', 'em', 'lu√¥n', 'tin', 'sau', 'c∆°n', 'm∆∞a', 'c·∫ßu_v·ªìng', 's·∫Ω', 'l·∫•p_l√°nh', '.']]


#### Chu·∫©n ho√°

In [8]:
txt_1 = "V√†               em lu√¥n tin sau c∆°n m∆∞a c·∫ßu v·ªìng s·∫Ω l·∫•p l√°nh."
# Chu·∫©n h√≥a Unicode
txt_1 = unicodedata.normalize('NFC', txt_1)

# Chuy·ªÉn v·ªÅ ch·ªØ th∆∞·ªùng
txt_1 = txt_1.lower()

# Lo·∫°i b·ªè kho·∫£ng tr·∫Øng d∆∞ th·ª´a
txt_1 = re.sub(r"\s+", " ", txt_1).strip()
print(txt_1)

v√† em lu√¥n tin sau c∆°n m∆∞a c·∫ßu v·ªìng s·∫Ω l·∫•p l√°nh.


### 3.	Lo·∫°i b·ªè stopwords v√† stemming/lemmatization
#### Lo·∫°i b·ªè stopwords

Th∆∞ vi·ªán nltk ch∆∞a h·ªó tr·ª£ ti·∫øng Vi·ªát

In [15]:
english_stopwords = stopwords.words('english')
print(english_stopwords)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [12]:
def load_vietnamese_stopwords(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return set(line.strip().lower() for line in f if line.strip())

vietnamese_stopwords = load_vietnamese_stopwords("vietnamese-stopwords.txt")

In [18]:
text_stopwords = "Ta t·ª´ng b·∫Øt g·∫∑p nhau ·ªü kh·∫Øp S√†i G√≤n ch·∫Øc l√† l√∫c c√≤n y√™u d√π mu·ªën tr√°nh c≈©ng kh√≥"
# T√°ch t·ª´
tokens_stopwords = word_tokenize(text_stopwords, format="text").split()
# L·ªçc stopwords
filtered_tokens = [word for word in tokens_stopwords if word.lower() not in vietnamese_stopwords]
print(filtered_tokens)

['Ta', 'b·∫Øt_g·∫∑p', 'kh·∫Øp', 'S√†i_G√≤n', 'y√™u']


#### Stemming/ Lemmatization

#### Stemming

In [8]:
stemmer = PorterStemmer()
input_str = "There are several types of stemming algorithms."
input_str = nltk.word_tokenize(input_str)
for word in input_str:
    print(stemmer.stem(word))

there
are
sever
type
of
stem
algorithm
.


#### Lemmatization

In [12]:
lemmatizer = WordNetLemmatizer()
input_str = "been had done languages cities mice"
input_str = nltk.word_tokenize(input_str)

for word in input_str:
    print(lemmatizer.lemmatize(word))

been
had
done
language
city
mouse


### 4.	X√¢y d·ª±ng b·ªô t·ª´ ƒëi·ªÉn n-gram t·ª´ vƒÉn b·∫£n

In [12]:
def build_ngram(text, n=1):
    text
    tokens = word_tokenize(text.lower(), format="text").split()
    ngrams = defaultdict(int)
    for i in range (len(tokens) - n + 1):
        ngram = ' '.join(tokens[i:i+n])
        ngrams[ngram] += 1
    return dict(ngrams)

text = "Go, go, go, go! Don‚Äôt stop!"
bigram_dict = build_ngram(text, 1)

print(bigram_dict)

{'go': 4, ',': 3, '!': 2, 'don‚Äôt': 1, 'stop': 1}
