In [1]:
# -*- coding: utf-8 -*-
from underthesea import word_tokenize
import re
import VietnameseTextNormalizer.ReleasePython3.VietnameseTextNormalizer as nm
import pandas as pd
import numpy as np

In [2]:
def removeConsecutiveDuplicates(S):
    """Collapse every run of identical consecutive characters into one.

    Trailing whitespace is stripped before collapsing, so ``"hellooo  "``
    becomes ``"helo"``.

    Args:
        S: The input string.

    Returns:
        str: ``S`` without trailing whitespace and with each run of repeated
        characters reduced to a single character. An empty string is returned
        when ``S`` is empty or contains only whitespace.
    """
    chars = S.rstrip()

    # Guard the empty case explicitly: the previous implementation indexed
    # S[0] here and raised IndexError for "" or whitespace-only input.
    if not chars:
        return ""

    collapsed = [chars[0]]
    for ch in chars[1:]:
        # Keep a character only when it differs from the last one kept.
        if ch != collapsed[-1]:
            collapsed.append(ch)

    return "".join(collapsed)

In [3]:
def preprocessing(s, show_stepbystep=False, remove_stop_words=True):
    """Clean one raw post and return its list of word tokens.

    The steps run in this fixed order: strip the literal "see more" marker,
    lowercase, pass the text through ``nm.ASRNormalize``, remove URLs and
    hashtags, remove email addresses, tokenize with ``word_tokenize``, keep
    only alphabetic tokens, optionally drop stop words, collapse repeated
    consecutive characters, and finally discard one-character tokens.

    Args:
        s: The raw text of a post.
        show_stepbystep: When True, print the intermediate value after each
            step (a label line, the value, then a blank line).
        remove_stop_words: When True, drop tokens found in the module-level
            ``stop_words`` collection.

    Returns:
        list: The cleaned tokens, in their original order.
    """

    def _trace(label, value):
        # Emits the same three-line debug record after every step.
        if show_stepbystep:
            print(label)
            print(value)
            print()

    _trace("original:", s)

    # Strip the literal "see more" marker left in the scraped text.
    s = re.sub('Xem th√™m', '', s)
    _trace("remove xt:", s)

    # Case-fold before normalization and matching.
    s = s.lower()
    _trace("lowercase:", s)

    # Vietnamese text normalization via the external normalizer.
    s = nm.ASRNormalize(s)
    _trace("normalized Vietnamese:", s)

    # Drop anything that starts with "http" or "#" up to the next whitespace.
    for pattern in (r'http\S+', r'#\S+'):
        s = re.sub(pattern, '', s)
    _trace('remove urls and hashtags:', s)

    # Drop any whitespace-delimited chunk containing "@".
    s = re.sub(r'\S*@\S*\s?', '', s)
    _trace('remove email addresses:', s)

    # Word segmentation; a token may be a multi-word phrase containing spaces.
    tokens = word_tokenize(s)
    _trace('tokenize:', tokens)

    # Keep tokens that are purely alphabetic once inner whitespace is ignored,
    # which removes punctuation and numbers.
    words = [
        token for token in tokens
        if re.sub(r"\s+", "", token).isalpha()
    ]
    _trace('remove punctuation:', words)

    if remove_stop_words:
        words = [token for token in words if token not in stop_words]
        _trace('remove stop words:', words)

    # Squash runs of the same character inside every token.
    words = [removeConsecutiveDuplicates(token) for token in words]
    _trace('remove consecutive duplicates character:', words)

    # Tokens of length one carry no signal; drop them.
    words = [token for token in words if len(token) > 1]
    _trace('remove single character:', words)

    return words

In [4]:
# load stop words list
# The file holds Vietnamese text, so decode it explicitly as UTF-8 instead of
# relying on the platform default encoding; `with` closes the handle even if
# reading fails.
with open('vietnamese-stopwords.txt', encoding='utf-8') as stop_words_file:
    # One stop word (possibly a multi-word phrase) per line. strip() also
    # removes a trailing '\r' from CRLF files; blank lines are skipped.
    stop_words = {line.strip() for line in stop_words_file if line.strip()}

stop_words

{'sau',
 't·ªët b·ªô',
 'khi n√†o',
 'l·∫•y s·ªë',
 'nh∆∞ nhau',
 'cao rƒÉng',
 'h·ªó tr·ª£',
 'g·∫ßn b√™n',
 'nh√† vi·ªác',
 's·ªõm',
 'bi·∫øt m√¨nh',
 'nghen',
 's·ªü dƒ©',
 'ƒë·∫∑t ra',
 'vi·ªác',
 'tƒÉng th√™m',
 'ng·ªçn ngu·ªìn',
 'th·∫ø l·∫°i',
 'tr·ªáu tr·∫°o',
 'theo tin',
 't·ª´ lo·∫°i',
 'gi·ªØ l·∫•y',
 'm·ªõi hay',
 'l√† √≠t',
 'ƒë∆∞a v·ªÅ',
 'c√≥ ch·ª©',
 '·ªü nh·ªù',
 'chung',
 'ch√πn ch√πn',
 't√¥i con',
 'b·∫•y l√¢u nay',
 'h·ªèi',
 'ƒÉn l√†m',
 'l·ªõn l√™n',
 'xƒÉm x·∫Øm',
 'c√¢y n∆∞·ªõc',
 'c√≥ th·∫ø',
 'h·∫øt √Ω',
 't√≠nh c√°ch',
 'qua',
 'ngu·ªìn',
 't·∫•n',
 'ch√≠nh th·ªã',
 'd·∫°',
 'ch∆∞a c·∫ßn',
 'tu·ªët tu·ªôt',
 'c√≤n',
 'ba b·∫£n',
 'l·∫°i ƒÉn',
 '·∫•y',
 'c·∫£m ∆°n',
 'nh·ªõ',
 'n·ªØa l√†',
 'l·∫°i ƒë√¢y',
 'ngo√†i n√†y',
 'kh√¥ng c√≥ g√¨',
 'c·∫£ ƒÉn',
 'ƒÉn ng·ªìi',
 'v·ªü',
 'qu√° nhi·ªÅu',
 'nghƒ© ra',
 'vi·ªác g√¨',
 'mu·ªën',
 'ph√≠a tr∆∞·ªõc',
 '√°i d√†',
 'mang n·∫∑ng',
 'th∆∞·ªùng th√¥i',
 'cu c·∫≠u',
 'lu√¥n',
 'l·∫ßn n√†o',
 'ph·ªèng t√≠nh',
 'th√†

In [5]:
# Read one scraped posts file per folder (k10 .. k14). header=None makes
# pandas label the columns with integers instead of consuming the first row.
k10, k11, k12, k13, k14 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "k10/posts_1594272491_8.csv",
        "k11/posts_1594251010_7.csv",
        "k12/posts_1594208046_4.csv",
        "k13/posts_1594221437_4.csv",
        "k14/posts_1594216235_5.csv",
    )
)

In [6]:
# Stack the five per-folder frames into one, renumbering rows from 0.
dataset = pd.concat(
    objs=[k10, k11, k12, k13, k14],
    ignore_index=True,
)

In [7]:
# Display the combined frame of raw posts (column 0).
dataset

Unnamed: 0,0
0,[TSSƒêH]-TH√îNG B√ÅO T·ªî CH·ª®C L·ªöP √îN T·∫¨P CHU·∫®N B·ªä ...
1,üíóüíó Giao l∆∞u c√πng NhaÃÄ baÃÅo- NhaÃÄ thoÃõ Nguy·ªÖn P...
2,Ph√≤ng C√¥ng t√°c Sinh vi√™n th√¥ng tin ƒë·∫øn sinh vi...
3,üî¨ C√°c chuy√™n ng√†nh ƒë√†o t·∫°o tr∆∞·ªùng ƒê·∫°i h·ªçc Kans...
4,üëâ B·∫≠t m√≠ v·ªõi c√°c b·∫°n m·ªôt ch∆∞∆°ng tr√¨nh v√¥ c√πng ...
...,...
13149,B·∫°n n√†y v·ª´a ƒë·∫øn Tr∆∞·ªùng n·ªôp Gi·∫•y k·∫øt qu·∫£ thi TH...
13150,C√°c em nh·ªè nh·ªõ ƒëi MHX2020 nha\nCh·ªã share ƒë·ªÉ ki...
13151,# ch√†o m√¨nh ƒë·ªó ktpm clc ... b√°c n√†o chung khoa...
13152,"Ch√†o c√°c b·∫°n, ph√≤ng C√¥ng t√°c Sinh vi√™n ƒë√£ ƒëƒÉng..."


In [8]:
# Column 1: the token list produced by preprocessing() for each raw post in
# column 0. Series.map applies the function element-wise, avoiding the cost
# of materializing a full row Series per element via dataset.iloc[i][0].
dataset[1] = dataset[0].map(preprocessing)

In [9]:
# Column 2: the tokens of column 1 joined with single spaces. Series.map
# applies ' '.join element-wise instead of indexing row by row with iloc.
dataset[2] = dataset[1].map(' '.join)

In [10]:
# Display the frame: raw text (0), token list (1), joined tokens (2).
dataset

Unnamed: 0,0,1,2
0,[TSSƒêH]-TH√îNG B√ÅO T·ªî CH·ª®C L·ªöP √îN T·∫¨P CHU·∫®N B·ªä ...,"[tsƒëh, th√¥ng b√°o, t·ªï ch·ª©c, l·ªõp, √¥n t·∫≠p, tuy·ªÉn ...",tsƒëh th√¥ng b√°o t·ªï ch·ª©c l·ªõp √¥n t·∫≠p tuy·ªÉn sinh c...
1,üíóüíó Giao l∆∞u c√πng NhaÃÄ baÃÅo- NhaÃÄ thoÃõ Nguy·ªÖn P...,"[giao l∆∞u, nh√† b√°o, nh√† th∆°, nguy·ªÖn phong vi·ªát...",giao l∆∞u nh√† b√°o nh√† th∆° nguy·ªÖn phong vi·ªát ph√≤...
2,Ph√≤ng C√¥ng t√°c Sinh vi√™n th√¥ng tin ƒë·∫øn sinh vi...,"[ph√≤ng, c√¥ng t√°c, sinh vi√™n, th√¥ng tin, sinh v...",ph√≤ng c√¥ng t√°c sinh vi√™n th√¥ng tin sinh vi√™n l...
3,üî¨ C√°c chuy√™n ng√†nh ƒë√†o t·∫°o tr∆∞·ªùng ƒê·∫°i h·ªçc Kans...,"[chuy√™n ng√†nh, ƒë√†o t·∫°o, tr∆∞·ªùng, ƒë·∫°i h·ªçc, kansa...",chuy√™n ng√†nh ƒë√†o t·∫°o tr∆∞·ªùng ƒë·∫°i h·ªçc kansai osa...
4,üëâ B·∫≠t m√≠ v·ªõi c√°c b·∫°n m·ªôt ch∆∞∆°ng tr√¨nh v√¥ c√πng ...,"[b·∫≠t m√≠, ch∆∞∆°ng tr√¨nh, v√¥ c√πng, h·∫•p d·∫´n, di·ªÖn,...",b·∫≠t m√≠ ch∆∞∆°ng tr√¨nh v√¥ c√πng h·∫•p d·∫´n di·ªÖn nh√† v...
...,...,...,...
13149,B·∫°n n√†y v·ª´a ƒë·∫øn Tr∆∞·ªùng n·ªôp Gi·∫•y k·∫øt qu·∫£ thi TH...,"[tr∆∞·ªùng, n·ªôp, gi·∫•y, k·∫øt qu·∫£, thi, thpt, qu·ªëc g...",tr∆∞·ªùng n·ªôp gi·∫•y k·∫øt qu·∫£ thi thpt qu·ªëc gia x√°c ...
13150,C√°c em nh·ªè nh·ªõ ƒëi MHX2020 nha\nCh·ªã share ƒë·ªÉ ki...,"[ƒëi, nha, share, ki·ªÉu, khai tr∆∞∆°ng, group, sha...",ƒëi nha share ki·ªÉu khai tr∆∞∆°ng group share m·∫•y ...
13151,# ch√†o m√¨nh ƒë·ªó ktpm clc ... b√°c n√†o chung khoa...,"[ch√†o, ƒë·ªó, ktpm, clc, khoa, l√†m quen, mail, ed...",ch√†o ƒë·ªó ktpm clc khoa l√†m quen mail edu tr∆∞·ªùng...
13152,"Ch√†o c√°c b·∫°n, ph√≤ng C√¥ng t√°c Sinh vi√™n ƒë√£ ƒëƒÉng...","[ch√†o, ph√≤ng, c√¥ng t√°c, sinh vi√™n, ƒëƒÉng t·∫£i, h...",ch√†o ph√≤ng c√¥ng t√°c sinh vi√™n ƒëƒÉng t·∫£i h∆∞·ªõng d...


In [11]:
# Keep the first row for each distinct joined-token string (column 2) and
# renumber the surviving rows from 0.
dataset = dataset.drop_duplicates(subset=[2], ignore_index=True)
dataset.shape

(9141, 3)

In [12]:
# Locate cells equal to the empty string; the result is a pair of arrays
# (row positions, column positions).
np.where(
    dataset.applymap(lambda cell: cell == '')
)

(array([61]), array([2]))

In [13]:
# Inspect the row at position 61, the one reported by the empty-string scan.
dataset.iloc[61]

0    https://forms.gle/PgdEEVyyeeSjczSy7
1                                     []
2                                       
Name: 61, dtype: object

In [14]:
# Drop every row whose joined-token text (column 2) is empty. Deriving the
# labels from the data instead of hard-coding 61 keeps this cell correct if
# the input files change, and makes it safe to re-run (a second run simply
# drops nothing rather than raising KeyError).
empty_rows = dataset.index[dataset[2] == '']
dataset = dataset.drop(index=empty_rows)
dataset.shape

(9140, 3)

In [15]:
# Re-run the empty-string scan; empty arrays mean no empty cells remain.
np.where(
    dataset.applymap(lambda cell: cell == '')
)

(array([], dtype=int64), array([], dtype=int64))

In [16]:
# Count missing values per column.
dataset.isna().sum(axis=0)

0    0
1    0
2    0
dtype: int64

In [17]:
# Renumber rows 0..n-1 after the drop above, discarding the old index.
dataset = dataset.reset_index(drop=True)

In [18]:
# Display the final cleaned frame before it is written to disk.
dataset

Unnamed: 0,0,1,2
0,[TSSƒêH]-TH√îNG B√ÅO T·ªî CH·ª®C L·ªöP √îN T·∫¨P CHU·∫®N B·ªä ...,"[tsƒëh, th√¥ng b√°o, t·ªï ch·ª©c, l·ªõp, √¥n t·∫≠p, tuy·ªÉn ...",tsƒëh th√¥ng b√°o t·ªï ch·ª©c l·ªõp √¥n t·∫≠p tuy·ªÉn sinh c...
1,üíóüíó Giao l∆∞u c√πng NhaÃÄ baÃÅo- NhaÃÄ thoÃõ Nguy·ªÖn P...,"[giao l∆∞u, nh√† b√°o, nh√† th∆°, nguy·ªÖn phong vi·ªát...",giao l∆∞u nh√† b√°o nh√† th∆° nguy·ªÖn phong vi·ªát ph√≤...
2,Ph√≤ng C√¥ng t√°c Sinh vi√™n th√¥ng tin ƒë·∫øn sinh vi...,"[ph√≤ng, c√¥ng t√°c, sinh vi√™n, th√¥ng tin, sinh v...",ph√≤ng c√¥ng t√°c sinh vi√™n th√¥ng tin sinh vi√™n l...
3,üî¨ C√°c chuy√™n ng√†nh ƒë√†o t·∫°o tr∆∞·ªùng ƒê·∫°i h·ªçc Kans...,"[chuy√™n ng√†nh, ƒë√†o t·∫°o, tr∆∞·ªùng, ƒë·∫°i h·ªçc, kansa...",chuy√™n ng√†nh ƒë√†o t·∫°o tr∆∞·ªùng ƒë·∫°i h·ªçc kansai osa...
4,üëâ B·∫≠t m√≠ v·ªõi c√°c b·∫°n m·ªôt ch∆∞∆°ng tr√¨nh v√¥ c√πng ...,"[b·∫≠t m√≠, ch∆∞∆°ng tr√¨nh, v√¥ c√πng, h·∫•p d·∫´n, di·ªÖn,...",b·∫≠t m√≠ ch∆∞∆°ng tr√¨nh v√¥ c√πng h·∫•p d·∫´n di·ªÖn nh√† v...
...,...,...,...
9135,B·∫°n n√†y v·ª´a ƒë·∫øn Tr∆∞·ªùng n·ªôp Gi·∫•y k·∫øt qu·∫£ thi TH...,"[tr∆∞·ªùng, n·ªôp, gi·∫•y, k·∫øt qu·∫£, thi, thpt, qu·ªëc g...",tr∆∞·ªùng n·ªôp gi·∫•y k·∫øt qu·∫£ thi thpt qu·ªëc gia x√°c ...
9136,C√°c em nh·ªè nh·ªõ ƒëi MHX2020 nha\nCh·ªã share ƒë·ªÉ ki...,"[ƒëi, nha, share, ki·ªÉu, khai tr∆∞∆°ng, group, sha...",ƒëi nha share ki·ªÉu khai tr∆∞∆°ng group share m·∫•y ...
9137,# ch√†o m√¨nh ƒë·ªó ktpm clc ... b√°c n√†o chung khoa...,"[ch√†o, ƒë·ªó, ktpm, clc, khoa, l√†m quen, mail, ed...",ch√†o ƒë·ªó ktpm clc khoa l√†m quen mail edu tr∆∞·ªùng...
9138,"Ch√†o c√°c b·∫°n, ph√≤ng C√¥ng t√°c Sinh vi√™n ƒë√£ ƒëƒÉng...","[ch√†o, ph√≤ng, c√¥ng t√°c, sinh vi√™n, ƒëƒÉng t·∫£i, h...",ch√†o ph√≤ng c√¥ng t√°c sinh vi√™n ƒëƒÉng t·∫£i h∆∞·ªõng d...


In [19]:
# Persist the cleaned frame without the row index.
dataset.to_csv(path_or_buf='dataset.csv', index=False)