In [3]:
from konlpy.tag import Mecab
import pandas as pd

In [10]:
# Preprocessing code with Mecab
# Alternative dictionary location, kept for reference (Unix-style install path):
#dicpath="/usr/local/lib/mecab/dic/mecab-ko-dic"
# Module-level tagger shared by preprocessing_mecab. The argument is a
# hard-coded local path (presumably the mecab-ko-dic dictionary directory);
# it must be adjusted when the notebook runs on another machine.
mecab = Mecab(r"C:/mecab/mecab-ko-dic") # Mecab dictionary path (machine-specific)

def preprocessing_mecab(readData):
    """Clean a raw text and return the morphemes that remain after filtering.

    The text is normalized with ``CleanText`` and POS-tagged with the
    module-level ``mecab`` tagger. A morpheme is dropped when its tag is a
    particle, punctuation mark, ending, auxiliary predicate or number, or when
    its surface form is exactly one character long.

    Args:
        readData: Raw text to preprocess; passed unchanged to ``CleanText``.

    Returns:
        list: Surface forms of the kept morphemes, in tagger output order.
    """
    #### Clean text
    sentence = CleanText(readData)

    #### Tokenize
    morphs = mecab.pos(sentence)

    JOSA = ["JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC"]  # particles
    SIGN = ["SF", "SE", "SSO", "SSC", "SC", "SY"]  # punctuation marks
    TERMINATION = ["EP", "EF", "EC", "ETN", "ETM"]  # endings
    SUPPORT_VERB = ["VX"]  # auxiliary predicates
    NUMBER = ["SN"]  # numbers

    # Build the lookup once instead of re-concatenating the lists per morpheme.
    excluded_tags = frozenset(JOSA + SIGN + TERMINATION + SUPPORT_VERB + NUMBER)

    # Single pass: keep the surface form when the tag is allowed and the
    # surface is not a length-1 word.
    return [
        surface
        for surface, tag in morphs
        if tag not in excluded_tags and len(surface) != 1
    ]

In [4]:
import re

# Basic Cleaning Text Function
def CleanText(readData, Num=False, Eng=False):
    """Strip tweet noise from a text and return it as single-spaced words.

    Removed or blanked out, in order: retweet prefixes (``RT @user: `` and the
    literal ``알티``), @mentions, URLs (with and without an ``http`` scheme),
    ``&``-prefixed remnants such as ``&lt``/``&gt``, and every character that is
    not ``#``, an ASCII letter or digit, a Hangul consonant jamo (ㄱ-ㅎ) or a
    Hangul syllable (가-힣). Hashtags are kept (their removal step is disabled).

    Args:
        readData: Text to clean. ``None`` and float NaN (e.g. an empty CSV cell
            loaded by pandas) are treated as empty text.
        Num: When truthy, digits are removed as well.
        Eng: When truthy, ASCII letters are removed as well.

    Returns:
        str: The cleaned text with runs of whitespace collapsed to one space.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    # (NaN is the only float that is not equal to itself.)
    if readData is None or (isinstance(readData, float) and readData != readData):
        return ''

    # Patterns below are raw strings so that sequences such as \w reach the
    # regex engine untouched instead of being parsed as string escapes.

    # Remove Retweets RT @user_screenname
    text = re.sub(r'RT @[\w_]+: ', '', readData)
    text = re.sub(r'알티', '', text)

    # Remove Mentions
    text = re.sub(r'@[\w_]+', '', text)

    # Remove or Replace URL
    text = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", ' ', text)  # URLs starting with http
    text = re.sub(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{2,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", ' ', text)  # URLs without an http prefix

    # Remove Hashtag (disabled: hashtags are kept)
    #text = re.sub('[#]+[0-9a-zA-Z_]+', ' ', text)

    # Remove Garbage Words (ex. &lt, &gt, etc)
    text = re.sub(r'[&]+[a-z]+', ' ', text)

    # Remove Special Characters. Newlines are not in the allowed set, so they
    # are turned into spaces here as well.
    text = re.sub(r'[^#0-9a-zA-Zㄱ-ㅎ가-힣]', ' ', text)

    if Num:
        # Remove Numbers
        text = re.sub(r'\d+', ' ', text)

    if Eng:
        # Remove English
        text = re.sub(r'[a-zA-Z]', ' ', text)

    # Remove multi spacing & Reform sentence
    return ' '.join(text.split())

In [32]:
# Folder containing the crawled weekly CSV files. This is a hard-coded local
# path; change it here when running on another machine.
CRAWLING_DIR = 'D:/Code/Presidential_Election_Analysis/Crawling/'

# One crawl file per weekly window, in chronological order.
CRAWLED_FILES = [
    'ㅇㅈㅁ_2022-02-09~2022-02-16.csv',
    'ㅇㅈㅁ_2022-02-16~2022-02-23.csv',
    'ㅇㅈㅁ_2022-02-23~2022-03-02.csv',
    'ㅇㅈㅁ_2022-03-02~2022-03-09.csv',
]

# Load each weekly file into its own frame (df1 is the earliest window).
df1, df2, df3, df4 = (
    pd.read_csv(CRAWLING_DIR + file_name, encoding='utf-8')
    for file_name in CRAWLED_FILES
)

In [33]:
# Gather the four loaded frames in one list so the cleaning and saving loops
# can iterate over them; the order matches since_save / until_save.
dfs = [df1, df2, df3, df4]

In [34]:
# Overwrite the 'Text' column of every frame in dfs with its cleaned version.
# The frames are modified in place, so df1..df4 see the change too.
for frame in dfs:
    frame['Text'] = frame['Text'].apply(CleanText)


In [35]:
# Start and end dates of each window, index-aligned with dfs; they are only
# used to build the output file names when the cleaned frames are saved.
since_save = ['2022-02-09', '2022-02-16', '2022-02-23', '2022-03-02']
until_save = ['2022-02-16', '2022-02-23', '2022-03-02', '2022-03-09']

In [36]:
# Write every frame in dfs to the current working directory, naming each
# file after the matching since/until dates; the index is not written.
for s, frame in enumerate(dfs):
    out_path = './ㅇㅈㅁ_'+since_save[s]+'~'+until_save[s]+'_cleaned.csv'
    frame.to_csv(out_path, encoding='utf-8-sig', sep=',', index=False)
    

In [8]:
# Preview the first five rows of the first frame.
df1.head(5)

Unnamed: 0,Datetime,Tweet Id,Text,Username
0,2022-02-09,1491562480681852938,윤석열은 이렇게 말해라 적폐청산이라기 보다는 죄가 있으면 법대로 처벌한다,Ksm1005Sangmook
1,2022-02-09,1491562459194265602,대전MBC 충청권 윤석열41 7 이재명31 4 안철수8 4 10 3 p 앞서,_polinews
2,2022-02-09,1491562445906669570,으흐흐 나 이런 댕댕이야,snanjing
3,2022-02-09,1491562420459827202,여보세요 이재명 아니면 왜 윤석열 일거라 생각하죠 둘 다 싫다고욧 나 참,tzsaxo
4,2022-02-09,1491562299227738115,언론이 편파적이어도 어쩜 이렇게 편파적일수 있냐고 윤석열이 대통령되면 어떤 세상이 ...,pearl_ssc
