In [172]:
import pandas as pd
import numpy as np
import re

In [173]:
# Load the raw CSV into a DataFrame; the file is decoded with the 'utf-8-sig' codec.
raw_csv_path = './data/koreagram_2021_2022.csv'
df = pd.read_csv(raw_csv_path, encoding='utf-8-sig')

In [174]:
# Preview the 'content' text column before any cleaning is applied.
raw_content = df['content']
raw_content

0       I put a lot less pressure on myself this time ...
1                                               운치 있었던 찻집
2       ??????.......#southkorea #구례여행 #nature #southk...
3       Today felt like summer ?? but I'm so ready for...
4       I made a valiant effort, but alas I only made ...
                              ...                        
5266    2016. Сокчхо, Южная Корея2016. Sokcho, South K...
5267    ???? by @ginabearsblog?Happy New Year everyone...
5268    Happy New Year to all my friends around the wo...
5269    ?What are your resolutions for 2021???????Emai...
5270    What were some of your resolutions this year? ...
Name: content, Length: 5271, dtype: object

In [175]:
# Regex cleanup: every character that is not an ASCII letter, a digit,
# a Hangul syllable (가-힣) or '#' is replaced with a single space.
disallowed_chars = r'[^A-Za-z0-9가-힣#]'
cleaned_content = df['content'].str.replace(disallowed_chars, ' ', regex=True)
df['content'] = cleaned_content
df['content']

0       I put a lot less pressure on myself this time ...
1                                               운치 있었던 찻집
2                    #southkorea #구례여행 #nature #southk...
3       Today felt like summer    but I m so ready for...
4       I made a valiant effort  but alas I only made ...
                              ...                        
5266    2016                     2016  Sokcho  South K...
5267         by  ginabearsblog Happy New Year everyone...
5268    Happy New Year to all my friends around the wo...
5269     What are your resolutions for 2021       Emai...
5270    What were some of your resolutions this year  ...
Name: content, Length: 5271, dtype: object

In [176]:
# Remove duplicate posts, keeping the first occurrence of each text.
# The regex cleanup replaces removed characters with spaces, so two otherwise
# identical texts can differ only in their leading whitespace. Because that
# whitespace is stripped in a later cell, comparing the raw strings here would
# leave rows that end up as exact duplicates. Compare on the left-stripped
# text instead; the stored 'content' values are not modified in this cell.
dedup_key = df['content'].str.lstrip()
# Boolean-mask selection instead of inplace=True, followed by a fresh 0..n-1 index.
df = df.loc[~dedup_key.duplicated(keep='first')].reset_index(drop=True)
df

Unnamed: 0,content,date,like,place,tags,네이버위치명,경도,위도,count
0,I put a lot less pressure on myself this time ...,2022/09/07,0,KIAF 국제아트페어,"['#artweek', '#전시', '#seoul', '#contemporaryar...",,,,2.0
1,운치 있었던 찻집,2022/09/07,24,,[],,,,
2,#southkorea #구례여행 #nature #southk...,2022/09/07,56,전라남도 구례,"['#southkorea', '#구례여행', '#nature', '#southkor...",노고단,127.532100,35.293675,4.0
3,Today felt like summer but I m so ready for...,2022/09/07,66,전라남도 구례,"['#구례여행', '#southkorea', '#wanderlust', '#trav...",노고단,127.532100,35.293675,4.0
4,I made a valiant effort but alas I only made ...,2022/09/07,33,KIAF 국제아트페어,"['#coex.', '#frieze', '#kiaf', '#artweek', '#전...",,,,2.0
...,...,...,...,...,...,...,...,...,...
4883,2016 2016 Sokcho South K...,2021/01/01,41,강원도 속초,"['#изархива', '#сокчхо', '#сокчо', '#канвондо'...",속초아이,128.602690,38.190739,8.0
4884,by ginabearsblog Happy New Year everyone...,2021/01/01,222,"서울스카이 Seoul Sky, Lotte World Tower, Korea","['#seoul', '#songpa.??', '#korea_attractions',...",,,,16.0
4885,Happy New Year to all my friends around the wo...,2021/01/01,244,,[],,,,
4886,What are your resolutions for 2021 Emai...,2021/01/01,47,,"['#christmas', '#merrychristmas', '#holiday', ...",,,,


In [177]:
# Strip leading whitespace so each text begins with its first kept character.
left_trimmed = df['content'].str.lstrip()
df['content'] = left_trimmed

In [178]:
# Add the '대상여부' flag column, initialised to 'Y' for every row.
flag_column = '대상여부'
df[flag_column] = 'Y'
df

Unnamed: 0,content,date,like,place,tags,네이버위치명,경도,위도,count,대상여부
0,I put a lot less pressure on myself this time ...,2022/09/07,0,KIAF 국제아트페어,"['#artweek', '#전시', '#seoul', '#contemporaryar...",,,,2.0,Y
1,운치 있었던 찻집,2022/09/07,24,,[],,,,,Y
2,#southkorea #구례여행 #nature #southkorea #wanderl...,2022/09/07,56,전라남도 구례,"['#southkorea', '#구례여행', '#nature', '#southkor...",노고단,127.532100,35.293675,4.0,Y
3,Today felt like summer but I m so ready for...,2022/09/07,66,전라남도 구례,"['#구례여행', '#southkorea', '#wanderlust', '#trav...",노고단,127.532100,35.293675,4.0,Y
4,I made a valiant effort but alas I only made ...,2022/09/07,33,KIAF 국제아트페어,"['#coex.', '#frieze', '#kiaf', '#artweek', '#전...",,,,2.0,Y
...,...,...,...,...,...,...,...,...,...,...
4883,2016 2016 Sokcho South K...,2021/01/01,41,강원도 속초,"['#изархива', '#сокчхо', '#сокчо', '#канвондо'...",속초아이,128.602690,38.190739,8.0,Y
4884,by ginabearsblog Happy New Year everyone It ...,2021/01/01,222,"서울스카이 Seoul Sky, Lotte World Tower, Korea","['#seoul', '#songpa.??', '#korea_attractions',...",,,,16.0,Y
4885,Happy New Year to all my friends around the wo...,2021/01/01,244,,[],,,,,Y
4886,What are your resolutions for 2021 Email...,2021/01/01,47,,"['#christmas', '#merrychristmas', '#holiday', ...",,,,,Y


In [179]:
# Set the flag to 'N' for rows whose text starts with a Korean character.
# `match` only tests the beginning of each string, so Korean characters that
# appear later in the text do not trigger the flag.
korean = re.compile(r'[ㄱ-ㅣ가-힣]')
# Vectorised replacement for a row-by-row loop over df.loc[i, ...]:
#   * it does not require the index labels to be exactly 0..n-1;
#   * na=False leaves rows with missing text unflagged instead of raising
#     TypeError inside re.match;
#   * a single masked assignment avoids one .loc lookup per row.
starts_with_korean = df['content'].str.match(korean.pattern, na=False)
df.loc[starts_with_korean, '대상여부'] = 'N'

In [180]:
# Count how many rows carry each flag value.
flag_counts = df['대상여부'].value_counts()
flag_counts

Y    4464
N     424
Name: 대상여부, dtype: int64

In [181]:
# Write the cleaned frame to CSV, omitting the index, using the 'utf-8-sig' codec.
clean_csv_path = './data/koreagram_2021_2022_clean.csv'
df.to_csv(clean_csv_path, index=False, encoding='utf-8-sig')