# Extract email rows from news sentences.
# Extract warning rows from news sentences.

In [1]:
import json
import pandas as pd
import re

In [2]:
with open('./data/news/201901/sentences.json') as f:
    sentences = json.load(f)

In [3]:
print(len(sentences))
print(type(sentences))

48966
<class 'list'>


In [4]:
df = pd.DataFrame.from_dict(sentences)
df.head(30)

Unnamed: 0,pid,sentence,type
0,b0d99f623a483d40b8c4c7fd008fbcff,(양양=뉴스1) 서근영 기자 = 1일 강원도 양양군 서면 송천리 일대 야산에서 발...,normal
1,b0d99f623a483d40b8c4c7fd008fbcff,sky4018@news1.kr,email
2,b0d99f623a483d40b8c4c7fd008fbcff,"<저작권자 © 뉴스1코리아, 무단전재 및 재배포 금지>",normal
3,7d3ae92ebccb7228a58f7e308401010f,(양양=뉴스1) 서근영 기자 = 1일 강원도 양양군 서면 송천리 일대 야산에서 발...,normal
4,7d3ae92ebccb7228a58f7e308401010f,sky4018@news1.kr,email
5,7d3ae92ebccb7228a58f7e308401010f,"<저작권자 © 뉴스1코리아, 무단전재 및 재배포 금지>",normal
6,539767076c4d117c6306bdf7f006e1c8,(양양=뉴스1) 고재교 기자 = 1일 강원도 양양군 서면 송천리 일대 야산에서 발...,normal
7,539767076c4d117c6306bdf7f006e1c8,high15@news1.kr,email
8,539767076c4d117c6306bdf7f006e1c8,"<저작권자 © 뉴스1코리아, 무단전재 및 재배포 금지>",normal
9,3e1654f8ca75d2f19ada15b57a4747db,(양양=뉴스1) 고재교 기자 = 1일 강원도 양양군 서면 송천리 일대 야산에서 발...,normal


In [48]:
# declare regular expression
email_check = re.compile('^[a-zA-Z0-9+-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')
round_bracket = re.compile('\([^)]*\)')
square_bracket = re.compile('\[[^)]*\]')
angle_bracket = re.compile('\<[^)]*\>')
single_quatation = re.compile('\'[^)]*\'')
double_quatation = re.compile('\"[^)]*\"')
str_check = re.compile('[^0-9a-zA-Zㄱ-힗]')

re_filters = [email_check, round_bracket, square_bracket, angle_bracket, single_quatation, double_quatation, str_check]

In [20]:
text = '"test" test'
result = double_quatation.findall(text)
result

'"test"'

In [49]:
# filtering 
"""
1. 문장 전체가 싱글/더블 quatation으로 쌓여진 문장 찾아서 제거
2. 문장 전체가 대괄호/중괄호로 쌓여진 문장 찾아서 제거
- 문장 내에 존재하는 것들을 제거할 것인가? 고민해보자
"""

for idx, s in enumerate(sentences): 
    text = s['sentence'].strip()
    if len(text) == 0:
        sentences[idx]['type'] = 'none'
        continue
    
    # empty  sentence
    if (text == '' or len(text) < 15):
        sentences[idx]['type'] = 'none'
        continue
    
    # warning sentence    
    if ('무단전재 및 재배포 금지' in text) == True:
        sentences[idx]['type'] = 'warning'
        continue
    
    # quotation
    if re_filters[4].fullmatch(text) or re_filters[5].fullmatch(text):
        text = text[1:-1]
    
    # email sentence
    token = text.split(' ')
    for t in token:
        check = re_filters[0].match(t.strip('[](),'))
        if check != None:
            sentences[idx]['type'] = 'email'
            break
    if sentences[idx]['type'] == 'email':
        continue
    
    # bracket full match
    if re_filters[1].match(text) or re_filters[2].match(text) or re_filters[3].match(text):
        sentences[idx]['type'] = 'bracket'
        continue
    
    # extract 
    text2 = text.split(' = ')
    if len(text2) > 1:
        sentences[idx]['sentence'] = text2[1]
    else:
        sentences[idx]['sentence'] = text
    
    if re_filters[6].match(sentences[idx]['sentence'][0]) != None:
        sentences[idx]['type'] = 'extra'
        continue
    
    sentences[idx]['type'] = 'normal'

In [51]:
# extract not email
sentences_normal = list(filter(lambda stc:stc['type'] == 'normal', sentences))
sentences_extra = list(filter(lambda stc:stc['type'] == 'extra', sentences))
sentences_bracket = list(filter(lambda stc:stc['type'] == 'bracket', sentences))
sentences_none = list(filter(lambda stc:stc['type'] == 'none', sentences))
sentences_warn = list(filter(lambda stc:stc['type'] == 'warning', sentences))
sentences_email = list(filter(lambda stc:stc['type'] == 'email', sentences))

In [52]:
# 37861
print(len(sentences_normal))

# 1382
print(len(sentences_extra))

# 1798
print(len(sentences_bracket))

# 2470
print(len(sentences_none))

# 873 
print(len(sentences_warn))

# 4582
print(len(sentences_email))

37861
1382
1798
2470
873
4582


In [None]:
# 502c80f96a0b44dd84a9581c6bcffcf0 / a5216e0834740a1b6be20a392a528e30 / aa583a688beeb7ec6ce4d484f1f1c5ca
# e9341587b10c0cb28c4aa4ccaad10944 / 1df30f2c433fa9491e7fd8d8972819d4 / f5fb4e59c162a8b4adfabe3c9745b05f
# 2dd92e7dba8b2e44cf35b15bf70901f0 / 0d2d495f0aa3903ed09f765dfb60c85d / a0fd930cea2666de2623b0f9586100dd
# 3c876c23863da3151536b9a21e98464f / f24a4af5791fa55db3acedab450bf1d8 / 461297b28c65ac19a4ace48dad5c3ab8
# 668e65bea1e610c8445a46a24c528fbf / 7ce8e32ec8a9a571dd3d25940c91458e / b9d04be32d8ea5a272a0a128ea311dbe
# 7da8a3eb29441f579a0d7083f3628d8d / c64bed8028bda69453871b8ea681bb8e / 50c6544481ee834a60391b16b5b18b57
# 02932f1436cd8f8628c03b7e2d1d4bdb / 15c3340a66356cde5b167460dc98ebde / a376d3bcd6d2aa721a742be05668e231
# 7565abe83a08550687c1bb428aa63801 / fbb89185ced618cb4cc33ec51efd87a2 / 6d437ff269df5f787aef91d70e8c0f38
# 1ee93495ff36718e0121ae8bc583d02e / c7b2b91b02a2eb7adf7dec7fd2721b9a / 7d3780d2ace9e566a59ba4141a181e08
# cf877710bef93d858e259514ae62bbca / 28dd284ad26d1da601a6b5e78c9fc7ed / e6dac9e501efa2b7bcccb9f1042f8a7c
# fff39f9f10f127f29ab4fcc5812cd948 / 204a430938e607eaaecaf1620477b4c1 / a9338e16c91ae57fb1ed7966f69a1b84
# 573391debc8b6d04032c68fd4e96077d / 1dbb4973e77d5751e844b87db0eb624f / 82ade7d1b07256c6fe6b02b5788ecf48
# ebdc474722475259885f8d6ded61839e / 3cf1d206d57d6adf14bca034b92215da / 73df6c30f8eb54d6396459d6a3666dfc
# e2c82f754cad10f7088d7439f460acc2 / 5a07304724e5a8312e4f143a56cf0ee1 / 5ba380994a642edf372e53c9ca5468aa
# 281c9f274525eb0d0cf2c1c5aa85fad8 / a3e6b15454a5b1fef11b92d0111ce238 / c389a3248f23046f50919b541fa60d91
# 78a36e5d7394881e523d736952fda659 / e164bb1cdd3dbb142b0a0c51f198e7eb / aa2c0511c8d2ba33401507c1ea3493e9
# 73e577c0b3eae68c1beb939850af2a26 / 205423a4c36a75903d03383895a4cc83 / 966e5f2320d5e28426950ece0df425c7
# 4c25fc2e23c5aa2360c182ad4e2119ff / f888aa95bba4b11c65c03f0c8fd7a12e / c97bb721b7207e11d3404312ffa947e5
# 30a67a99fd2efb36d67b743dca009f5f / 30a67a99fd2efb36d67b743dca009f5f / 0cd7d57eca63a457a2b30787bfbaf35f
# e676423569b8562ee825abbaa88feef7 / b9ef07e5fdf02cb42314fb8504a1ac9a / f5536d253ebb74c21a2b98e93ab96854
# 8b39085a91e7812681d4e8f9ebcb527a / 7f4071c07bccc479dce317d9eef0660a / acc258dd84bdcc4ddec1bc0a82d7d1a8
# 2b514804aed6adda1b5fdf1e1f9253c4 / 2e0139984685915fd8712effce1fc0fd / 6a0d4453a6def2b16b422b665a108118
# 74d96b34441b5296bc01eeae5844eb06 / 465133171ce9113f404f5752b070e2dc / a382967040e035ef0ce2f15cba0ef664
# 070224cfa1b84e088141d934aced6cf1 / 6f9ba40475cd9337a2e576b62ff79bc4 / 008039ffb605d5c4c8b8466f75e05839
# 66195724300b5387018132fcc2ad640b / d267e43dc9594c2ef2c94d00d850a068 / 77e46ffc81f5a50288260dd24f04454a
# 68a14b8b939d58b2c2da8a66085db915 / efc80367b391fa8d1af34270b1a234fa / 6ba599312f3ede253aaf569ed22fa38c
# d65b5344ff628ea4c97f04f21a46d55d / 76b985a7cae6d811f5e1858179c740c8 / 70a7e8a010c026fd060fe74fcd32c482
# 2b381e68906e0f514dda9441485828ba / 96830139c4167bb5f3dd8abda7be07c7 / df5d01371337da4cfbb3091c08d8cc71
# 57d375c2e8d1c29e7a7e01bf911d697d / 714ff8207c265ab39ef83773bbc9154c / 5988841299504ddda9d752f995b182a2
# d637cb23b161373ce2a89c5c2f00f24a / b66c8b75e8dbf07075d802c1e77831c2 / 5c25093aa2c96eb2cb6da7329d2771b6
# 70b81fa2fb955210820e6e3e5f02c51a / 04e1d6ac9c65b51b050c08e9d3564dd1 / 21ef453143cdcbafb084d125fa491105
# 360c5e6aff755f4166969b89e730bcf1 / 69ac9b3075868c3c6f07dd925b5e7f31 / 6e6ae35799c6b8d9204aa1206790df1f
# 9861ff62f9fcca0e1adb81d0d5b67276 / 6885d5ecb594d6126b2e0964440fb4c0 / dfaf602bf5cde7eb7b9e8928ce05e86d
# 675ac6465e8654437829309835394a3b / 22036e715e70f033073855cb126d612d / 4b7c8eec0405841ddb893f81081ee8b1
# 859e777e653e95cc0b8b0d0dc83ae11d / b4b5856b28257b821a05499b19649eb7 / 4df10a78494b38bdcbff160ccc14b82d
# 516cb8c6c2120c521e52e9d83af495a3 / eaabf00ffbc84c974e82465032431bdb / a0e595bde0f6d207a34fff1d19f1c9f5
# 304cc780cfeacbfdbef31f6d30ff7278 / 5805a2ce45e13a6a28365a378661d34b / e8282bfaf0c8d8d0849ccdb4fd7b783d
# 4475cce384c23d0d3005d8417c39e86f / 3b7786c298493d2fbd061584ca008ed5 / a200443b8e29139882fc64eeb712247b
# c082486d2cf5a5166cccc5a5f83e59f7 / 54b75e09ebef9c03e237e1462da7578a / 163e71977ad3eadd21caebd172b2d88f
# ac4f96a1ee56cb376e33310342c7a185 / 7526f02d69a73a6f5bbb0f7695bcb776 / fa9c308cd84fe0dfc474a74907348237
# 10c42320f35a154bcae7982dfad0665a / 345863b2659a6bfbdd6fc4a8c4e00989 / b8b2d23262f3f339e0a8b5997bbffde8
# 59fb1040b647d37195255254dabb0920 / 360be099f5d7b742ab67abac1b1af455 / 973e7ab5f4ef3f9fbdc0687c60d1d5d5
# 0be6cdf7e82d646ccddb591550a7eecb / fc6e5b12c7406e6f62492ce97716052a / b00614038d462ebf0c80e6111261e139
# cfe0b936583ecced32fcbae0697208a4 / 81eabbd71b46c0ac707c083d5aa9278b / b371b7bff4f30790139ef195ab7263fb
# 33d5b92f36ec758c582ba1b1a4bb6989 / f48fd618cf6864d912dc22d54b1593ae / 7207c5be36854cd0ed9b21816e6b36b9
# f4698bdf5c876499f2460181337fa249 / 8ea5bee858a763ccb202181a3741ebef / 184d7db700a24d5e0949ded0ec04d89f
# e580f98ae9a9ec568f203c80ec49a918 / 80f3e1c873cb0efa0737082b1d4facea / 095b9f567da16e144e5030a07e7caac4
# 3811ad052fe6a4ec225be1bd28ec5b18 / b33c6ac6a4e95a1ff13e94b8f76da153 / 5f9618dc86ec66a34eaa86c7dc668031
# b973feae35c97dc35d4c7a8a0ab05cb8 / 
n = 0
for s in sentences_email:
    n += 1
    print(n, ': ',s['pid'], '>>>',  s['sentence'])

In [53]:
sentences_normal = list(filter(lambda stc:stc['type'] == 'normal', sentences))

In [54]:
with open('./data/news/201901/sentences_normal.json', 'w', encoding='utf-8') as make_file:
    json.dump(sentences_normal, make_file, indent="\n")