# Data Set Cleaning
> Author: Sharnique Beck

This notebook is used to clean scraped data for vectorizing

In [9]:
# Import libaries
import pandas as pd
import numpy as np
import re
import pickle


In [2]:
data = pd.read_csv('../data/show_info.csv')
data.head()

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
0,32,"['23g', '9g', '18g', '6g', '7g']",SBS,9.58,PG-13,Thirty But Seventeen,"['Shin Hye Sun', 'Yang Se Jong', 'Ahn Hyo Seop...",Jo Soo-Won,Jo Sung-Hee
1,36,"['18g', '1041g', '23g']",SBS,8.73,PG-13,Fates and Furies,"['Joo Sang Wook', 'Lee Min Jung', 'Lee Ki Woo'...",Jung Dong-Yoon,Kang Cheol-Woong
2,42,"['23g', '9g', '18g']",SBS,9.45,PG-13,The Last Empress,"['Shin Sung Rok', 'Jang Nara', 'Choi Jin Hyuk'...",Joo Dong-Min,Kim Sun-Ok
3,16,"['18g', '23g']",tvN,9.56,PG-13,Encounter,"['Park Bo Gum', 'Song Hye Kyo', 'Jang Seung Jo...",Park Shin-Woo,Yoo Young-A
4,32,"['18g', '1041g', '9g', '23g']",SBS,9.55,PG-13,My Strange Hero,"['Yoo Seung Ho', 'Jo Bo Ah', 'Kwak Dong Yeon',...",Ham Joon-Ho,Kim Yoon-Young


## Clean Network info

In [3]:
data.loc[256,'network'] = 'MBC'
data.loc[272,'network'] = 'MBC'
data.loc[290,'network'] = 'KBS'
data.loc[300,'network'] = 'SBS'
data.loc[438,'network'] = 'MBC'
data.loc[520,'network'] = 'MBC'
data.loc[579,'network'] = 'MBC'

In [4]:
data['network'] = data['network'].map(lambda x: x.lower() if type(x)== str else x)

In [5]:
data['network'].value_counts()

mbc                             224
sbs                             125
kbs                             114
tvn                              44
ocn                              10
naver tv                          9
kbs2                              9
channel a                         8
jtbc                              7
sbs plus                          6
arirang tv                        6
e channel                         4
mbn                               3
mnet                              3
mbn, dramax                       3
sohu tv                           2
oksusu                            2
mbc music                         2
jtbc                              2
sbs mtv                           2
xtvn                              1
on style                          1
kbs w                             1
naver tv cast, conversion tv      1
tv chosun                         1
dramax, umax                      1
o'live                            1
tcast                       

In [None]:
data['network'].isnull().sum()

In [None]:
data[data['network'] =='kbs ']

In [None]:
data['network'].value_counts()

## Genre Formatting

In [118]:
genres = pickle.load( open( "../data/genres.p", "rb" ) )

In [97]:
# Turn genre string into list
data['genre'] = data['genre'].map(lambda x: x[1:-1].replace("'","").split(', '))

In [116]:
# Pull genre titles
data['genre'] = data['genre'].map(lambda x: [genres.get(k)['name'] for k in x] if x != [''] else x)

In [117]:
data.head()

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
0,32,"[Korean Drama, Drama, Romance, Comedy, Crime &...",sbs,9.58,PG-13,Thirty But Seventeen,"['Shin Hye Sun', 'Yang Se Jong', 'Ahn Hyo Seop...",Jo Soo-Won,Jo Sung-Hee
1,36,"[Romance, Melodrama, Korean Drama]",sbs,8.73,PG-13,Fates and Furies,"['Joo Sang Wook', 'Lee Min Jung', 'Lee Ki Woo'...",Jung Dong-Yoon,Kang Cheol-Woong
2,42,"[Korean Drama, Drama, Romance]",sbs,9.45,PG-13,The Last Empress,"['Shin Sung Rok', 'Jang Nara', 'Choi Jin Hyuk'...",Joo Dong-Min,Kim Sun-Ok
3,16,"[Romance, Korean Drama]",tvn,9.56,PG-13,Encounter,"['Park Bo Gum', 'Song Hye Kyo', 'Jang Seung Jo...",Park Shin-Woo,Yoo Young-A
4,32,"[Romance, Melodrama, Drama, Korean Drama]",sbs,9.55,PG-13,My Strange Hero,"['Yoo Seung Ho', 'Jo Bo Ah', 'Kwak Dong Yeon',...",Ham Joon-Ho,Kim Yoon-Young


## Format Cast names

In [152]:
data['cast'] = data['cast'].map(lambda x: [name.replace(' ','') for name in x[1:-1].replace("'","").split(', ')])

In [163]:
data.head()

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
0,32,"[Korean Drama, Drama, Romance, Comedy, Crime &...",sbs,9.58,PG-13,Thirty But Seventeen,"[ShinHyeSun, YangSeJong, AhnHyoSeop, YeJiWon, ...",Jo Soo-Won,Jo Sung-Hee
1,36,"[Romance, Melodrama, Korean Drama]",sbs,8.73,PG-13,Fates and Furies,"[JooSangWook, LeeMinJung, LeeKiWoo, SoYiHyun, ...",Jung Dong-Yoon,Kang Cheol-Woong
2,42,"[Korean Drama, Drama, Romance]",sbs,9.45,PG-13,The Last Empress,"[ShinSungRok, JangNara, ChoiJinHyuk, ShinEunKy...",Joo Dong-Min,Kim Sun-Ok
3,16,"[Romance, Korean Drama]",tvn,9.56,PG-13,Encounter,"[ParkBoGum, SongHyeKyo, JangSeungJo, P.O, NamG...",Park Shin-Woo,Yoo Young-A
4,32,"[Romance, Melodrama, Drama, Korean Drama]",sbs,9.55,PG-13,My Strange Hero,"[YooSeungHo, JoBoAh, KwakDongYeon, YooSeonHo, ...",Ham Joon-Ho,Kim Yoon-Young


## Format Director and Writer names

In [263]:
data['director'] = data['director'].map(lambda x: x.replace('-',' ').replace(' ','').replace(',', ' '))

In [174]:
data['screenwriter'] = data['screenwriter'].map(lambda x: x.replace('-',' ').replace(' ','') if type(x) != float else x)

In [261]:
# remove parenthesis and their content
data['screenwriter'] = data['screenwriter'].map(lambda name: re.sub('\(.+?\)','',name).replace(',',' ') if type(name) != float else name )


In [264]:
data.head(10)

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
0,32,"[Korean Drama, Drama, Romance, Comedy, Crime &...",sbs,9.58,PG-13,Thirty But Seventeen,"[ShinHyeSun, YangSeJong, AhnHyoSeop, YeJiWon, ...",JoSooWon,JoSungHee
1,36,"[Romance, Melodrama, Korean Drama]",sbs,8.73,PG-13,Fates and Furies,"[JooSangWook, LeeMinJung, LeeKiWoo, SoYiHyun, ...",JungDongYoon,KangCheolWoong
2,42,"[Korean Drama, Drama, Romance]",sbs,9.45,PG-13,The Last Empress,"[ShinSungRok, JangNara, ChoiJinHyuk, ShinEunKy...",JooDongMin,KimSunOk
3,16,"[Romance, Korean Drama]",tvn,9.56,PG-13,Encounter,"[ParkBoGum, SongHyeKyo, JangSeungJo, P.O, NamG...",ParkShinWoo,YooYoungA
4,32,"[Romance, Melodrama, Drama, Korean Drama]",sbs,9.55,PG-13,My Strange Hero,"[YooSeungHo, JoBoAh, KwakDongYeon, YooSeonHo, ...",HamJoonHo,KimYoonYoung
5,16,"[Korean Drama, Idol Drama, Romance]",tvn,9.72,PG-13,What’s Wrong With Secretary Kim,"[ParkSeoJoon, ParkMinYoung, LeeTaeHwan, KangKi...",ParkJoonHwa YooJongSun,JungKyungYoon JungEunYoung
6,16,"[Romance, Korean Drama]","mbn, dramax",9.15,PG-13,Devilish Joy,"[ChoiJinHyuk, SongHaYoon, LeeHoWon, LeeJooYeon]",KimGaRam,ChoiJiYeon
7,33,"[Drama, Korean Drama, Comedy, Romance]",mbc,9.67,PG-13,I Am Not a Robot,"[YooSeungHo, ChaeSooBin, UhmKiJoon, HwangSeung...",JungDaeYoon,KimSeonMi
8,40,"[Korean Drama, Drama, Romance, Comedy, Thrille...",sbs,9.63,PG-13,Suspicious Partner,"[JiChangWook, NamJiHyun, ChoiTaeJoon, Nara, Ki...",ParkSunHo,KwonKiYoung
9,16,"[Romance, Sports, Drama, Korean Drama]",mbc,9.75,PG-13,Weightlifting Fairy Kim Bok Joo,"[NamJooHyuk, LeeSungKyung, LeeJaeYoon, KyungSo...",OhHyunJong,YangHeeSeung


In [253]:
[re.sub('\(.+?\)','*',name) for name in data['cast'][463]]

['EricMun', 'JungYuMi*', 'LeeKyuHan', 'YoonJiHye']

In [259]:
data['screenwriter'][37]

'YoonYiSoo,kk,KimMinJung,ImYeJin'