# Data Set Cleaning
> Author: Sharnique Beck

This notebook cleans the scraped show data so that it can be vectorized.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import re
import pickle


In [5]:
# Read the scraped show metadata, then preview its last 130 rows
show_info_path = '../data/show_info.csv'
data = pd.read_csv(show_info_path)
data.tail(130)

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
494,16,"['1038g', '18g', '20g', '23g', '9g']",MBC,8.66,PG-13,No Limit,"['Lee Sang Yoon', 'Lee Yoon Ji', 'Jung Yunho',...",Park Sung-Soo,"Kim Sol-Ji, Kim Ye-Ri"
495,20,"['18g', '23g', '9g']",CHANNEL A,8.68,PG-13,Goodbye Dear Wife,"['Ryu Shi Won', 'Hong Soo Hyun', 'Kim Min Soo'...",Kim Pyung-Joong,Kim Do-Hyun
496,17,"['18g', '23g', '9g']",MBC,7.69,PG-13,Snowman,"['Kim Rae Won', 'Oh Yun Soo', 'Jo Jae Hyun', '...",Lee Chang Soon,Kim Do Woo
497,14,"['1037g', '18g', '23g', '9g']",MBC,7.48,PG-13,Next,"['Ryu Soo Young', 'Park Ye Jin', 'Lee Jong Soo...","Yu Jeong-Jun, Park Jae-Beom, Kim Do-Hoon","Joo Chan-Ok, Ku Seon-Kyeong, Ko Eun-Nim, Seo S..."
498,43,"['18g', '23g', '9g']",MBC,7.26,PG-13,Winter Bird,"['Lee Tae Gon', 'Hwang Jung Eum', 'Yoon Sang H...","Se-ho Jung, Joo Sung-Woo",Kum-ju Lee
499,16,"['18g', '23g', '6g', '9g']",MBC,8.85,PG-13,The Lawyers of the Great Republi...,"['Lee Sung Jae', 'Lee Soo Kyung', 'Ryu Soo You...",Yoon Jae-Moon,Seo Sook-Hyang
500,2,"['23g', '1038g', '1045g']",Naver TV,8.00,PG-13,Love Cells 2,"['Kim Yoo Jung', 'Im Seulong', 'Jo Bo Ah', 'Pa...",Kim Yong Wan,Kim Myung Hyun
501,20,"['9g', '23g', '7g', '1040g', '26g', '18g']",SBS,8.94,PG-13,Sign,"['Jung Gyu Woon', 'Kim Ah Joong', 'Jun Kwang R...",Jang Hang-Jun,Kim Eun-Hee
502,24,"['9g', '23g', '18g', '24g', '1041g']",KBS,8.63,PG-13,"Again, My Love","['Park Sang Won', 'Choi Myung Gil', 'Jung Gyu ...",Jong-chang Kim,Jo Hee
503,8,"['9g', '23g', '18g', '1038g', '25g', '6g', '19g']",KBS,9.46,PG-13,Girls’ Generation 1979,"['Bona', 'Chae Seo Jin', 'Lee Jong Hyun', 'Seo...",Hong Seok-Ku,"Kim Yong-Hee (novel), Yoon Kyung-Ah"


In [6]:
# Summarize column dtypes and non-null counts of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 9 columns):
# episodes      624 non-null int64
genre           624 non-null object
network         595 non-null object
rating          624 non-null float64
s_rating        624 non-null object
title           624 non-null object
cast            624 non-null object
director        624 non-null object
screenwriter    612 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 44.0+ KB


## Clean Network info

In [7]:
# Hand-picked network values, keyed by row label, that overwrite whatever
# the scraped data holds for those rows
manual_network_by_row = {
    256: 'MBC',
    272: 'MBC',
    290: 'KBS',
    300: 'SBS',
    438: 'MBC',
    520: 'MBC',
    579: 'MBC',
}

for row_label, network_name in manual_network_by_row.items():
    data.loc[row_label, 'network'] = network_name

In [8]:
def _format_network(raw_network):
    """Turn one raw network value into vectorizer-friendly tokens.

    Strings are title-cased, stripped of spaces so each network becomes a
    single token, and commas are turned into spaces so that several
    networks listed together become separate tokens. Any value whose type
    is not exactly ``str`` (for example a missing value) becomes ' '.
    """
    if type(raw_network) == str:
        return raw_network.title().replace(' ', '').replace(',', ' ')
    return ' '


data['network'] = data['network'].map(_format_network)

In [9]:
# Count the network entries that are null
data['network'].isna().sum()

0

In [10]:
# Show rows whose network is exactly 'kbs ' (lowercase with a trailing space).
# NOTE(review): the network normalization title-cases values and removes
# spaces, so this filter cannot match once that cell has run - confirm
# whether the check is still needed.
data.loc[data['network'] == 'kbs ']

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter


In [11]:
# Tally how many rows carry each distinct network value
data['network'].value_counts()

Mbc                         224
Sbs                         125
Kbs                         114
Tvn                          44
                             29
Ocn                          10
Jtbc                          9
Kbs2                          9
NaverTv                       9
ChannelA                      8
ArirangTv                     6
SbsPlus                       6
EChannel                      4
Mbn Dramax                    3
Mbn                           3
Mnet                          3
MbcMusic                      2
Oksusu                        2
SohuTv                        2
SbsMtv                        2
NaverTvCast ConversionTv      1
Xtvn                          1
Qtv                           1
Tcast                         1
KbsW                          1
O'Live                        1
Dramax Umax                   1
TvChosun                      1
Onstyle                       1
OnStyle                       1
Name: network, dtype: int64

## Genre Formatting

In [12]:
# Load the genre lookup that the genre-formatting step uses to translate
# genre codes into names. The file is opened in a `with` block so the handle
# is closed as soon as the lookup has been read.
# NOTE: unpickling can execute arbitrary code - only load a genres.p file
# that comes from a trusted source.
with open("../data/genres.p", "rb") as genres_file:
    genres = pickle.load(genres_file)

In [13]:
def _parse_genre_codes(raw_genres):
    """Split a stringified list such as "['18g', '23g']" into its codes.

    The first and last characters (the brackets) are dropped, every single
    quote is removed, and the remainder is split on ', '. An empty list
    string therefore yields [''].
    """
    unbracketed = raw_genres[1:-1]
    unquoted = unbracketed.replace("'", "")
    return unquoted.split(', ')


# Turn each genre string into a list of genre codes
data['genre'] = data['genre'].map(_parse_genre_codes)

In [14]:
def _genre_codes_to_text(genre_codes):
    """Convert a list of genre codes into one space-separated string.

    Each code is looked up in ``genres`` and replaced by its 'name' entry,
    unless the list is the [''] placeholder produced for shows without
    genres, which is kept as is. Every name is then title-cased, stripped
    of spaces so it forms a single token, has its commas turned into
    spaces, and the results are joined with single spaces.
    """
    if genre_codes != ['']:
        genre_names = [genres.get(code)['name'] for code in genre_codes]
    else:
        genre_names = genre_codes

    tokens = []
    for genre_name in genre_names:
        tokens.append(genre_name.title().replace(' ', '').replace(',', ' '))
    return ' '.join(tokens)


# Pull genre titles and format them into a single string per show
data['genre'] = data['genre'].map(_genre_codes_to_text)

In [15]:
# Preview the first rows to check the reformatted genre column
data.head()

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
0,32,KoreanDrama Drama Romance Comedy Crime&Mystery,Sbs,9.58,PG-13,Thirty But Seventeen,"['Shin Hye Sun', 'Yang Se Jong', 'Ahn Hyo Seop...",Jo Soo-Won,Jo Sung-Hee
1,36,Romance Melodrama KoreanDrama,Sbs,8.73,PG-13,Fates and Furies,"['Joo Sang Wook', 'Lee Min Jung', 'Lee Ki Woo'...",Jung Dong-Yoon,Kang Cheol-Woong
2,42,KoreanDrama Drama Romance,Sbs,9.45,PG-13,The Last Empress,"['Shin Sung Rok', 'Jang Nara', 'Choi Jin Hyuk'...",Joo Dong-Min,Kim Sun-Ok
3,16,Romance KoreanDrama,Tvn,9.56,PG-13,Encounter,"['Park Bo Gum', 'Song Hye Kyo', 'Jang Seung Jo...",Park Shin-Woo,Yoo Young-A
4,32,Romance Melodrama Drama KoreanDrama,Sbs,9.55,PG-13,My Strange Hero,"['Yoo Seung Ho', 'Jo Bo Ah', 'Kwak Dong Yeon',...",Ham Joon-Ho,Kim Yoon-Young


## Format Cast names

In [16]:
def _compact_cast(raw_cast):
    """Convert a stringified cast list into space-separated name tokens.

    The surrounding brackets are dropped, single quotes are removed, and
    the text is split on ', '. Spaces inside each name are then removed so
    that every cast member becomes one token, and the tokens are joined
    with single spaces.
    """
    members = raw_cast[1:-1].replace("'", "").split(', ')
    return ' '.join(member.replace(' ', '') for member in members)


data['cast'] = data['cast'].map(_compact_cast)

In [17]:
# Preview the last 130 rows to check the reformatted cast column
data.tail(130)

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
494,16,IdolDrama Romance Sports KoreanDrama Drama,Mbc,8.66,PG-13,No Limit,LeeSangYoon LeeYoonJi JungYunho GoAra,Park Sung-Soo,"Kim Sol-Ji, Kim Ye-Ri"
495,20,Romance KoreanDrama Drama,ChannelA,8.68,PG-13,Goodbye Dear Wife,RyuShiWon HongSooHyun KimMinSoo OhJooEun ParkJ...,Kim Pyung-Joong,Kim Do-Hyun
496,17,Romance KoreanDrama Drama,Mbc,7.69,PG-13,Snowman,KimRaeWon OhYunSoo JoJaeHyun GongHyoJin WangBi...,Lee Chang Soon,Kim Do Woo
497,14,Historical Romance KoreanDrama Drama,Mbc,7.48,PG-13,Next,RyuSooYoung ParkYeJin LeeJongSoo JangShinYoung,"Yu Jeong-Jun, Park Jae-Beom, Kim Do-Hoon","Joo Chan-Ok, Ku Seon-Kyeong, Ko Eun-Nim, Seo S..."
498,43,Romance KoreanDrama Drama,Mbc,7.26,PG-13,Winter Bird,LeeTaeGon HwangJungEum YoonSangHyun JangShinYo...,"Se-ho Jung, Joo Sung-Woo",Kum-ju Lee
499,16,Romance KoreanDrama Comedy Drama,Mbc,8.85,PG-13,The Lawyers of the Great Republi...,LeeSungJae LeeSooKyung RyuSooYoung HanDaGam Ka...,Yoon Jae-Moon,Seo Sook-Hyang
500,2,KoreanDrama IdolDrama WebDrama,NaverTv,8.00,PG-13,Love Cells 2,KimYooJung ImSeulong JoBoAh ParkEunJi ChoiYoun...,Kim Yong Wan,Kim Myung Hyun
501,20,Drama KoreanDrama Crime&Mystery MedicalDrama T...,Sbs,8.94,PG-13,Sign,JungGyuWoon KimAhJoong JunKwangRyul UhmJiWon P...,Jang Hang-Jun,Kim Eun-Hee
502,24,Drama KoreanDrama Romance Family&Kids Melodrama,Kbs,8.63,PG-13,"Again, My Love",ParkSangWon ChoiMyungGil JungGyuWoon JunInHwa ...,Jong-chang Kim,Jo Hee
503,8,Drama KoreanDrama Romance IdolDrama Costume&Pe...,Kbs,9.46,PG-13,Girls’ Generation 1979,Bona ChaeSeoJin LeeJongHyun SeoYoungJoo InGyoJ...,Hong Seok-Ku,"Kim Yong-Hee (novel), Yoon Kyung-Ah"


## Format Director and Writer names

In [20]:
def _compact_names(raw_names, strip_parenthetical=False):
    """Collapse a comma-separated credit string into space-separated tokens.

    Hyphens and spaces are removed so each person becomes a single token,
    and commas are turned into spaces so that several people become several
    tokens (e.g. 'Kim Sol-Ji, Kim Ye-Ri' -> 'KimSolJi KimYeRi').

    Parameters
    ----------
    raw_names : str or missing value
        The credit string as scraped. Anything that is not a string
        (missing values show up as NaN) is returned as ''.
    strip_parenthetical : bool, default False
        When True, parenthesized notes such as '(novel)' are removed
        together with their content.

    Returns
    -------
    str
        The normalized, space-separated name tokens.
    """
    if not isinstance(raw_names, str):
        return ''
    compact = raw_names.replace('-', ' ').replace(' ', '')
    if strip_parenthetical:
        # Raw string: '\(' inside a normal literal is an invalid escape sequence.
        compact = re.sub(r'\(.+?\)', '', compact)
    return compact.replace(',', ' ')


data['director'] = data['director'].map(_compact_names)

# Screenwriter credits can carry parenthesized notes; remove them and their content
data['screenwriter'] = data['screenwriter'].map(
    lambda names: _compact_names(names, strip_parenthetical=True)
)


In [21]:
# Preview the last 130 rows to check the reformatted director and screenwriter columns
data.tail(130)

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
494,16,IdolDrama Romance Sports KoreanDrama Drama,Mbc,8.66,PG-13,No Limit,LeeSangYoon LeeYoonJi JungYunho GoAra,ParkSungSoo,KimSolJi KimYeRi
495,20,Romance KoreanDrama Drama,ChannelA,8.68,PG-13,Goodbye Dear Wife,RyuShiWon HongSooHyun KimMinSoo OhJooEun ParkJ...,KimPyungJoong,KimDoHyun
496,17,Romance KoreanDrama Drama,Mbc,7.69,PG-13,Snowman,KimRaeWon OhYunSoo JoJaeHyun GongHyoJin WangBi...,LeeChangSoon,KimDoWoo
497,14,Historical Romance KoreanDrama Drama,Mbc,7.48,PG-13,Next,RyuSooYoung ParkYeJin LeeJongSoo JangShinYoung,YuJeongJun ParkJaeBeom KimDoHoon,JooChanOk KuSeonKyeong KoEunNim SeoSookHyang S...
498,43,Romance KoreanDrama Drama,Mbc,7.26,PG-13,Winter Bird,LeeTaeGon HwangJungEum YoonSangHyun JangShinYo...,SehoJung JooSungWoo,KumjuLee
499,16,Romance KoreanDrama Comedy Drama,Mbc,8.85,PG-13,The Lawyers of the Great Republi...,LeeSungJae LeeSooKyung RyuSooYoung HanDaGam Ka...,YoonJaeMoon,SeoSookHyang
500,2,KoreanDrama IdolDrama WebDrama,NaverTv,8.00,PG-13,Love Cells 2,KimYooJung ImSeulong JoBoAh ParkEunJi ChoiYoun...,KimYongWan,KimMyungHyun
501,20,Drama KoreanDrama Crime&Mystery MedicalDrama T...,Sbs,8.94,PG-13,Sign,JungGyuWoon KimAhJoong JunKwangRyul UhmJiWon P...,JangHangJun,KimEunHee
502,24,Drama KoreanDrama Romance Family&Kids Melodrama,Kbs,8.63,PG-13,"Again, My Love",ParkSangWon ChoiMyungGil JungGyuWoon JunInHwa ...,JongchangKim,JoHee
503,8,Drama KoreanDrama Romance IdolDrama Costume&Pe...,Kbs,9.46,PG-13,Girls’ Generation 1979,Bona ChaeSeoJin LeeJongHyun SeoYoungJoo InGyoJ...,HongSeokKu,KimYongHee YoonKyungAh


In [22]:
# Re-check column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 9 columns):
# episodes      624 non-null int64
genre           624 non-null object
network         624 non-null object
rating          624 non-null float64
s_rating        624 non-null object
title           624 non-null object
cast            624 non-null object
director        624 non-null object
screenwriter    624 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 44.0+ KB


In [23]:
# Write the frame to CSV, leaving out the index column
clean_content_path = '../data/clean_content.csv'
data.to_csv(clean_content_path, index=False)

In [28]:
# Drop columns that I think have too many irrelevant similarities.
# `columns=` already identifies the axis, so the redundant `axis=1` is left
# out, and the result is reassigned rather than mutating the frame in place.
data = data.drop(columns=['# episodes', 'network', 's_rating'])

In [29]:
# Build one text field per show by joining the genre, cast, director and
# screenwriter strings with single spaces, in that order
content_columns = ['genre', 'cast', 'director', 'screenwriter']
content = data[content_columns[0]]
for column_name in content_columns[1:]:
    content = content + ' ' + data[column_name]
data['content'] = content


In [30]:
# Keep only the show title and its combined content text, as a new frame
merged_data = data[['title', 'content']].copy()

In [31]:
# Preview the first rows of the title/content frame
merged_data.head()

Unnamed: 0,title,content
0,Thirty But Seventeen,KoreanDrama Drama Romance Comedy Crime&Mystery...
1,Fates and Furies,Romance Melodrama KoreanDrama JooSangWook LeeM...
2,The Last Empress,KoreanDrama Drama Romance ShinSungRok JangNara...
3,Encounter,Romance KoreanDrama ParkBoGum SongHyeKyo JangS...
4,My Strange Hero,Romance Melodrama Drama KoreanDrama YooSeungHo...


In [32]:
# Write the title/content frame to CSV, leaving out the index column
merged_content_path = '../data/merged_content.csv'
merged_data.to_csv(merged_content_path, index=False)