# Data Set Cleaning
> Author: Sharnique Beck

This notebook cleans the scraped show data and formats it for vectorizing.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import re
import pickle


In [2]:
# Load the show information CSV into the working DataFrame
data = pd.read_csv('../data/show_info.csv')
# Preview the first rows to check that the columns parsed as expected
data.head()

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
0,32,"['23g', '9g', '18g', '6g', '7g']",SBS,9.58,PG-13,Thirty But Seventeen,"['Shin Hye Sun', 'Yang Se Jong', 'Ahn Hyo Seop...",Jo Soo-Won,Jo Sung-Hee
1,36,"['18g', '1041g', '23g']",SBS,8.73,PG-13,Fates and Furies,"['Joo Sang Wook', 'Lee Min Jung', 'Lee Ki Woo'...",Jung Dong-Yoon,Kang Cheol-Woong
2,42,"['23g', '9g', '18g']",SBS,9.45,PG-13,The Last Empress,"['Shin Sung Rok', 'Jang Nara', 'Choi Jin Hyuk'...",Joo Dong-Min,Kim Sun-Ok
3,16,"['18g', '23g']",tvN,9.56,PG-13,Encounter,"['Park Bo Gum', 'Song Hye Kyo', 'Jang Seung Jo...",Park Shin-Woo,Yoo Young-A
4,32,"['18g', '1041g', '9g', '23g']",SBS,9.55,PG-13,My Strange Hero,"['Yoo Seung Ho', 'Jo Bo Ah', 'Kwak Dong Yeon',...",Ham Joon-Ho,Kim Yoon-Young


In [29]:
# Summarise dtypes and non-null counts for every column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 9 columns):
# episodes      624 non-null int64
genre           624 non-null object
network         624 non-null object
rating          624 non-null float64
s_rating        624 non-null object
title           624 non-null object
cast            624 non-null object
director        624 non-null object
screenwriter    624 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 44.0+ KB


## Clean Network info

In [4]:
# Manual network corrections, keyed by row label in `data`
network_fixes = {
    256: 'MBC',
    272: 'MBC',
    290: 'KBS',
    300: 'SBS',
    438: 'MBC',
    520: 'MBC',
    579: 'MBC',
}

# Apply each correction to the 'network' cell of its row
for row_label, network_name in network_fixes.items():
    data.loc[row_label, 'network'] = network_name

In [27]:
def _network_token(raw):
    """Collapse one network entry into vectorizer-friendly token(s).

    Strings are title-cased, have their spaces removed, and then have
    commas turned into spaces. Any non-string value becomes a single space.
    """
    if type(raw) is not str:
        return ' '
    squeezed = raw.title().replace(' ', '')
    return squeezed.replace(',', ' ')


data['network'] = data['network'].map(_network_token)

In [24]:
# Count the missing values remaining in the network column
data['network'].isnull().sum()

0

In [7]:
# Sanity check: show rows whose network is exactly 'kbs ' (lowercase, trailing space)
data[data['network'] =='kbs ']

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter


In [8]:
# Frequency of each distinct network value
data['network'].value_counts()

Mbc                         224
Sbs                         125
Kbs                         114
Tvn                          44
Ocn                          10
Kbs2                          9
NaverTv                       9
Jtbc                          9
ChannelA                      8
ArirangTv                     6
SbsPlus                       6
EChannel                      4
Mbn Dramax                    3
Mbn                           3
Mnet                          3
SbsMtv                        2
Oksusu                        2
MbcMusic                      2
SohuTv                        2
NaverTvCast ConversionTv      1
KbsW                          1
Onstyle                       1
TvChosun                      1
Tcast                         1
O'Live                        1
Qtv                           1
Xtvn                          1
Dramax Umax                   1
OnStyle                       1
Name: network, dtype: int64

## Genre Formatting

In [9]:
# Load the pickled genre lookup (indexed below by genre id to obtain each 'name').
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("../data/genres.p", "rb") as genres_file:
    genres = pickle.load(genres_file)

In [10]:
def _parse_genre_ids(text):
    """Turn a stringified list such as "['23g', '9g']" into ['23g', '9g'].

    The first and last characters (the brackets) are dropped, every single
    quote is removed, and the remainder is split on ', '.
    """
    inner = text[1:-1]
    unquoted = inner.replace("'", "")
    return unquoted.split(', ')


# Convert the genre column from list-like strings to real lists
data['genre'] = data['genre'].map(_parse_genre_ids)

In [11]:
def _genre_tokens(ids):
    """Convert a list of genre ids into one space-separated string of names.

    Each id is looked up in `genres` and its 'name' taken; a list holding
    only the empty string is passed through without lookup. Every name is
    title-cased, has its spaces removed and its commas turned into spaces,
    and the results are joined with single spaces.
    """
    if ids == ['']:
        names = ids
    else:
        names = [genres.get(genre_id)['name'] for genre_id in ids]
    formatted = [label.title().replace(' ', '').replace(',', ' ') for label in names]
    return ' '.join(formatted)


# Replace each list of genre ids with its formatted genre-name string
data['genre'] = data['genre'].map(_genre_tokens)

In [12]:
# Preview the frame after the genre column has been reformatted
data.head()

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
0,32,KoreanDrama Drama Romance Comedy Crime&Mystery,Sbs,9.58,PG-13,Thirty But Seventeen,"['Shin Hye Sun', 'Yang Se Jong', 'Ahn Hyo Seop...",Jo Soo-Won,Jo Sung-Hee
1,36,Romance Melodrama KoreanDrama,Sbs,8.73,PG-13,Fates and Furies,"['Joo Sang Wook', 'Lee Min Jung', 'Lee Ki Woo'...",Jung Dong-Yoon,Kang Cheol-Woong
2,42,KoreanDrama Drama Romance,Sbs,9.45,PG-13,The Last Empress,"['Shin Sung Rok', 'Jang Nara', 'Choi Jin Hyuk'...",Joo Dong-Min,Kim Sun-Ok
3,16,Romance KoreanDrama,Tvn,9.56,PG-13,Encounter,"['Park Bo Gum', 'Song Hye Kyo', 'Jang Seung Jo...",Park Shin-Woo,Yoo Young-A
4,32,Romance Melodrama Drama KoreanDrama,Sbs,9.55,PG-13,My Strange Hero,"['Yoo Seung Ho', 'Jo Bo Ah', 'Kwak Dong Yeon',...",Ham Joon-Ho,Kim Yoon-Young


## Format Cast names

In [13]:
def _cast_tokens(raw):
    """Turn a stringified cast list into space-separated, space-free names.

    The surrounding brackets are dropped, single quotes are removed, the
    text is split on ', ', spaces inside each name are deleted, and the
    names are joined with single spaces.
    """
    members = raw[1:-1].replace("'", "").split(', ')
    return ' '.join(member.replace(' ', '') for member in members)


data['cast'] = data['cast'].map(_cast_tokens)

In [14]:
# Preview the frame after the cast column has been reformatted
data.head()

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast,director,screenwriter
0,32,KoreanDrama Drama Romance Comedy Crime&Mystery,Sbs,9.58,PG-13,Thirty But Seventeen,ShinHyeSun YangSeJong AhnHyoSeop YeJiWon JungY...,Jo Soo-Won,Jo Sung-Hee
1,36,Romance Melodrama KoreanDrama,Sbs,8.73,PG-13,Fates and Furies,JooSangWook LeeMinJung LeeKiWoo SoYiHyun ParkS...,Jung Dong-Yoon,Kang Cheol-Woong
2,42,KoreanDrama Drama Romance,Sbs,9.45,PG-13,The Last Empress,ShinSungRok JangNara ChoiJinHyuk ShinEunKyung ...,Joo Dong-Min,Kim Sun-Ok
3,16,Romance KoreanDrama,Tvn,9.56,PG-13,Encounter,ParkBoGum SongHyeKyo JangSeungJo P.O NamGiAe S...,Park Shin-Woo,Yoo Young-A
4,32,Romance Melodrama Drama KoreanDrama,Sbs,9.55,PG-13,My Strange Hero,YooSeungHo JoBoAh KwakDongYeon YooSeonHo KimDo...,Ham Joon-Ho,Kim Yoon-Young


## Format Director and Writer names

In [28]:
def _compact_name(value, strip_parens=False):
    """Collapse a name field into vectorizer-friendly token(s).

    Hyphens and spaces are removed so each person becomes one token, and
    commas are then turned into spaces so several people become several
    tokens.

    Parameters
    ----------
    value : object
        Raw cell value. Anything that is not a string (for example a NaN
        float for a missing entry) yields an empty string.
    strip_parens : bool, default False
        When True, parenthesised text such as "(...)" is removed before the
        commas are converted.

    Returns
    -------
    str
        The compacted name string.
    """
    if not isinstance(value, str):
        return ''
    compact = value.replace('-', ' ').replace(' ', '')
    if strip_parens:
        # Raw string: '\(' in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        compact = re.sub(r'\(.+?\)', '', compact)
    return compact.replace(',', ' ')


data['director'] = data['director'].map(_compact_name)

# Screenwriter entries additionally have parentheses and their content removed
data['screenwriter'] = data['screenwriter'].map(lambda name: _compact_name(name, strip_parens=True))


In [30]:
# Re-check dtypes and non-null counts after cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 9 columns):
# episodes      624 non-null int64
genre           624 non-null object
network         624 non-null object
rating          624 non-null float64
s_rating        624 non-null object
title           624 non-null object
cast            624 non-null object
director        624 non-null object
screenwriter    624 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 44.0+ KB


In [18]:
# Write the cleaned frame to CSV; index = False leaves the row index out of the file
data.to_csv('../data/clean_content.csv', index = False)

In [37]:
# Build one space-separated text field per show: the episode count (as text)
# followed by each descriptive column in a fixed order.
content_columns = ['genre', 'network', 's_rating', 'cast', 'director', 'screenwriter']

content = data['# episodes'].astype(str)
for column in content_columns:
    content = content + ' ' + data[column]

data['content'] = content

In [39]:
# Keep only the title and its combined content string.
# Selecting both columns directly replaces wrapping each Series in a
# DataFrame and concatenating them; .copy() makes the result independent of `data`.
merged_data = data[['title', 'content']].copy()

In [40]:
# Preview the title/content pairs
merged_data.head()

Unnamed: 0,title,content
0,Thirty But Seventeen,32 KoreanDrama Drama Romance Comedy Crime&Myst...
1,Fates and Furies,36 Romance Melodrama KoreanDrama Sbs PG-13 Joo...
2,The Last Empress,42 KoreanDrama Drama Romance Sbs PG-13 ShinSun...
3,Encounter,16 Romance KoreanDrama Tvn PG-13 ParkBoGum Son...
4,My Strange Hero,32 Romance Melodrama Drama KoreanDrama Sbs PG-...


In [41]:
# Write the title/content frame to CSV; index = False leaves the row index out of the file
merged_data.to_csv('../data/merged_content.csv', index = False)