# STUDIOS.CSV CLEANING

In [15]:
from pathlib import Path

import pandas as pd

In [16]:
# Folder that holds the raw CSV inputs.
data_directory = 'data_csv/'

# Read the raw studios table and show it as the cell output.
studios_csv_path = data_directory + 'studios.csv'
studio_df = pd.read_csv(studios_csv_path)
studio_df

Unnamed: 0,id,studio
0,1000001,LuckyChap Entertainment
1,1000001,Heyday Films
2,1000001,NB/GG Pictures
3,1000001,Mattel
4,1000001,Warner Bros. Pictures
...,...,...
679278,1941596,上海猫眼影业有限公司
679279,1941596,坏小子（北京）传媒有限公司
679280,1941596,亚太国影（重庆）文化传媒有限公司
679281,1941596,凤凰传奇影业有限公司


In [17]:
# (rows, columns) of the frame as loaded.
studio_df.shape

(679283, 2)

In [18]:
# Per-column dtypes as inferred by read_csv.
studio_df.dtypes

id         int64
studio    object
dtype: object

## CHANGING TYPES OF OBJECT COLUMNS

In [19]:
# Store the studio names with pandas' dedicated string dtype rather than
# the generic object dtype, then display the resulting column dtypes.
studio_as_string = studio_df['studio'].astype('string')
studio_df['studio'] = studio_as_string
studio_df.dtypes

id                 int64
studio    string[python]
dtype: object

## NOW WE TRY TO UNDERSTAND THE COLUMN NAMES

In [20]:
# Collect the column labels into a plain list and print them.
columns_values = list(studio_df.columns)
print(columns_values)

['id', 'studio']


In [7]:
# Rename 'id' to 'movie_id' by rebinding the frame to the renamed copy,
# then print the updated column labels.
studio_df = studio_df.rename(columns={'id': 'movie_id'})
columns_values = list(studio_df.columns)
print(columns_values)

['movie_id', 'studio']


## ARE THERE NA VALUES IN THE DATASET?

In [8]:
# Count missing entries per column in a single pass, then report both totals.
# NOTE(review): the missing studio names reported here are not dropped or
# filled by any later cell visible in this notebook — confirm that is intended.
na_counts = studio_df[['movie_id', 'studio']].isna().sum()
not_available_ids = na_counts['movie_id']
not_available_studios = na_counts['studio']
print('Not available ids: {}'.format(not_available_ids),
      '\nNot available names: {}'.format(not_available_studios))

Not available ids: 0 
Not available names: 10


## ARE THERE ANY DUPLICATES?


In [9]:
# duplicated() marks every repeat of an earlier identical row (the first
# occurrence is not flagged); summing the mask gives the number of repeats.
is_repeated_row = studio_df.duplicated()
num_duplicated_studios = is_repeated_row.sum()
print('There are a total of {} duplicates '.format(num_duplicated_studios))

There are a total of 212 duplicates 


In [10]:
# keep=False flags every member of a duplicate group (including the first
# occurrence), so the repeated rows can be inspected next to each other.
in_duplicate_group = studio_df.duplicated(keep=False)
studio_df[in_duplicate_group]

Unnamed: 0,movie_id,studio
145,1000044,Working Title Films
146,1000044,Working Title Films
485,1000165,Working Title Films
487,1000165,Working Title Films
809,1000263,Working Title Films
...,...,...
656807,1863229,Star Media
665452,1888903,Deutsche Film- und Fernsehakademie Berlin (DFFB)
665454,1888903,Deutsche Film- und Fernsehakademie Berlin (DFFB)
677099,1934998,Ministerstvo kultury ČR


In [11]:
# Remove repeated rows, keeping the first occurrence of each one.
studio_df = studio_df.drop_duplicates()

# Sanity check: this selection should now be empty.
studio_df[studio_df.duplicated()]

Unnamed: 0,movie_id,studio


## SAVING THE CLEANED DATASET

In [12]:
# Write the cleaned table to disk without the positional index column.
# to_csv raises OSError when the parent folder is missing, so create the
# output folder first; this keeps a fresh checkout runnable end to end.
output_path = Path('data_cleaned') / 'studios_cleaned.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
studio_df.to_csv(output_path, index=False)

In [13]:
# Rebind the name to None so this notebook no longer holds a reference to the DataFrame.
studio_df = None