# ACTORS.CSV CLEANING

In [1]:
import pandas as pd

In [2]:
# Folder that holds the raw CSV exports.
data_directory = 'data_csv/'

# Read the raw cast table; the bare name on the last line renders a truncated preview.
actors_csv_path = data_directory + 'actors.csv'
actors_df = pd.read_csv(actors_csv_path)
actors_df

Unnamed: 0,id,name,role
0,1000001,Margot Robbie,Barbie
1,1000001,Ryan Gosling,Ken
2,1000001,America Ferrera,Gloria
3,1000001,Ariana Greenblatt,Sasha
4,1000001,Issa Rae,Barbie
...,...,...,...
5798445,1941596,Marc Ma,Ba Cai/巴莱
5798446,1941596,线雨轩,Tata/塔塔
5798447,1941596,Jiang Yixuan,Zuo Yila（Zoila）/佐伊拉
5798448,1941597,Hiroshi Mikami,


In [3]:
# (rows, columns) of the loaded table.
actors_df.shape

(5798450, 3)

In [4]:
# Per-column dtypes of the loaded table.
actors_df.dtypes

id       int64
name    object
role    object
dtype: object

## CHANGING TYPES OF OBJECT COLUMNS

In [5]:
# Move both text columns from the generic object dtype to pandas' dedicated string dtype.
for text_column in ('name', 'role'):
    actors_df[text_column] = actors_df[text_column].astype('string')
actors_df.dtypes

id               int64
name    string[python]
role    string[python]
dtype: object

## NOW WE TRY TO UNDERSTAND THE COLUMN VALUES

In [6]:
# Capture the current column labels as a plain Python list and show them.
column_values = actors_df.columns.tolist()
print(column_values)

['id', 'name', 'role']


In [7]:
# Give the columns self-explanatory names: `id` becomes `movie_id`, `name` becomes `actor_name`.
actors_df = actors_df.rename(columns={'id': 'movie_id', 'name': 'actor_name'})
print(actors_df.columns.tolist())

['movie_id', 'actor_name', 'role']


## ARE THERE NA VALUES IN THE DATASET?

In [8]:
# Count missing entries for all three columns in a single pass, then unpack per column.
na_counts = actors_df[['movie_id', 'actor_name', 'role']].isna().sum()
not_available_ids = na_counts['movie_id']
not_available_names = na_counts['actor_name']
not_available_roles = na_counts['role']
print('Not available ids: {}'.format(not_available_ids),
      '\nNot available names: {}'.format(not_available_names),
      '\nNot available roles: {}'.format(not_available_roles))

Not available ids: 0 
Not available names: 4 
Not available roles: 1361559


In [9]:
# Replace missing roles with an explicit placeholder label.
role_is_missing = actors_df['role'].isna()
actors_df.loc[role_is_missing, 'role'] = 'Not available'

In [10]:
# Display the rows whose actor name is missing.
name_is_missing = actors_df['actor_name'].isna()
actors_df[name_is_missing]

Unnamed: 0,movie_id,actor_name,role
4145738,1443629,,Not available
4281100,1469981,,Self
4306960,1474958,,Cinematography
5430275,1773264,,Not available


## ARE THERE ANY DUPLICATES?

In [11]:
# Flag rows that exactly repeat an earlier row, then count the flags.
duplicate_mask = actors_df.duplicated()
num_duplicates = duplicate_mask.sum()
print('There are a total of {} duplicates.'.format(num_duplicates))

There are a total of 946 duplicates.


In [12]:
# keep=False flags every member of a duplicate group, first occurrences included.
in_duplicate_group = actors_df.duplicated(keep=False)
actors_df[in_duplicate_group]

Unnamed: 0,movie_id,actor_name,role
3967,1000062,Rosie Jones,Lady of the Boot of Jemiah
3993,1000062,Rosie Jones,Lady of the Boot of Jemiah
44615,1000797,Karel Heřmánek,Czech Injured Man
44642,1000797,Karel Heřmánek,Czech Injured Man
47806,1000863,Michael Fennimore,Car Salesman
...,...,...,...
5791299,1937512,David Livet,Not available
5792882,1939290,Gudni Oddgeirsson,Interviewee
5792884,1939290,Gudni Oddgeirsson,Interviewee
5795227,1940468,Ann Victorin,other voices


In [13]:
# Drop every repeated row, keeping the first occurrence of each.
actors_df = actors_df.drop_duplicates()

# Re-check: collect any rows still flagged as repeats after the drop.
duplicated_actors = actors_df[actors_df.duplicated()]

if not duplicated_actors.empty:
    print(duplicated_actors)
else:
    print('There are no more duplicate in the dataset.')

There are no more duplicate in the dataset.


## ARE THERE ANY UNREASONABLE VALUES?
Some values in the `actor_name` column start with stray characters: leading spaces, a leading `!`, or a byte-order mark (U+FEFF).

In [14]:
# Sort by name and preview the first rows — presumably to surface names that
# begin with unexpected leading characters, which sort ahead of ordinary names.
actors_df.sort_values(by='actor_name').head(20)

Unnamed: 0,movie_id,actor_name,role
3043656,1256548,Corrado Fortuna,Nicola
4543435,1524731,Corrado Fortuna,Not available
1170298,1057564,Cirilo Fernández,Himself
2131754,1143850,Douglas Hegdahl,Not available
1760441,1106725,Jack Phelan,Not available
4050827,1425504,Kazi Khaliq,Not available
3992650,1414882,Kevin Hainey,Kevin
4522983,1520004,Misbah,Not available
3294098,1294506,Najma,Not available
3809642,1381567,Ng Gam-Hung,Not available


In [15]:
# Normalise the start of every actor name.
#
# Order matters:
#   1. Remove the byte-order mark (U+FEFF) first. It is not whitespace, so a
#      leading BOM would otherwise shield the spaces behind it from lstrip().
#   2. Strip leading whitespace.
#   3. Strip leading '!' characters.
#   4. Strip leading whitespace again, to catch names of the form "! Name".
# Missing names (NA) pass through unchanged.
cleaned_names = (
    actors_df['actor_name']
    .str.replace('\ufeff', '', regex=False)
    .str.lstrip()
    .str.lstrip('!')
    .str.lstrip()
)

# assign() builds a new frame instead of writing into a filtered one, and the
# second drop_duplicates() removes rows that only became identical once the
# names were normalised (e.g. " Name" vs "Name" for the same movie and role).
actors_df = actors_df.assign(actor_name=cleaned_names).drop_duplicates()

actors_df.sort_values(by='actor_name').head(10)

Unnamed: 0,movie_id,actor_name,role
4913032,1614735,"""Aidan""",Self
5339023,1741632,"""Big"" Sam Amoakoatta","""Big"" Sam Amoakoatta"
5783045,1933304,"""Blasé""",Le Vendeur Juif
3478038,1324505,"""Bowpicker"" Bob Merel",Self
5628501,1849655,"""CJ Chris James",Self
4468023,1508108,"""Canario""",Bruixot
5379996,1755671,"""Chino"" Gómez",Vicente Gauna
3454554,1320755,"""Crazy"" Bob Cook",Himself
5003283,1639658,"""Critical"" Bob Boes",Shed Owner
3412955,1313949,"""Cuiri"" Cristino Ramírez",Not available


## SAVING THE CLEANED DATASET

In [16]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
actors_df.to_csv('data_cleaned/actors_cleaned.csv', index=False)

In [17]:
# Rebind the name to None so this notebook no longer holds a reference to the frame.
actors_df = None