# CREW.CSV CLEANING

In [1]:
import os

import pandas as pd

In [2]:
# Folder with the raw CSV inputs; the trailing slash matters because the file
# name is appended directly to this string.
data_directory = 'data_csv/'

# Read the raw crew table and show it as the cell result.
crew_df = pd.read_csv(f'{data_directory}crew.csv')
crew_df

Unnamed: 0,id,role,name
0,1000001,Director,Greta Gerwig
1,1000001,Producer,Tom Ackerley
2,1000001,Producer,Margot Robbie
3,1000001,Producer,Robbie Brenner
4,1000001,Producer,David Heyman
...,...,...,...
4720178,1941596,Casting,线雨轩
4720179,1941596,Editor,Eric Kwong Chi-Leung
4720180,1941596,Cinematography,Kenny Tse
4720181,1941596,Composer,胡小欧


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
crew_df.shape

(4720183, 3)

In [4]:
# Per-column dtypes of the frame as loaded, before any type conversion.
crew_df.dtypes


id       int64
role    object
name    object
dtype: object

## CHANGING TYPES OF OBJECT COLUMNS

In [5]:
# Convert both text columns from generic `object` to pandas' dedicated string
# dtype in a single call, then show the resulting dtypes.
crew_df = crew_df.astype({'role': 'string', 'name': 'string'})
crew_df.dtypes

id               int64
role    string[python]
name    string[python]
dtype: object

## NOW WE TRY TO UNDERSTAND THE COLUMN VALUES

In [6]:
# Open question: the NA check further down finds a missing crew member name - could we try to fix that NaN value?

In [7]:
# Print the current column labels as a plain Python list.
print(crew_df.columns.tolist())

['id', 'role', 'name']


In [8]:
# Give the generic columns more descriptive names, then print the new labels.
crew_df = crew_df.rename(columns={'id': 'movie_id', 'name': 'crew_member_name'})
print(crew_df.columns.tolist())

['movie_id', 'role', 'crew_member_name']


In [9]:
# Collect every distinct role in order of first appearance and print them.
list_of_roles = crew_df['role'].drop_duplicates().tolist()
print(list_of_roles)

['Director', 'Producer', 'Writer', 'Casting', 'Editor', 'Cinematography', 'Assistant director', 'Additional directing', 'Executive producer', 'Lighting', 'Camera operator', 'Additional photography', 'Production design', 'Art direction', 'Set decoration', 'Special effects', 'Visual effects', 'Stunts', 'Choreography', 'Composer', 'Songs', 'Sound', 'Costume design', 'Makeup', 'Hairstyling', 'Story', 'Original writer', 'Title design', 'Co-director']


In [10]:
# Trim whitespace on BOTH sides of every name. Stripping only the left side
# leaves trailing blanks behind, so otherwise identical names would still
# compare as different in the sort and in the duplicate check further down.
crew_df['crew_member_name'] = crew_df['crew_member_name'].str.strip()

# Sort alphabetically by name and peek at the first rows to spot odd entries.
sorted_names = crew_df.sort_values(by='crew_member_name')
sorted_names.head()

Unnamed: 0,movie_id,role,crew_member_name
3797842,1582829,Cinematography,"""Chaco"" Ramirez"
4119968,1688178,Stunts,"""Cornet"""
885141,1039214,Writer,"""Dhinanthorum"" Nagarajan"
3607495,1525355,Sound,"""Doc"" Pierce"
4119963,1688178,Writer,"""Eira"""


## ARE THERE NA VALUES IN THE DATASET?

In [11]:
# Count the missing values in each of the three columns.
not_available_movie_ids, not_available_roles, not_available_member_names = (
    crew_df[column].isna().sum()
    for column in ('movie_id', 'role', 'crew_member_name')
)

# Report the three counts, one per line.
print(f'Not available movie ids: {not_available_movie_ids}',
      f'\nNot available roles: {not_available_roles}',
      f'\nNot available crew members: {not_available_member_names}')

Not available movie ids: 0 
Not available roles: 0 
Not available crew members: 1


Since there's just one NA value, we could try to fix it.

In [12]:
# Select the row(s) whose crew member name is missing.
na_crew_member = crew_df[crew_df['crew_member_name'].isna()]

# Verdict after inspecting the row: not enough information to fill the name in.
na_crew_member

Unnamed: 0,movie_id,role,crew_member_name
4562126,1859397,Writer,


## ARE THERE ANY DUPLICATES?

In [13]:
# Count rows that exactly repeat an earlier row; the first occurrence of each
# group is not counted.
num_duplicates_crew = crew_df.duplicated(keep='first').sum()
print(f'There are a total of {num_duplicates_crew} duplicates.')

There are a total of 1282 duplicates.


In [14]:
# Show every row that belongs to a duplicate group, first occurrences included.
crew_df[crew_df.duplicated(keep=False)]

Unnamed: 0,movie_id,role,crew_member_name
1680,1000018,Stunts,Chris Webb
1721,1000018,Stunts,Chris Webb
2690,1000031,Stunts,Sarah Irwin
2691,1000031,Stunts,Sarah Irwin
2692,1000031,Stunts,Sarah Irwin
...,...,...,...
4718342,1940904,Assistant director,Choe Yeong-sik
4719627,1941357,Executive producer,Josh Earl
4719628,1941357,Executive producer,Josh Earl
4720108,1941521,Sound,Oscar van Hoogevest


In [15]:
# Keep only the first occurrence of each fully identical row.
crew_df = crew_df.drop_duplicates()

# Sanity check: this selection should now be empty.
crew_df[crew_df.duplicated()]

Unnamed: 0,movie_id,role,crew_member_name


In [16]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would otherwise fail when the folder is absent.
os.makedirs('data_cleaned', exist_ok=True)

# Save the cleaned table without writing the row index.
crew_df.to_csv('data_cleaned/crew_cleaned.csv', index=False)