# THE_OSCAR_AWARDS.CSV CLEANING

In [1]:
import pandas as pd
import re

In [2]:
data_directory = 'data_csv/'

oscar_awards_df = pd.read_csv(data_directory + 'the_oscar_awards.csv')
oscar_awards_df

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False
...,...,...,...,...,...,...,...
10884,2023,2024,96,WRITING (Original Screenplay),Written by Celine Song,Past Lives,False
10885,2023,2024,96,JEAN HERSHOLT HUMANITARIAN AWARD,,,True
10886,2023,2024,96,HONORARY AWARD,"To Angela Bassett, who has inspired audiences ...",,True
10887,2023,2024,96,HONORARY AWARD,"To Mel Brooks, for his comedic brilliance, pro...",,True


In [3]:
oscar_awards_df.shape

(10889, 7)

In [4]:
oscar_awards_df.dtypes

year_film         int64
year_ceremony     int64
ceremony          int64
category         object
name             object
film             object
winner             bool
dtype: object

## CHANGING TYPES OF OBJECT COLUMNS

In [5]:
oscar_awards_df['year_film'] = oscar_awards_df['year_film'].astype(int)
oscar_awards_df['year_ceremony'] = oscar_awards_df['year_ceremony'].astype(int)
oscar_awards_df['category'] = oscar_awards_df['category'].astype('string')
oscar_awards_df['name'] = oscar_awards_df['name'].astype('string')
oscar_awards_df['film'] = oscar_awards_df['film'].astype('string')
oscar_awards_df.dtypes

year_film                 int64
year_ceremony             int64
ceremony                  int64
category         string[python]
name             string[python]
film             string[python]
winner                     bool
dtype: object

## NOW WE TRY TO UNDERSTAND COLUMNS VALUES

In [6]:
columns_values = list(oscar_awards_df.columns.values)
print(columns_values)

['year_film', 'year_ceremony', 'ceremony', 'category', 'name', 'film', 'winner']


In [7]:
# we might change 'film' with 'movie' to be cohesive with the rest of the dataset terms
oscar_awards_df.rename(columns={'year_film': 'year_movie', 'film': 'movie'}, inplace=True)
print(columns_values)

['year_film', 'year_ceremony', 'ceremony', 'category', 'name', 'film', 'winner']


In [8]:
list_of_categories = list(oscar_awards_df['category'].unique())
print(list_of_categories)

['ACTOR', 'ACTRESS', 'ART DIRECTION', 'CINEMATOGRAPHY', 'DIRECTING (Comedy Picture)', 'DIRECTING (Dramatic Picture)', 'ENGINEERING EFFECTS', 'OUTSTANDING PICTURE', 'UNIQUE AND ARTISTIC PICTURE', 'WRITING (Adaptation)', 'WRITING (Original Story)', 'WRITING (Title Writing)', 'SPECIAL AWARD', 'DIRECTING', 'WRITING', 'OUTSTANDING PRODUCTION', 'SOUND RECORDING', 'SHORT SUBJECT (Cartoon)', 'SHORT SUBJECT (Comedy)', 'SHORT SUBJECT (Novelty)', 'ASSISTANT DIRECTOR', 'FILM EDITING', 'MUSIC (Scoring)', 'MUSIC (Song)', 'DANCE DIRECTION', 'WRITING (Screenplay)', 'ACTOR IN A SUPPORTING ROLE', 'ACTRESS IN A SUPPORTING ROLE', 'SHORT SUBJECT (Color)', 'SHORT SUBJECT (One-reel)', 'SHORT SUBJECT (Two-reel)', 'IRVING G. THALBERG MEMORIAL AWARD', 'MUSIC (Original Score)', 'CINEMATOGRAPHY (Black-and-White)', 'CINEMATOGRAPHY (Color)', 'SPECIAL EFFECTS', 'ART DIRECTION (Black-and-White)', 'ART DIRECTION (Color)', 'WRITING (Original Screenplay)', 'DOCUMENTARY (Short Subject)', 'MUSIC (Music Score of a Dramatic

In [9]:
oscar_awards_df.loc[:, 'subcategory'] = oscar_awards_df['category'].str.extract(r'\s*\((.*?)\)\s*')
oscar_awards_df.loc[:, 'category'] = oscar_awards_df['category'].str.replace(r'\s*\(.*?\)', '', regex=True)

oscar_awards_df[['category', 'subcategory']].head(200)

Unnamed: 0,category,subcategory
0,ACTOR,
1,ACTOR,
2,ACTRESS,
3,ACTRESS,
4,ACTRESS,
...,...,...
195,SOUND RECORDING,
196,WRITING,Adaptation
197,WRITING,Adaptation
198,WRITING,Adaptation


In [10]:
print(list_of_categories)

['ACTOR', 'ACTRESS', 'ART DIRECTION', 'CINEMATOGRAPHY', 'DIRECTING (Comedy Picture)', 'DIRECTING (Dramatic Picture)', 'ENGINEERING EFFECTS', 'OUTSTANDING PICTURE', 'UNIQUE AND ARTISTIC PICTURE', 'WRITING (Adaptation)', 'WRITING (Original Story)', 'WRITING (Title Writing)', 'SPECIAL AWARD', 'DIRECTING', 'WRITING', 'OUTSTANDING PRODUCTION', 'SOUND RECORDING', 'SHORT SUBJECT (Cartoon)', 'SHORT SUBJECT (Comedy)', 'SHORT SUBJECT (Novelty)', 'ASSISTANT DIRECTOR', 'FILM EDITING', 'MUSIC (Scoring)', 'MUSIC (Song)', 'DANCE DIRECTION', 'WRITING (Screenplay)', 'ACTOR IN A SUPPORTING ROLE', 'ACTRESS IN A SUPPORTING ROLE', 'SHORT SUBJECT (Color)', 'SHORT SUBJECT (One-reel)', 'SHORT SUBJECT (Two-reel)', 'IRVING G. THALBERG MEMORIAL AWARD', 'MUSIC (Original Score)', 'CINEMATOGRAPHY (Black-and-White)', 'CINEMATOGRAPHY (Color)', 'SPECIAL EFFECTS', 'ART DIRECTION (Black-and-White)', 'ART DIRECTION (Color)', 'WRITING (Original Screenplay)', 'DOCUMENTARY (Short Subject)', 'MUSIC (Music Score of a Dramatic

In [11]:
print(list(oscar_awards_df['subcategory'].unique()))

[<NA>, 'Comedy Picture', 'Dramatic Picture', 'Adaptation', 'Original Story', 'Title Writing', 'Cartoon', 'Comedy', 'Novelty', 'Scoring', 'Song', 'Screenplay', 'Color', 'One-reel', 'Two-reel', 'Original Score', 'Black-and-White', 'Original Screenplay', 'Short Subject', 'Music Score of a Dramatic Picture', 'Scoring of a Musical Picture', 'Music Score of a Dramatic or Comedy Picture', 'Original Motion Picture Story', 'Feature', 'Motion Picture Story', 'Story and Screenplay', 'Screenplay--Adapted', 'Screenplay--Original', 'Live Action', 'Screenplay--based on material from another medium', 'Story and Screenplay--written directly for the screen', 'Music Score--substantially original', 'Scoring of Music--adaptation or treatment', 'Original Music Score', 'Original Score--for a motion picture [not a musical]', 'Score of a Musical Picture--original or adaptation', 'Song--Original for the Picture', 'Story and Screenplay--based on material not previously published or produced', 'Original Song Scor

## ARE THERE NA VALUES IN THE DATASET?

In [12]:
not_available_year_movie = oscar_awards_df['year_movie'].isna().sum()
not_available_year_ceremony = oscar_awards_df['year_ceremony'].isna().sum()
not_available_category = oscar_awards_df['category'].isna().sum()
not_available_ceremony = oscar_awards_df['ceremony'].isna().sum()
not_available_name = oscar_awards_df['name'].isna().sum()
not_available_movie = oscar_awards_df['movie'].isna().sum()
not_available_winner = oscar_awards_df['winner'].isna().sum()

In [13]:
print('not available year films: ', not_available_year_movie,
      '\nnot available year ceremony: ', not_available_year_ceremony,
      '\nnot available category: ', not_available_category,
      '\nnot available ceremony: ', not_available_ceremony,
      '\nnot available names: ', not_available_name,
      '\nnot available film: ', not_available_movie,
      '\nnot available winner: ', not_available_winner)

not available year films:  0 
not available year ceremony:  0 
not available category:  0 
not available ceremony:  0 
not available names:  5 
not available film:  319 
not available winner:  0


## ARE THERE ANY DUPLICATES?

In [14]:
oscar_awards_df.loc[oscar_awards_df['movie'].isna()]

Unnamed: 0,year_movie,year_ceremony,ceremony,category,name,movie,winner,subcategory
16,1927,1928,1,ENGINEERING EFFECTS,Ralph Hammeras,,False,
18,1927,1928,1,ENGINEERING EFFECTS,Nugent Slaughter,,False,
31,1927,1928,1,WRITING,Joseph Farnham,,True,Title Writing
32,1927,1928,1,WRITING,"George Marion, Jr.",,False,Title Writing
33,1927,1928,1,SPECIAL AWARD,Warner Bros.,,True,
...,...,...,...,...,...,...,...,...
10763,2022,2023,95,GORDON E. SAWYER AWARD,Iain Neil,,True,
10885,2023,2024,96,JEAN HERSHOLT HUMANITARIAN AWARD,,,True,
10886,2023,2024,96,HONORARY AWARD,"To Angela Bassett, who has inspired audiences ...",,True,
10887,2023,2024,96,HONORARY AWARD,"To Mel Brooks, for his comedic brilliance, pro...",,True,


In [15]:
num_duplicated_oscars = oscar_awards_df.duplicated().sum()
print('There are a total of {} duplicates '.format(num_duplicated_oscars))

There are a total of 7 duplicates 


In [16]:
oscar_awards_df.loc[oscar_awards_df.duplicated(keep=False)]

Unnamed: 0,year_movie,year_ceremony,ceremony,category,name,movie,winner,subcategory
6219,1983,1984,56,MUSIC,Music by Michel Legrand; Lyric by Alan Bergman...,Yentl,False,Original Song
6220,1983,1984,56,MUSIC,Music by Michel Legrand; Lyric by Alan Bergman...,Yentl,False,Original Song
7066,1991,1992,64,MUSIC,Music by Alan Menken; Lyric by Howard Ashman,Beauty and the Beast,False,Original Song
7068,1991,1992,64,MUSIC,Music by Alan Menken; Lyric by Howard Ashman,Beauty and the Beast,False,Original Song
7394,1994,1995,67,MUSIC,Music by Elton John; Lyric by Tim Rice,The Lion King,False,Original Song
7395,1994,1995,67,MUSIC,Music by Elton John; Lyric by Tim Rice,The Lion King,False,Original Song
8862,2007,2008,80,MUSIC,Music by Alan Menken; Lyric by Stephen Schwartz,Enchanted,False,Original Song
8864,2007,2008,80,MUSIC,Music by Alan Menken; Lyric by Stephen Schwartz,Enchanted,False,Original Song
8865,2007,2008,80,MUSIC,Music by Alan Menken; Lyric by Stephen Schwartz,Enchanted,False,Original Song
9090,2009,2010,82,MUSIC,Music and Lyric by Randy Newman,The Princess and the Frog,False,Original Song


In [17]:
# removing duplicates
oscar_awards_df = oscar_awards_df[~oscar_awards_df.duplicated()]
oscar_awards_df.loc[oscar_awards_df.duplicated()]

Unnamed: 0,year_movie,year_ceremony,ceremony,category,name,movie,winner,subcategory


## ARE THERE ANY UNREASONABLE VALUES?
We noticed that some of the values in 'name' column have a leading space. We should remove that to avoid errors

In [18]:
oscar_awards_df.query('name == "Warner Bros."')

Unnamed: 0,year_movie,year_ceremony,ceremony,category,name,movie,winner,subcategory
102,1929,1930,3,OUTSTANDING PRODUCTION,Warner Bros.,Disraeli,False,
239,1932,1933,6,OUTSTANDING PRODUCTION,Warner Bros.,42nd Street,False,
240,1932,1933,6,OUTSTANDING PRODUCTION,Warner Bros.,I Am a Fugitive from a Chain Gang,False,
298,1934,1935,7,OUTSTANDING PRODUCTION,Warner Bros.,Here Comes the Navy,False,
311,1934,1935,7,SHORT SUBJECT,Warner Bros.,"What, No Men!",False,Comedy
383,1935,1936,8,OUTSTANDING PRODUCTION,Warner Bros.,A Midsummer Night's Dream,False,
479,1936,1937,9,OUTSTANDING PRODUCTION,Warner Bros.,Anthony Adverse,False,
492,1936,1937,9,SHORT SUBJECT,Warner Bros.,Give Me Liberty,True,Color
498,1936,1937,9,SHORT SUBJECT,Warner Bros.,Double or Nothing,False,Two-reel
603,1937,1938,10,OUTSTANDING PRODUCTION,Warner Bros.,The Life of Emile Zola,True,


In [19]:
oscar_awards_df.query('name == " Warner Bros."')

Unnamed: 0,year_movie,year_ceremony,ceremony,category,name,movie,winner,subcategory
33,1927,1928,1,SPECIAL AWARD,Warner Bros.,,True,


In [20]:
oscar_awards_df.loc[:, 'name'] = oscar_awards_df['name'].str.lstrip()
oscar_awards_df.query('name == " Warner Bros."')

Unnamed: 0,year_movie,year_ceremony,ceremony,category,name,movie,winner,subcategory


now we see how dates values are distributed in the dataset

In [21]:
oscar_awards_df['year_movie'].describe()

count    10882.000000
mean      1976.592538
std         27.358105
min       1927.000000
25%       1952.000000
50%       1976.000000
75%       2001.000000
max       2023.000000
Name: year_movie, dtype: float64

In [22]:
oscar_awards_df['year_ceremony'].describe()

count    10882.000000
mean      1977.592538
std         27.358105
min       1928.000000
25%       1953.000000
50%       1977.000000
75%       2002.000000
max       2024.000000
Name: year_ceremony, dtype: float64

values seem to be reasonable, considering they are exactly one year higher

now we could check if there are any films which seem to have a release year bigger then the year of the ceremony:

In [23]:
movies_bigger_than_ceremony_year = oscar_awards_df.query('year_movie > year_ceremony')
if movies_bigger_than_ceremony_year.empty:
    print('There are no values where year of movie release is bigger than ceremony year.')
else:
    print(movies_bigger_than_ceremony_year)

There are no values where year of movie release is bigger than ceremony year.


since the dataset holds oscars nominations until 2024, we should expect to have no ceremony values over 96 (since 2024 was the 96th edition)

In [24]:
bigger_ceremony = oscar_awards_df.query('ceremony > 96')
if bigger_ceremony.empty:
    print('There is no ceremony higher than the 96th edition.')
else:
    print(bigger_ceremony)

There is no ceremony higher than the 96th edition.


## SAVING THE CLEANED DATASET

In [25]:
oscar_awards_df.to_csv('data_cleaned/oscar_awards_cleaned.csv', index=False)

In [26]:
oscar_awards_df = None