<a href="https://colab.research.google.com/github/rafaelbraf/oscar-winners-notebook/blob/master/oscar_winners.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
%matplotlib inline

import numpy as np
import os
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
# Load the dataset file (comma-separated, Latin-1 encoded)
oscar_winners = pd.read_csv(
    'Oscars-demographics-DFE.csv',
    sep=',',
    encoding='latin1',
)
# Preview the first rows
oscar_winners.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,birthplace,birthplace:confidence,date_of_birth,date_of_birth:confidence,race_ethnicity,...,award,biourl,birthplace_gold,date_of_birth_gold,movie,person,race_ethnicity_gold,religion_gold,sexual_orientation_gold,year_of_award_gold
0,670454353,False,finalized,3,2/10/15 3:45,"Chisinau, Moldova",1.0,30-Sep-1895,1.0,White,...,Best Director,http://www.nndb.com/people/320/000043191/,,,Two Arabian Knights,Lewis Milestone,,,,
1,670454354,False,finalized,3,2/10/15 2:03,"Glasgow, Scotland",1.0,2-Feb-1886,1.0,White,...,Best Director,http://www.nndb.com/people/626/000042500/,,,The Divine Lady,Frank Lloyd,,,,
2,670454355,False,finalized,3,2/10/15 2:05,"Chisinau, Moldova",1.0,30-Sep-1895,1.0,White,...,Best Director,http://www.nndb.com/people/320/000043191/,,,All Quiet on the Western Front,Lewis Milestone,,,,
3,670454356,False,finalized,3,2/10/15 2:04,"Chicago, Il",1.0,23-Feb-1899,1.0,White,...,Best Director,http://www.nndb.com/people/544/000041421/,,,Skippy,Norman Taurog,,,,
4,670454357,False,finalized,3,2/10/15 1:48,"Salt Lake City, Ut",1.0,23-Apr-1894,1.0,White,...,Best Director,http://www.nndb.com/people/292/000044160/,,,Bad Girl,Frank Borzage,,,,


In [None]:
# Check the number of rows and columns
oscar_winners.shape

(441, 27)

In [None]:
# Drop fully duplicated rows (the first occurrence is kept, original index labels
# are preserved) and re-check the dimensions
oscar_winners = oscar_winners.drop_duplicates(keep='first', ignore_index=False)
oscar_winners.shape

(441, 27)

In [None]:
# Count how many missing values each column of the dataset contains
oscar_winners.isna().sum()

_unit_id                           0
_golden                            0
_unit_state                        0
_trusted_judgments                 0
_last_judgment_at                 25
birthplace                         0
birthplace:confidence              0
date_of_birth                      0
date_of_birth:confidence           0
race_ethnicity                     0
race_ethnicity:confidence          0
religion                           0
religion:confidence                0
sexual_orientation                 0
sexual_orientation:confidence      0
year_of_award                      0
year_of_award:confidence           0
award                              0
biourl                             0
birthplace_gold                  432
date_of_birth_gold               433
movie                              0
person                             0
race_ethnicity_gold              439
religion_gold                    431
sexual_orientation_gold          438
year_of_award_gold               430
d

In [None]:
# For each column that had missing values, add a companion "<column>_nulls" column
# holding 'null' where the original value is missing and 'n/a' otherwise.
columns_with_nulls = [
    '_last_judgment_at',
    'birthplace_gold',
    'date_of_birth_gold',
    'race_ethnicity_gold',
    'religion_gold',
    'sexual_orientation_gold',
    'year_of_award_gold',
]
for column in columns_with_nulls:
    is_missing = oscar_winners[column].isna()
    oscar_winners[f'{column}_nulls'] = np.where(is_missing, 'null', 'n/a')

# Inspect the dataset
oscar_winners.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,birthplace,birthplace:confidence,date_of_birth,date_of_birth:confidence,race_ethnicity,...,religion_gold,sexual_orientation_gold,year_of_award_gold,_last_judgment_at_nulls,birthplace_gold_nulls,date_of_birth_gold_nulls,race_ethnicity_gold_nulls,religion_gold_nulls,sexual_orientation_gold_nulls,year_of_award_gold_nulls
0,670454353,False,finalized,3,2/10/15 3:45,"Chisinau, Moldova",1.0,30-Sep-1895,1.0,White,...,,,,,,,,,,
1,670454354,False,finalized,3,2/10/15 2:03,"Glasgow, Scotland",1.0,2-Feb-1886,1.0,White,...,,,,,,,,,,
2,670454355,False,finalized,3,2/10/15 2:05,"Chisinau, Moldova",1.0,30-Sep-1895,1.0,White,...,,,,,,,,,,
3,670454356,False,finalized,3,2/10/15 2:04,"Chicago, Il",1.0,23-Feb-1899,1.0,White,...,,,,,,,,,,
4,670454357,False,finalized,3,2/10/15 1:48,"Salt Lake City, Ut",1.0,23-Apr-1894,1.0,White,...,,,,,,,,,,


In [None]:
# Replace the missing values.
# _last_judgment_at, date_of_birth_gold and year_of_award_gold are filled with 0;
# the remaining *_gold columns are filled with "N/A".
fill_values = {
    '_last_judgment_at': 0,
    'date_of_birth_gold': 0,
    'year_of_award_gold': 0,
    'birthplace_gold': 'N/A',
    'race_ethnicity_gold': 'N/A',
    'religion_gold': 'N/A',
    'sexual_orientation_gold': 'N/A',
}
for column, placeholder in fill_values.items():
    oscar_winners[column] = oscar_winners[column].fillna(placeholder)

# Confirm that every null cell has actually been filled
oscar_winners.isnull().sum()

_unit_id                         0
_golden                          0
_unit_state                      0
_trusted_judgments               0
_last_judgment_at                0
birthplace                       0
birthplace:confidence            0
date_of_birth                    0
date_of_birth:confidence         0
race_ethnicity                   0
race_ethnicity:confidence        0
religion                         0
religion:confidence              0
sexual_orientation               0
sexual_orientation:confidence    0
year_of_award                    0
year_of_award:confidence         0
award                            0
biourl                           0
birthplace_gold                  0
date_of_birth_gold               0
movie                            0
person                           0
race_ethnicity_gold              0
religion_gold                    0
sexual_orientation_gold          0
year_of_award_gold               0
_last_judgment_at_nulls          0
birthplace_gold_null

In [None]:
# Convert both the "last judgment at" and "date of birth (gold)" columns to datetime.
# An earlier cell replaced the missing values in these columns with the placeholder 0.
# pd.to_datetime reads a bare integer as an offset from the Unix epoch, which would
# turn every placeholder into a bogus 1970-01-01 timestamp, so the placeholder is
# masked out first and those rows end up as NaT (the *_nulls flag columns still
# record which rows were missing).
for date_col in ('_last_judgment_at', 'date_of_birth_gold'):
    raw_values = oscar_winners[date_col]
    oscar_winners[date_col] = pd.to_datetime(raw_values.mask(raw_values == 0))

# Check that the conversion worked
oscar_winners.head()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,birthplace,birthplace:confidence,date_of_birth,date_of_birth:confidence,race_ethnicity,...,religion_gold,sexual_orientation_gold,year_of_award_gold,_last_judgment_at_nulls,birthplace_gold_nulls,date_of_birth_gold_nulls,race_ethnicity_gold_nulls,religion_gold_nulls,sexual_orientation_gold_nulls,year_of_award_gold_nulls
0,670454353,False,finalized,3,2015-02-10 03:45:00,"Chisinau, Moldova",1.0,30-Sep-1895,1.0,White,...,,,0.0,,,,,,,
1,670454354,False,finalized,3,2015-02-10 02:03:00,"Glasgow, Scotland",1.0,2-Feb-1886,1.0,White,...,,,0.0,,,,,,,
2,670454355,False,finalized,3,2015-02-10 02:05:00,"Chisinau, Moldova",1.0,30-Sep-1895,1.0,White,...,,,0.0,,,,,,,
3,670454356,False,finalized,3,2015-02-10 02:04:00,"Chicago, Il",1.0,23-Feb-1899,1.0,White,...,,,0.0,,,,,,,
4,670454357,False,finalized,3,2015-02-10 01:48:00,"Salt Lake City, Ut",1.0,23-Apr-1894,1.0,White,...,,,0.0,,,,,,,


In [None]:
# Rename four of the metadata columns by dropping their leading underscore
underscore_columns = ('_unit_id', '_golden', '_unit_state', '_trusted_judgments')
new_names = {old_name: old_name[1:] for old_name in underscore_columns}
oscar_winners = oscar_winners.rename(columns=new_names)
oscar_winners.head()

Unnamed: 0,unit_id,golden,unit-state,trusted_judgments,_last_judgment_at,birthplace,birthplace:confidence,date_of_birth,date_of_birth:confidence,race_ethnicity,...,religion_gold,sexual_orientation_gold,year_of_award_gold,_last_judgment_at_nulls,birthplace_gold_nulls,date_of_birth_gold_nulls,race_ethnicity_gold_nulls,religion_gold_nulls,sexual_orientation_gold_nulls,year_of_award_gold_nulls
0,670454353,False,finalized,3,2015-02-10 03:45:00,"Chisinau, Moldova",1.0,30-Sep-1895,1.0,White,...,,,0.0,,,,,,,
1,670454354,False,finalized,3,2015-02-10 02:03:00,"Glasgow, Scotland",1.0,2-Feb-1886,1.0,White,...,,,0.0,,,,,,,
2,670454355,False,finalized,3,2015-02-10 02:05:00,"Chisinau, Moldova",1.0,30-Sep-1895,1.0,White,...,,,0.0,,,,,,,
3,670454356,False,finalized,3,2015-02-10 02:04:00,"Chicago, Il",1.0,23-Feb-1899,1.0,White,...,,,0.0,,,,,,,
4,670454357,False,finalized,3,2015-02-10 01:48:00,"Salt Lake City, Ut",1.0,23-Apr-1894,1.0,White,...,,,0.0,,,,,,,
