In [2]:
# Importar bibliotecas necessárias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:

# 1. Carregar o dataset
url = 'https://waf.cs.illinois.edu/discovery/berkeley.csv'
df = pd.read_csv(url)
df

Unnamed: 0,Year,Major,Gender,Admission
0,1973,C,F,Rejected
1,1973,B,M,Accepted
2,1973,Other,F,Accepted
3,1973,Other,M,Accepted
4,1973,Other,M,Rejected
...,...,...,...,...
12758,1973,Other,M,Accepted
12759,1973,D,M,Accepted
12760,1973,Other,F,Rejected
12761,1973,Other,M,Rejected


In [5]:
# 2. Verificar suas informações gerais
print("Informações gerais do dataset:")
print(df.info())
print(df.describe())

Informações gerais do dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12763 entries, 0 to 12762
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Year       12763 non-null  int64 
 1   Major      12763 non-null  object
 2   Gender     12763 non-null  object
 3   Admission  12763 non-null  object
dtypes: int64(1), object(3)
memory usage: 399.0+ KB
None
          Year
count  12763.0
mean    1973.0
std        0.0
min     1973.0
25%     1973.0
50%     1973.0
75%     1973.0
max     1973.0


In [6]:
# 3. Verificar dados ausentes
print("\nDados ausentes:")
print(df.isnull().sum())


Dados ausentes:
Year         0
Major        0
Gender       0
Admission    0
dtype: int64


In [18]:
# 4. Verificar outliers no DataFrame
# Usando a mediana e o Q1 para detectar valores fora do rango normal

Q1 = df['Year'].quantile(0.25)
Q3 = df['Year'].quantile(0.75)

IQR = Q3 - Q1

outliers = df[(df['Year'] < (Q1 - 1.5 * IQR)) | (df['Year'] > (Q3 + 1.5 * IQR))]

print("\nOutliers detectados no DataFrame:")
print(outliers)


Outliers detectados no DataFrame:
Empty DataFrame
Columns: [Year, Major, Gender, Admission, Admissao]
Index: []


In [11]:
# 5. Verificar e remover valores repetidos ou duplicados
# Utilizar um método para verificar inconsistências, como a verificação de valores duplicados ou repetidos
print("\nInconsistências:")
print(df.duplicated().sum())
df = df.drop_duplicates()
df



Inconsistências:
0


Unnamed: 0,Year,Major,Gender,Admission
0,1973,C,F,Rejected
1,1973,B,M,Accepted
2,1973,Other,F,Accepted
3,1973,Other,M,Accepted
4,1973,Other,M,Rejected
6,1973,F,F,Accepted
9,1973,A,M,Accepted
10,1973,Other,F,Rejected
12,1973,C,M,Rejected
13,1973,A,M,Rejected


In [None]:
# 6. Criar nova variável "admissão"
# Utilizar um método para criar a nova variável, com base nos valores da coluna "Admission"
df['Admissao'] = np.where(df['Admission'] == 'Rejected', 'Reprovado',
                         np.where(df['Admission'] == 'Accepted', 'Aprovado', ''))

print("\nNova variável criada:")
print(df.info())
df


Nova variável criada:
<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 0 to 1436
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Year       28 non-null     int64 
 1   Major      28 non-null     object
 2   Gender     28 non-null     object
 3   Admission  28 non-null     object
 4   Admissao   28 non-null     object
dtypes: int64(1), object(4)
memory usage: 1.3+ KB
None


Unnamed: 0,Year,Major,Gender,Admission,Admissao
0,1973,C,F,Rejected,Reprovado
1,1973,B,M,Accepted,Aprovado
2,1973,Other,F,Accepted,Aprovado
3,1973,Other,M,Accepted,Aprovado
4,1973,Other,M,Rejected,Reprovado
6,1973,F,F,Accepted,Aprovado
9,1973,A,M,Accepted,Aprovado
10,1973,Other,F,Rejected,Reprovado
12,1973,C,M,Rejected,Reprovado
13,1973,A,M,Rejected,Reprovado


In [16]:

# 7. Converter tipos de dados
# Exemplo de conversão do tipo de dados do campo 'Data' para datetime
# No Python padrão, não existe uma função como a `to_datetime` do pandas,
# então devemos converter isso manualmente.
df['Year'] = pd.to_datetime(df['Year'], format='%Y')
df


Unnamed: 0,Year,Major,Gender,Admission,Admissao
0,1973-01-01,C,F,Rejected,Reprovado
1,1973-01-01,B,M,Accepted,Aprovado
2,1973-01-01,Other,F,Accepted,Aprovado
3,1973-01-01,Other,M,Accepted,Aprovado
4,1973-01-01,Other,M,Rejected,Reprovado
6,1973-01-01,F,F,Accepted,Aprovado
9,1973-01-01,A,M,Accepted,Aprovado
10,1973-01-01,Other,F,Rejected,Reprovado
12,1973-01-01,C,M,Rejected,Reprovado
13,1973-01-01,A,M,Rejected,Reprovado


In [17]:

# 7. Salvar o dataset pré-processado
df.to_csv('dataset_pre_processado.csv', index=False)