# Preparação do Ambiente

In [None]:
!pip install gdown



In [None]:
import gdown

In [None]:
import os

import numpy as np
from numpy import nan as NA
import pandas as pd

# Inspecionando o Dataset

In [None]:
data = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]])

In [None]:
data.head(2)

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


In [None]:
data.tail(2)

Unnamed: 0,0,1,2
1,4,5,6
2,7,8,9


In [None]:
data.columns.values

array([0, 1, 2])

In [None]:
data.columns = ['A', 'B', 'C']

In [None]:
data.columns.values #list(data)

array(['A', 'B', 'C'], dtype=object)

In [None]:
data.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [None]:
data.max()

Unnamed: 0,0
A,7
B,8
C,9


In [None]:
data.max(axis=1) # Máximo por linhas

Unnamed: 0,0
0,3
1,6
2,9


In [None]:
data.mean(axis=0) # axis=1

Unnamed: 0,0
A,4.0
B,5.0
C,6.0


## Seleção de elementos

In [None]:
data[['A','B']]

Unnamed: 0,A,B
0,1,2
1,4,5
2,7,8


In [None]:
data[0:2][['A']] # Slices contínuos, podemos utilizar linhas ou colunas

Unnamed: 0,A
0,1
1,4


Utilizando as operações de `loc` e `iloc`

In [None]:
# 6x5 frame holding 0..29 row by row, with the columns labelled A-E at construction time.
data = pd.DataFrame(np.arange(30).reshape(6, 5), columns=list('ABCDE'))
data

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24
5,25,26,27,28,29


In [None]:
data.loc[0:3]

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [None]:
data.loc[[1,3,5]]

Unnamed: 0,A,B,C,D,E
1,5,6,7,8,9
3,15,16,17,18,19
5,25,26,27,28,29


In [None]:
data.loc[[1,2,3],'A':'C']

Unnamed: 0,A,B,C
1,5,6,7
2,10,11,12
3,15,16,17


In [None]:
data.iloc[0:3] # Índice e não nomes

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14


In [None]:
# Shuffle all rows (frac=1 keeps every row). random_state pins the permutation
# so a fresh "Restart & Run All" shows the same row order every time.
data2 = data.sample(frac=1, random_state=42)
data2

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
5,25,26,27,28,29
3,15,16,17,18,19
1,5,6,7,8,9
2,10,11,12,13,14
4,20,21,22,23,24


In [None]:
data.loc[[0,1,3]]

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
3,15,16,17,18,19


In [None]:
display(data.iloc[0:1])
display(data.loc[0:0])

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4


Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4


Selecionando utilizando condições

In [None]:
# Same filter written twice: loc takes the boolean Series directly, while iloc
# needs a plain (index-free) boolean array plus positional column numbers.
display(data.loc[data['A'] > 5, ['B', 'C']])
display(data.iloc[(data['A'] > 5).to_numpy(), [1, 2]])

Unnamed: 0,B,C
2,11,12
3,16,17
4,21,22
5,26,27


Unnamed: 0,B,C
2,11,12
3,16,17
4,21,22
5,26,27


In [None]:
display(data[data["A"].isin([5,15,10])])

Unnamed: 0,A,B,C,D,E
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


# Lidando com dados ausentes

Como resolver problemas com os `NA`s

In [None]:
data = pd.DataFrame([[1, 6.5, 3], [1, NA, NA], [NA, NA, NA], [NA, 6.5,3]])

In [None]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [None]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [None]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


Fazendo o drop nas colunas

In [None]:
data[3] = NA
data

Unnamed: 0,0,1,2,3
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [None]:
data.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Utilizando um threshold

In [None]:
# Seeded generator: the 7x3 frame of standard-normal draws (and every output
# derived from it below) is identical on each rerun of the notebook.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((7, 3)))

In [None]:
df

Unnamed: 0,0,1,2
0,-1.09451,0.412616,0.610463
1,-0.391866,0.194139,0.018147
2,-1.449297,2.19516,-0.567148
3,0.420827,-0.939561,-0.007653
4,0.569556,-0.367462,-0.563609
5,0.707988,-0.702953,0.078866
6,0.864273,1.282425,0.979933


Introduzir NAs

In [None]:
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA

In [None]:
df

Unnamed: 0,0,1,2
0,-1.09451,,
1,-0.391866,,
2,-1.449297,,-0.567148
3,0.420827,,-0.007653
4,0.569556,-0.367462,-0.563609
5,0.707988,-0.702953,0.078866
6,0.864273,1.282425,0.979933


In [None]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.449297,,-0.567148
3,0.420827,,-0.007653
4,0.569556,-0.367462,-0.563609
5,0.707988,-0.702953,0.078866
6,0.864273,1.282425,0.979933


# Preenchimento de valores faltantes

In [None]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.09451,0.0,0.0
1,-0.391866,0.0,0.0
2,-1.449297,0.0,-0.567148
3,0.420827,0.0,-0.007653
4,0.569556,-0.367462,-0.563609
5,0.707988,-0.702953,0.078866
6,0.864273,1.282425,0.979933


In [None]:
df.fillna({1: 0.5, 2:0})

Unnamed: 0,0,1,2
0,-1.09451,0.5,0.0
1,-0.391866,0.5,0.0
2,-1.449297,0.5,-0.567148
3,0.420827,0.5,-0.007653
4,0.569556,-0.367462,-0.563609
5,0.707988,-0.702953,0.078866
6,0.864273,1.282425,0.979933


Substituir pela média

In [None]:
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,-1.09451,0.07067,-0.015922
1,-0.391866,0.07067,-0.015922
2,-1.449297,0.07067,-0.567148
3,0.420827,0.07067,-0.007653
4,0.569556,-0.367462,-0.563609
5,0.707988,-0.702953,0.078866
6,0.864273,1.282425,0.979933


# Lidando com valores duplicados

In [None]:
# Seven rows in which only the last one, (two, 4), repeats an earlier row exactly.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

In [None]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [None]:
data.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,False
6,True


In [None]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [None]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [None]:
data.drop_duplicates(['k1'], keep='last')

Unnamed: 0,k1,k2
4,one,3
6,two,4


# Atualizando Valores

In [None]:
data = pd.DataFrame([[1, -999, 2, -999, 1000, 3],[1, -999, 2, -999, 1000, 3]])

In [None]:
data

Unnamed: 0,0,1,2,3,4,5
0,1,-999,2,-999,1000,3
1,1,-999,2,-999,1000,3


In [None]:
data = data.replace(-999, NA)

In [None]:
data = data.replace({-999: NA, 1000: 0})

In [None]:
data.iat[0,1] = 100
display(data)

Unnamed: 0,0,1,2,3,4,5
0,1,100.0,2,,0,3
1,1,,2,,0,3


In [None]:
data.at[0,1] = 100
display(data)

Unnamed: 0,0,1,2,3,4,5
0,1,100.0,2,,0,3
1,1,,2,,0,3


In [None]:
# Overwrite the column at position 4 with an array of 5s, one entry per row.
data.iloc[:, 4] = np.full(len(data), 5)
display(data)

Unnamed: 0,0,1,2,3,4,5
0,1,100.0,2,,5,3
1,1,,2,,5,3


# Aplicando funções

Utilizando o dataset do [imdb](https://www.kaggle.com/datasets/arensonz/imdb-extensive-dataset-reupload) disponível no kaggle.

In [None]:
!gdown https://drive.google.com/uc?id=1LyuzFyiMpfDENpuuH6k9raOSDQZYGz19

Downloading...
From: https://drive.google.com/uc?id=1LyuzFyiMpfDENpuuH6k9raOSDQZYGz19
To: /content/IMDb movies.csv
100% 47.9M/47.9M [00:00<00:00, 120MB/s]


In [None]:
!ls

'IMDb movies.csv'   sample_data


In [None]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the warning this cell emitted looks like a mixed-type
# DtypeWarning, which this option avoids; confirm on a rerun.
imdb_dataset = pd.read_csv('IMDb movies.csv', index_col='title', low_memory=False)

  imdb_dataset = pd.read_csv('IMDb movies.csv', index_col='title')


In [None]:
imdb_dataset['avg_vote']

Unnamed: 0_level_0,avg_vote
title,Unnamed: 1_level_1
Miss Jerry,5.9
The Story of the Kelly Gang,6.1
Den sorte drøm,5.8
Cleopatra,5.2
L'Inferno,7.0
...,...
Le lion,5.3
De Beentjes van Sint-Hildegard,7.7
Padmavyuhathile Abhimanyu,7.9
Sokagin Çocuklari,6.4


In [None]:
def classificar_filme(nota_filme, nota_corte=8.0):
    """Label a movie as "bom" or "ruim" based on its rating.

    Args:
        nota_filme: Numeric rating of the movie.
        nota_corte: Minimum rating, inclusive, for a movie to be labelled
            "bom". Defaults to 8.0, the cutoff that used to be hard-coded.

    Returns:
        "bom" when ``nota_filme >= nota_corte``, otherwise "ruim". A NaN
        rating compares as False and therefore yields "ruim".
    """
    return "bom" if nota_filme >= nota_corte else "ruim"

In [None]:
imdb_dataset["avg_vote"].apply(classificar_filme)

Unnamed: 0_level_0,avg_vote
title,Unnamed: 1_level_1
Miss Jerry,ruim
The Story of the Kelly Gang,ruim
Den sorte drøm,ruim
Cleopatra,ruim
L'Inferno,ruim
...,...
Le lion,ruim
De Beentjes van Sint-Hildegard,ruim
Padmavyuhathile Abhimanyu,ruim
Sokagin Çocuklari,ruim


In [None]:
# Small table: one row per movie title plus two integer columns, n1 and n2.
data = pd.DataFrame(
    [
        ("Miss Jerry", 2, 10),
        ("The Story of the Kelly Gang", 5, 5),
        ("Cleopatra", 10, 8),
    ],
    columns=["filme", "n1", "n2"],
)

In [None]:
data

Unnamed: 0,filme,n1,n2
0,Miss Jerry,2,10
1,The Story of the Kelly Gang,5,5
2,Cleopatra,10,8


In [None]:
data.loc[:,'filme':]

Unnamed: 0,filme,n1,n2
0,Miss Jerry,2,10
1,The Story of the Kelly Gang,5,5
2,Cleopatra,10,8


# Discretização/binning

In [None]:
idades = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [None]:
bins = [18, 25, 35, 60, 100]

In [None]:
pd.cut(idades, bins)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [None]:
cats = pd.cut(idades, bins)

In [None]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [None]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [None]:
# pd.value_counts is deprecated and emits a FutureWarning; counting through a
# Series is the supported equivalent and returns the same counts.
pd.Series(cats).value_counts()

  pd.value_counts(cats)


Unnamed: 0,count
"(18, 25]",5
"(25, 35]",3
"(35, 60]",3
"(60, 100]",1


In [None]:
grupos = ['Jovem', 'Adulto', 'Meia-Idade', 'Senior']

In [None]:
pd.cut(idades, bins, labels=grupos)

['Jovem', 'Jovem', 'Jovem', 'Adulto', 'Jovem', ..., 'Adulto', 'Senior', 'Meia-Idade', 'Meia-Idade', 'Adulto']
Length: 12
Categories (4, object): ['Jovem' < 'Adulto' < 'Meia-Idade' < 'Senior']

# Detecção de Outliers

In [None]:
# Seeded generator: the 1000x4 frame of standard-normal draws, and therefore the
# set of outliers found below, is the same on every rerun.
rng = np.random.default_rng(42)
data = pd.DataFrame(rng.standard_normal((1000, 4)))

In [None]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.015885,-0.00151,-0.024013,0.019727
std,1.009024,1.005657,0.994713,0.997402
min,-4.186861,-3.254223,-3.024868,-2.786231
25%,-0.717625,-0.697007,-0.697097,-0.686608
50%,-0.029664,-0.009134,0.005833,0.039646
75%,0.62417,0.700025,0.613441,0.702607
max,3.592436,3.009762,3.394061,3.041399


In [None]:
col = data[2]

Procurando valores (em módulo) maiores do que 3, na coluna de índice `2`

In [None]:
col

Unnamed: 0,2
0,-0.666752
1,0.165461
2,2.421860
3,0.997248
4,0.146751
...,...
995,1.001412
996,0.853473
997,0.717658
998,-0.710315


In [None]:
col[np.abs(col) > 3]

Unnamed: 0,2
76,3.394061
693,-3.024868


Generalizando para o dataset inteiro: seleciona as linhas que possuem **algum** valor cujo módulo seja maior ou igual a 3.

In [None]:
data[(np.abs(data)>=3).any(axis=1)]

Unnamed: 0,0,1,2,3
76,-0.10785,-0.158295,3.394061,0.336407
145,1.096288,-3.254223,-1.402339,1.521437
384,-1.057115,-0.275694,2.256355,3.041399
558,-4.186861,0.183813,-1.375821,1.733852
574,0.173388,3.009762,-1.102566,-0.331275
671,3.592436,-1.695286,-1.839859,1.793979
693,0.854786,-0.496139,-3.024868,-0.996376
766,-3.005797,-0.525015,0.850424,-0.510143


Atribuir o valor de -3 ou +3 para os elementos dessas linhas selecionadas

In [None]:
# Cap only the out-of-range entries at -3/+3. A boolean frame used as the key
# assigns element by element, so in-range values keep their original magnitude.
# The previous row mask (`.any(axis=1)`) replaced every column of the affected
# rows with +-3, distorting values that were never outliers.
data[np.abs(data) > 3] = np.sign(data) * 3

In [None]:
data[(np.abs(data)>=3).any(axis=1)]

Unnamed: 0,0,1,2,3
76,-3.0,-3.0,3.0,3.0
145,3.0,-3.0,-3.0,3.0
384,-3.0,-3.0,3.0,3.0
558,-3.0,3.0,-3.0,3.0
574,3.0,3.0,-3.0,-3.0
671,3.0,-3.0,-3.0,3.0
693,3.0,-3.0,-3.0,-3.0
766,-3.0,-3.0,3.0,-3.0


In [None]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.013244,-0.010299,-0.027768,0.019138
std,1.02356,1.029617,1.013091,1.023534
min,-3.0,-3.0,-3.0,-3.0
25%,-0.720126,-0.71278,-0.697097,-0.696354
50%,-0.029664,-0.009134,0.005833,0.039646
75%,0.632026,0.702718,0.613441,0.704316
max,3.0,3.0,3.0,3.0


# Amostragem

In [None]:
df = pd.DataFrame(np.arange(5*4).reshape(5,4))

In [None]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [None]:
# Draw 3 distinct rows; random_state makes the draw repeatable across reruns.
df.sample(n=3, random_state=42)

Unnamed: 0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
4,16,17,18,19


In [None]:
# Draw 6 rows with replacement (more than the 5 available, so repeats are
# expected); random_state makes the draw repeatable across reruns.
df.sample(n=6, replace=True, random_state=42)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
4,16,17,18,19
1,4,5,6,7
0,0,1,2,3
0,0,1,2,3


# Consumo de dados via API REST

Exemplos de valores JSON (lista, objeto, números e strings):
```json
["valor1", "valor2"]
{"k1": 1,"k2": 50}
1
2
3
"valor1"
"valor2"
```

In [None]:
# Read the NASA API key from the environment instead of committing it to the
# notebook. When NASA_API_KEY is not set, fall back to "DEMO_KEY".
# NOTE(review): DEMO_KEY is presumably NASA's public rate-limited key - confirm.
# NOTE(review): the key that was previously hard-coded here is exposed in the
# notebook history and should be revoked.
NASA_API_KEY = os.environ.get("NASA_API_KEY", "DEMO_KEY")
url = f"https://api.nasa.gov/neo/rest/v1/neo/browse?page=1&api_key={NASA_API_KEY}"

In [None]:
import requests

In [None]:
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
# raise_for_status() surfaces HTTP errors (invalid key, rate limit) right here
# instead of as a KeyError when the JSON payload is indexed later.
resp = requests.get(url, timeout=30)
resp.raise_for_status()

In [None]:
data = resp.json()

In [None]:
data['page']

{'size': 20, 'total_elements': 39319, 'total_pages': 1966, 'number': 1}

In [None]:
asteroids = {"id": [], "nome": [], "perigoso": [], "diametro_min": [],
             "diametro_max":[], "primeira_obs": [], "ultima_obs": []}

In [None]:
# Flatten each near-earth object into the column lists of `asteroids`.
# All fields of a record are extracted before anything is appended, so a
# malformed record raises without leaving the lists with unequal lengths.
for asteroid in data['near_earth_objects']:
    diametro_km = asteroid['estimated_diameter']['kilometers']
    orbita = asteroid['orbital_data']
    registro = {
        'id': asteroid['id'],
        'nome': asteroid['name'],
        'perigoso': asteroid['is_potentially_hazardous_asteroid'],
        'diametro_min': diametro_km['estimated_diameter_min'],
        'diametro_max': diametro_km['estimated_diameter_max'],
        'primeira_obs': orbita['first_observation_date'],
        'ultima_obs': orbita['last_observation_date'],
    }
    for coluna, valor in registro.items():
        asteroids[coluna].append(valor)

In [None]:
len(asteroids['id'])

20

In [None]:
# Build the asteroid table from the collected lists: `columns` selects and
# orders the dict keys to keep ('id' is not among them), while the ids are
# used as the row index labels.
asteroids_data = pd.DataFrame(asteroids, index=asteroids["id"],
                              columns=['nome', 'primeira_obs', 'ultima_obs',
                                       'diametro_min', 'diametro_max', 'perigoso'])

In [None]:
asteroids_data['perigoso'].describe()

Unnamed: 0,perigoso
count,20
unique,2
top,False
freq,13


# Exercícios


Para esse conjunto de exercícios vamos utilizar o conjunto de dados
disponibilizado no dataset [MovieLens 100k](https://grouplens.org/datasets/movielens/100k/)

O conjunto de dados do MovieLens foi coletado pelo GroupLens Research Project
na Universidade de Minnesota.

Este conjunto de dados consiste em:
* 100.000 classificações (1-5) de 943 usuários em 1.682 filmes.
* Cada usuário classificou pelo menos 20 filmes.

O dataset está dividido em diversos arquivos.
Utilizando a biblioteca [Pandas](https://pandas.pydata.org/), implemente
funções que realizem as seguintes tarefas:

#### Considerando os dados de avaliação dos usuários

1. Cálculo da média, desvio padrão e variância para o dataset de avaliações
   completo (por filme);
2. Cálculo de média, desvio padrão e variância para cada usuário (armazenar
   esses valores em novas colunas do dataset);
3. Encontrar indivíduos que avaliam filmes de forma mais uniforme, i.e.,
   cujas avaliações estão próximas do valor da média do indivíduo;

#### Considerando os dados sobre filmes

1. Criar [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html?highlight=dataframe#pandas.DataFrame) que contenha informações sobre filmes:

| movie id | movie title | release date | video release date | IMDb URL |  unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western |
--------|---------|--------|--------|--------|--------|--------|---------|--------|----|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|
|2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|
|3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|
|...|

2. Identificar qual gênero de filme possui o maior número de exemplos;
3. Verificar se existem dados faltando

#### Criando novo dataset

1. Criar novo [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html?highlight=dataframe#pandas.DataFrame) que condense informações sobre o gênero do filme:

| movie id | movie title | release date | video release date | IMDb URL | genre|
-----------|-----------|-----------|-----------|-----------|------------|
|1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|Animation,Children's,Comedy|
|2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|Action,Adventure,Thriller|
|3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|Thriller|
|...|

2. Adicionar colunas que armazenem dados para o total de avaliações, a soma das
   avaliações, média, valor máximo (e mínimo), desvio padrão e variância;
3. Mostrar filmes com maior (e menor) número de avaliações;
4. Normalização é uma das tarefas mais importantes quando estamos preparando um
   dataset para utilizar algoritmos de Machine Learning. Implementar as
   seguintes estratégias de normalização:
   * [Normalização min-max](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling_(min-max_normalization))
   * [Normalização pela média](https://en.wikipedia.org/wiki/Feature_scaling#Mean_normalization)
   * [Normalização Z-score](https://en.wikipedia.org/wiki/Feature_scaling#Standardization_(Z-score_Normalization))