In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# A simple pipeline of ETL - Everyday extraction of movies presented at cinemark SP.

# Extract

In [5]:
movie_links = ['https://www.cinemark.com.br/' + movie.find('a')['href'] for movie in soup.find_all('div', attrs={'class':'col-sm-6 col-md-3'})]
movie_links


NameError: name 'soup' is not defined

In [6]:
url = f'https://www.cinemark.com.br/sao-paulo/filmes/em-cartaz'

response = requests.get(url)
soup = BeautifulSoup(response.content)
movie_names = [movie.find('a')['title'].replace('Filme ','') for movie in soup.find_all('div', attrs={'class':'col-sm-6 col-md-3'})]
movie_links = ['https://www.cinemark.com.br/' + movie.find('a')['href'] for movie in soup.find_all('div', attrs={'class':'col-sm-6 col-md-3'})]

classificacao = []
trailers = []
duracao = []

# run through the movies and get some info from it.
df = pd.DataFrame()

for link in movie_links:
    response = requests.get(url)
    soup = BeautifulSoup(response.content)
    classificacao.append(soup.find('div', attrs={'class':'movie-details'}).find_all('span')[1].text)
    trailers.append(soup.find('div', attrs={'class':'movie-details'}).find_all('li')[1].find('a')['href'].replace('//','https://'))



df = pd.concat([df, pd.DataFrame({'name': movie_names,
                                  'trailers': trailers,
                                  })])

df

Unnamed: 0,name,trailers
0,A Maldição do Espelho,https://www.youtube.com/embed/bY46wtgQfEw
1,Aprendiz de Espiã,https://www.youtube.com/embed/bY46wtgQfEw
2,Bloodshot,https://www.youtube.com/embed/bY46wtgQfEw
3,O Oficial e o Espião,https://www.youtube.com/embed/bY46wtgQfEw
4,Solteira Quase Surtando,https://www.youtube.com/embed/bY46wtgQfEw
5,Terremoto,https://www.youtube.com/embed/bY46wtgQfEw
6,Dois Irmãos - Uma Jornada Fantástica,https://www.youtube.com/embed/bY46wtgQfEw
7,O Melhor Está Por Vir,https://www.youtube.com/embed/bY46wtgQfEw
8,Seberg Contra Todos,https://www.youtube.com/embed/bY46wtgQfEw
9,Vou Nadar Até Você,https://www.youtube.com/embed/bY46wtgQfEw


In [7]:
for i in movie_names:
    if sum(i == df.name) != 0:
        print('repetido')

repetido
repetido
repetido
repetido
repetido
repetido
repetido
repetido
repetido
repetido
repetido
repetido


In [8]:
"A Maldição do Espelho" == df.name

0      True
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
Name: name, dtype: bool

# Transform

Of course much more could be done here. This is just an example.

In [9]:
df['date'] = str(datetime.today().year).zfill(4) + str(datetime.today().month).zfill(2) + str(datetime.today().day).zfill(2)

In [10]:
df = df.reset_index(drop=True)

In [11]:
df

Unnamed: 0,name,trailers,date
0,A Maldição do Espelho,https://www.youtube.com/embed/bY46wtgQfEw,20200325
1,Aprendiz de Espiã,https://www.youtube.com/embed/bY46wtgQfEw,20200325
2,Bloodshot,https://www.youtube.com/embed/bY46wtgQfEw,20200325
3,O Oficial e o Espião,https://www.youtube.com/embed/bY46wtgQfEw,20200325
4,Solteira Quase Surtando,https://www.youtube.com/embed/bY46wtgQfEw,20200325
5,Terremoto,https://www.youtube.com/embed/bY46wtgQfEw,20200325
6,Dois Irmãos - Uma Jornada Fantástica,https://www.youtube.com/embed/bY46wtgQfEw,20200325
7,O Melhor Está Por Vir,https://www.youtube.com/embed/bY46wtgQfEw,20200325
8,Seberg Contra Todos,https://www.youtube.com/embed/bY46wtgQfEw,20200325
9,Vou Nadar Até Você,https://www.youtube.com/embed/bY46wtgQfEw,20200325


# Store in database (once a day)

_hint: Don't forget to create the cinemark database_

In [12]:
from sqlalchemy import create_engine

In [13]:
    engine = create_engine('postgresql+psycopg2://postgres:g0disl0vee@localhost/cinemark')
    conn = engine.connect()

In [14]:
df.to_sql('movies', conn, index=False, if_exists='append')

# Mission: Reestructure this simple process into a pipeline.

## <u>COOKIECUTTER</u>: Use cookiecutter to create your new structure of files

`cookiecutter https://github.com/aguiarandre/etl-pipelines`

This will create your pipeline's folder structure.

## <u>ORGANIZATION - USING .PY FILES</u>: Transform the above steps into a structured .py pipeline

Remember to separate the parameters on their own separate file. The connection on another.

## <u>DOCUMENTATION</u>: Document each function of your pipeline. Then use sphinx to create your code's documentation

 Go into the `your_project/docs` folder and `./make.bat html` or `./make html` (don't forget to run `pip install -r requirements.txt`

## <u>IDEMPOTENCY PRINCIPLE</u>: Avoid duplication in your database. Only perform the storage step if today's date is not there.

## BONUS: <u>MORE BENEFITS OF .PY FILES</u>: Create a scheduler to perform this task once a day.

Use **crontab** if you're a Mac user.
After allowing cron to have Full Disk Access on `Security & Privacy`, write in your crontab: 

> `* * * * * full/path/to/your/python/executable full/path/to/pipeline.py`

Use **task-scheduler** if you are on windows. Create a `run.bat` script on the same folder of your `pipeline.py`. Write inside: 

> `python.exe pipeline.py`

Then go to task-scheduler (Agendar Tarefas) and create a new task. Give it a name, a new trigger specifying times and a new action specifying the path/to/your/run.bat and fill in 'Start at' with the path to your /project/src folder where your pipeline lives in.