# Section 1: Trying out BeautifulSoup #
***

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Simple test: download one IMDb list page and print "200, OK!" on success,
# or the response object (which shows the status code) otherwise.

# Browser-like User-Agent header sent with the request.
head = {"User-Agent":"Chrome"}
URL = 'https://www.imdb.com/list/ls574098070/'

# requests applies no timeout by default, so a stalled connection would block
# this cell forever; give up after 10 seconds instead.
response = requests.get(URL, headers=head, timeout=10)

# check if request was successful (status code == 200)
if response.status_code == 200:
    print("200, OK!")
else:
    print(response)

In [None]:
# Parse the downloaded HTML so that specific elements can be pulled out of it.

# Body of the response fetched in the previous cell.
html_content = response.content
# "html.parser" tells BeautifulSoup to use the built-in HTML parser.
soup = BeautifulSoup(html_content, features="html.parser")

In [None]:
# Print the page's <title> element, tag included.
print(soup.title)

In [None]:
# Print only the text inside the <title> tag.
# NOTE(review): soup.title is None when a page has no <title>, in which case
# this line would raise AttributeError.
print(soup.title.string)

In [None]:
# Count the <a> (link) tags found in the page and report the total.
anchor_tags = soup.find_all("a")
page_links = len(anchor_tags)
print(f"There are {page_links} links in this page.")

In [None]:
# Test print text from the page
#print(soup.get_text())

In [None]:
# Now look elements up by tag name. On the IMDb list pages the entry titles
# appear in <h3> tags, so that is the tag we search for.
soup.find_all("h3")

In [None]:
# Keep every <h3> tag of the page for the cells below.
titles = soup.find_all("h3")

In [None]:
# Show the text of the first 20 <h3> headings, one per line.
for heading in titles[:20]:
    heading_text = heading.text
    print(heading_text)

In [None]:
# Collect the whitespace-trimmed text of every <h3> heading on the page.
h3_titles = []
for heading in soup.find_all('h3'):
    h3_titles.append(heading.get_text().strip())

# Only the first 20 headings are kept for the DataFrame test below.
limited_titles = h3_titles[:20]
print(limited_titles)

In [None]:
import pandas as pd

# One-column frame holding the first 20 headings; preview its first rows.
df = pd.DataFrame({'Title': limited_titles})
df.head()

***
Done testing.

#### Notes:

1. We can build a big loop over each site to get its top 10 or 20.
2. Inside it, we will build a nested for loop that goes through each site, finds all h3 tags, and strips the numbering (e.g. "1.", "2.").
3. Then append them to an empty list, which will become a column named 'titles'.
4. After that, we can use value_counts() to count the number of times each title appears.
5. The final dataframe should have a 'title' column and a 'count' column.
6. Export the final dataframe as a .csv file.

# Section 2: Actual Loop #
***

In [48]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# To disguise/by-pass 403 - block by IMDB
head = {"User-Agent":"Chrome"}

# From IMDB: lists/lsXXXXXXXXX/
urls_list = [
    'https://www.imdb.com/list/ls026320360/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls041295777/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls029540683/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls029528149/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls044081393/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls095319760/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls041593310/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls091650455/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls093951797/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls086167623/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls086805851/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls080705486/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls084374702/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls501352531/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls574741945/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls574098070/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls093971121/?sort=user_rating%2Cdesc',
    'https://www.imdb.com/list/ls063868333/?sort=user_rating%2Cdesc',
]

# Status flag read by the scraping cell:
#    1 -> every URL answered 200
#   -1 -> at least one URL answered something else
#    0 -> nothing was checked (empty urls_list)
check = 0

# URLs that did not answer 200. Failures are collected instead of overwriting
# `check` on every iteration, so a failure early in the list can no longer be
# masked by a success on the last URL.
failed_urls = []

# Loop through the url list and check all return 200 ok
for url in urls_list:
    # requests applies no timeout by default; give up after 10 seconds rather
    # than letting one stalled connection block the whole loop.
    response = requests.get(url, headers=head, timeout=10)
    if response.status_code == 200:
        print("200, OK!")
    else:
        failed_urls.append(url)
        print(response)

if failed_urls:
    check = -1
elif urls_list:
    check = 1

if check == 1:
    print("Done checking!")
else:
    print("Something went wrong!")

200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
200, OK!
Done checking!


In [49]:
# Using BeautifulSoup to parse the HTML code and extract specific data.

# Heading texts gathered from every list page, in page order. Created before
# the guard so that `df` is always rebuilt by this cell (empty when the status
# check failed) instead of being left undefined or stale.
all_h3_titles = []

# Check if check = 1 (means all 200 ok)
if check == 1:
    # For loop to go through list of urls
    for url in urls_list:
        # requests applies no timeout by default; give up after 10 seconds.
        response = requests.get(url, headers=head, timeout=10)
        if response.status_code != 200:
            # The page is fetched again here, so it can fail even though the
            # earlier check passed; skip it rather than parse an error page.
            print(f"Skipping {url}: {response}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        titles = soup.find_all("h3")

        # Nested for loop to collect at most the first 20 <h3> headings of the page
        for title in titles[:20]:
            # strip() only trims surrounding whitespace; the text still carries
            # its list numbering (e.g. "1. ") at this point.
            clean_title = title.text.strip()
            all_h3_titles.append(clean_title)
else:
    print("Skipping scrape: not every URL returned 200.")

df = pd.DataFrame(all_h3_titles, columns=['Title'])

In [50]:
# Preview the first rows of the scraped titles (numbering still attached).
df.head()

Unnamed: 0,Title
0,1. To All the Boys I've Loved Before
1,2. Kodachrome
2,3. A Futile and Stupid Gesture
3,4. Set It Up
4,5. When We First Met


In [51]:
# More on regex here: https://hostman.com/tutorials/how-to-delete-a-character-from-a-string-in-python/
# Remove only the list numbering at the START of each title ("12. Title" -> "Title").
#   ^\s*   anchors the match to the beginning, so digits followed by a dot
#          inside a title (e.g. "... 2.0") are left untouched
#   \d+\.  the number and its dot
#   \s*    the whitespace after the dot, so no leading space is left behind
df['Title'] = df['Title'].str.replace(r'^\s*\d+\.\s*', '', regex=True)
df.head()

Unnamed: 0,Title
0,To All the Boys I've Loved Before
1,Kodachrome
2,A Futile and Stupid Gesture
3,Set It Up
4,When We First Met


In [52]:
# (rows, columns): one row per scraped heading, one 'Title' column.
df.shape

(356, 1)

In [53]:
# Count how many times each title occurs across all scraped lists.
title_frequencies = df['Title'].value_counts()

# Turn the counted Series into a two-column frame (title, number of occurrences).
netflix_counts = title_frequencies.reset_index()
netflix_counts.head()

Unnamed: 0,index,Title
0,Bridgerton,7
1,Marriage Story,5
2,The Magicians,4
3,The Queen's Gambit,3
4,The Irishman,3


In [54]:
# Rename the columns by position: the first column holds the title,
# the second the number of times it occurred.
netflix_counts.columns = ['title','count']

In [55]:
# Preview the frame to confirm the new column names.
netflix_counts.head()

Unnamed: 0,title,count
0,Bridgerton,7
1,Marriage Story,5
2,The Magicians,4
3,The Queen's Gambit,3
4,The Irishman,3


In [56]:
# Summary of the frame: number of entries, columns, non-null counts and dtypes.
netflix_counts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   287 non-null    object
 1   count   287 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.6+ KB


In [58]:
# First 20 rows; value_counts() sorts by count in descending order, so these
# are the most frequently listed titles.
netflix_counts.head(20)

Unnamed: 0,title,count
0,Bridgerton,7
1,Marriage Story,5
2,The Magicians,4
3,The Queen's Gambit,3
4,The Irishman,3
5,Dark,3
6,Black Mirror,3
7,Arcane,3
8,When They See Us,3
9,The Half of It,3


In [59]:
# Keep the first 20 rows of the counted titles (name kept as-is because the
# following cells refer to it).
imbd_top_20 = netflix_counts.head(20)

In [60]:
# Print the top-20 frame as plain text.
print(imbd_top_20)

                                      title  count
0                                Bridgerton      7
1                            Marriage Story      5
2                             The Magicians      4
3                        The Queen's Gambit      3
4                              The Irishman      3
5                                      Dark      3
6                              Black Mirror      3
7                                    Arcane      3
8                          When They See Us      3
9                            The Half of It      3
10                             Private Life      2
11                      All Day and a Night      2
12                            The Lovebirds      2
13                               Lost Girls      2
14   To All the Boys: P.S. I Still Love You      2
15                     Spenser Confidential      2
16                                    Ozark      2
17                        A Fall from Grace      2
18                            S

In [61]:
# Export the top-20 table as .csv. The output folder is created first because
# to_csv() raises an OSError when the target directory does not exist.
output_path = Path('IMDB_ratings/imdb_top.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
imbd_top_20.to_csv(output_path, index=False)

***

## Fun Stuff

I was trying to play a notification sound once the loop checking for status 200 responses is done, but the code is a little iffy (unreliable), so I did not want to include it.

In [20]:
# FROM https://github.com/TaylorSMarks/playsound
from playsound import playsound

In [24]:
# from https://pixabay.com/sound-effects/search/notification/?order=trending
#playsound("./level-up.mp3")

## References:

1. DataCamp courses: intro + intermediate importing Data in Python & streamlined Data Ingestion
2. BeautifulSoup tutorial: https://www.scrapingbee.com/blog/python-web-scraping-beautiful-soup/

# END OF NOTEBOOK #
***