In [None]:
import getpass
import os
import pickle
import time

import requests
from bs4 import BeautifulSoup
from selenium.webdriver import *
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Locator-strategy names kept as plain module-level strings.
# NOTE(review): these look like copies of selenium's `By.*` values; the
# functions visible in this notebook use `By.*` directly — confirm whether
# anything still relies on these names before removing them.

# direct attribute / expression look-ups
ID, NAME, XPATH = "id", "name", "xpath"
# look-ups based on anchor text
LINK_TEXT, PARTIAL_LINK_TEXT = "link text", "partial link text"
# look-ups based on tag, class or CSS selector
TAG_NAME, CLASS_NAME, CSS_SELECTOR = "tag name", "class name", "css selector"

In [None]:
def create_browser():
    """
    Start a Chrome WebDriver session configured for scraping.

    Returns:
        The ``Chrome`` driver object, created with the ``--no-sandbox`` and
        ``disable-notifications`` switches.
    """
    browser_options = Options()
    # switches handed to the browser; the second one is meant to suppress pop-ups
    for switch in ("--no-sandbox", 'disable-notifications'):
        browser_options.add_argument(switch)
    return Chrome(options=browser_options)

In [None]:
def get_user_page_first_time(chrome, user, ad_wait=35, settle_wait=10, consent_timeout=30):
    """
    Load a user's film page for the first time in a fresh browser session.

    Opens the page, accepts the terms-of-use banner, waits out the
    advertisement, maximizes the window and pauses once more.

    Args:
        chrome: WebDriver object returned by ``create_browser``.
        user: filmweb user name inserted into the page URL.
        ad_wait: seconds to sleep while the advertisement plays (default 35,
            the value previously hard-coded).
        settle_wait: seconds to sleep after maximizing the window (default 10).
        consent_timeout: maximum seconds to wait for the consent button to
            become clickable.

    Raises:
        selenium.common.exceptions.TimeoutException: if the consent button
            does not become clickable within ``consent_timeout`` seconds.
    """
    # load page
    chrome.get(f"https://www.filmweb.pl/user/{user}/films")

    # agree to terms of use; the banner may be injected after the page has
    # loaded, so wait for the button explicitly instead of looking it up
    # immediately after navigation
    consent_button = WebDriverWait(chrome, consent_timeout).until(
        EC.element_to_be_clickable((By.ID, 'didomi-notice-agree-button'))
    )
    consent_button.click()

    # wait until the advertisement ends
    time.sleep(ad_wait)
    chrome.maximize_window()
    time.sleep(settle_wait)

In [None]:
def get_user_page(chrome, user):
    """
    Navigate the browser to the film list of the given user.

    Args:
        chrome: WebDriver object used for navigation.
        user: filmweb user name inserted into the page URL.
    """
    films_url = f"https://www.filmweb.pl/user/{user}/films"
    chrome.get(films_url)
    # fixed pause after navigation before the caller continues
    time.sleep(10)

In [None]:
def login_to_facebook(chrome, email, user_password):
    """
    Log in through the Facebook button on the currently loaded page.

    Args:
        chrome: WebDriver object with the page already loaded.
        email: value typed into the ``email`` field of the login form.
        user_password: value typed into the ``pass`` field of the login form.
    """
    # scroll the window down in growing steps (0 to 19 pixels per call)
    for offset in range(20):
        chrome.execute_script(f"window.scrollBy(0,{offset})","")

    facebook_button = chrome.find_element(By.CLASS_NAME, 'facebookLoginButton__button')
    facebook_button.click()

    # bring the element that appeared into view, then go to the page bottom
    popup = chrome.find_element(By.CLASS_NAME, '_4t2a')
    chrome.execute_script("return arguments[0].scrollIntoView(true);", popup)
    chrome.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # NOTE(review): '_9xo5' is an opaque class name; presumably a
    # confirmation/consent button that must be dismissed first — TODO confirm
    chrome.find_element(By.CLASS_NAME, '_9xo5').click()

    # locate every form control before typing anything
    email_field, password_field, submit_button = (
        chrome.find_element(By.ID, element_id)
        for element_id in ('email', 'pass', 'loginbutton')
    )

    # fill in the credentials and submit the form
    email_field.send_keys(email)
    password_field.send_keys(user_password)
    submit_button.click()

    # fixed pause after submitting
    time.sleep(10)

In [None]:
def get_genres_and_actors(chrome):
    """
    Collect the link texts of every ``preview__details`` element on the page.

    For each such element the text content of all ``a`` descendants is read
    and split into two tuples: everything except the last two links, and the
    last two links.
    NOTE(review): judging by the function name the first tuple holds genres
    and the second actors (or the reverse) — confirm against the page markup.

    Args:
        chrome: WebDriver object with a film list page loaded.

    Returns:
        A list with one ``(leading_links, last_two_links)`` pair of tuples
        per ``preview__details`` element.
    """
    films = []
    for details in chrome.find_elements(By.CLASS_NAME, 'preview__details'):
        link_texts = [
            anchor.get_attribute('textContent')
            for anchor in details.find_elements(By.CSS_SELECTOR, 'a')
        ]
        leading, last_two = link_texts[:-2], link_texts[-2:]
        films.append((tuple(leading), tuple(last_two)))
    return films

In [None]:
def get_ratings(chrome, user):
    """
    Scrape the film ratings shown on the page currently loaded in the browser.

    Args:
        chrome: WebDriver object with a user's film list page loaded.
        user: user name stored as the first field of every returned record.

    Returns:
        A list of tuples ``(user, film name, year, genres/actors pair,
        community rating, number of ratings, user rating)``. The columns are
        combined with ``zip``, so the result is as long as the shortest column.
    """
    def texts_of(class_name):
        # text content of every element carrying the given class
        found = chrome.find_elements(By.CLASS_NAME, class_name)
        return [element.get_attribute('textContent') for element in found]

    # scroll down in growing steps so that the ratings get loaded
    for offset in range(1000):
        chrome.execute_script(f"window.scrollBy(0,{offset})","")

    # the user's own ratings
    user_ratings = texts_of('userRate__rate')

    # film titles and production years
    titles = texts_of('preview__link')
    years = texts_of('preview__year')

    # genres and actors
    genres_and_actors = get_genres_and_actors(chrome)

    # community ratings: stripped, then only every second value is kept
    community_values = [value.strip() for value in texts_of('communityRatings__value')]
    community_scores = community_values[::2]

    # number of community ratings: the text before 'ocen', spaces removed.
    # When 'ocen' is absent, str.find returns -1 and the slice drops only the
    # last character; the 'chcezobaczy' literal matches what such a label
    # becomes (presumably the "want to see" counter) and is filtered out.
    vote_counts = []
    for description in texts_of('communityRatings__description'):
        description = description.strip()
        count = description[:description.find('ocen')].replace(' ', '')
        if count != 'chcezobaczy':
            vote_counts.append(count)

    # one copy of the user name per user rating found
    user_column = [user] * len(user_ratings)

    # one tuple per film
    return list(zip(user_column,
                    titles,
                    years,
                    genres_and_actors,
                    community_scores,
                    vote_counts,
                    user_ratings))
    

In [None]:
def get_all_user_ratings(chrome, user, folder=None):
    """
    Collect the ratings of a user from all pages and pickle them.

    The page currently loaded in the browser is scraped first, then
    ``?page=2``, ``?page=3``, ... are visited until a page yields no ratings
    or yields exactly the same ratings as the page before it (which is how a
    site that keeps serving the last page for out-of-range numbers shows up).

    Args:
        chrome: WebDriver object with the user's first film page loaded.
        user: filmweb user name; used in the URLs and in the output file name.
        folder: optional directory for the output file. It is created
            (including parents) when missing. ``None`` writes to the current
            working directory.

    Returns:
        None. When at least one rating was collected, the list of records is
        written to ``{user}_full_data.pickle``; otherwise no file is created.
    """
    # get ratings from the page that is already loaded
    current_page = get_ratings(chrome, user)
    print(current_page)

    collected = []
    previous_page = None

    # starting page number
    page = 2

    # Compare against the PREVIOUS page, not against everything collected so
    # far: the accumulated list can only equal a single page while it holds
    # exactly one page, so a repeated last page would otherwise never be
    # detected for users with more than one page of ratings.
    while current_page and current_page != previous_page:
        collected.extend(current_page)
        print(page)
        chrome.get(f"https://www.filmweb.pl/user/{user}/films?page={page}")
        previous_page, current_page = current_page, get_ratings(chrome, user)
        print(current_page)
        page += 1

    if collected:
        file_name = f"{user}_full_data.pickle"
        if folder is None:
            target = file_name
        else:
            # makedirs also handles nested and absolute paths, which a
            # membership test on os.listdir() of the working directory cannot
            os.makedirs(folder, exist_ok=True)
            target = os.path.join(folder, file_name)
        with open(target, 'wb') as handle:
            pickle.dump(collected, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
def get_user_friends(chrome, user):
    """
    Open the friends page of a user and read the nicknames listed there.

    Args:
        chrome: WebDriver object used for navigation.
        user: filmweb user name inserted into the page URL.

    Returns:
        A list with the stripped text content of every ``user__nick`` element.
    """
    friends_url = f"https://www.filmweb.pl/user/{user}/friends"
    chrome.get(friends_url)

    nicknames = []
    for nick_element in chrome.find_elements(By.CLASS_NAME, 'user__nick'):
        nicknames.append(nick_element.get_attribute('textContent').strip())
    return nicknames

In [None]:
def get_ratings_for_all_friends(chrome, user, folder=None):
    """
    Scrape and store the ratings of every friend of the given user.

    Args:
        chrome: WebDriver object used for navigation.
        user: filmweb user name whose friends list is read.
        folder: forwarded to ``get_all_user_ratings`` as the output folder.
    """
    for friend in get_user_friends(chrome, user):
        print(f"Displaying results for user {friend}")
        # open the friend's first film page before collecting all pages
        friend_url = f"https://www.filmweb.pl/user/{friend}/films"
        chrome.get(friend_url)
        get_all_user_ratings(chrome, friend, folder)

# SCRAPING YOURSELF AND YOUR FRIENDS

In [None]:
# EXAMPLE USE

# 0. account details. Credentials must never be stored in the notebook:
#    they are taken from environment variables when set and asked for
#    interactively otherwise (the password prompt does not echo).
filmweb_user = os.environ.get("FILMWEB_USER", "nerwusxd")
facebook_email = os.environ.get("FACEBOOK_EMAIL") or input("Facebook e-mail: ")
facebook_password = os.environ.get("FACEBOOK_PASSWORD") or getpass.getpass("Facebook password: ")

# 1. create browser object.
chrome = create_browser()

# 2. load the first page, accept usage terms etc.
get_user_page_first_time(chrome, filmweb_user)

# 3. login to Facebook. In case it fails, just do it by hand in this special browser window.
login_to_facebook(chrome, facebook_email, facebook_password)

# 4. congrats! you are ready to get the records
get_ratings_for_all_friends(chrome, filmweb_user, "data")

# 5. if something fails in between, you can get records just for one user
#get_all_user_ratings(chrome, user)

# SCRAPING TOP 500 USERS

In [None]:
def get_most_active_users_one_page(chrome, page):
    """
    Read the user names from one page of the monthly users ranking.

    Args:
        chrome: WebDriver object used for navigation.
        page: ranking page number; page 1 is requested without a query string.

    Returns:
        A list with the stripped text content of every
        ``rankingList__userName`` element on that page.
    """
    ranking_url = 'https://www.filmweb.pl/users/ranking/usersMonthly'
    if page != 1:
        ranking_url = f"{ranking_url}?page={page}"
    chrome.get(ranking_url)

    user_names = []
    for name_element in chrome.find_elements(By.CLASS_NAME, 'rankingList__userName'):
        user_names.append(name_element.get_attribute('textContent').strip())
    return user_names

In [None]:
def get_500_most_active_users(chrome):
    """
    Gather the user names from the first ten pages of the monthly ranking.

    NOTE(review): the "500" in the name presumably assumes 50 users per
    ranking page — confirm against the site.

    Args:
        chrome: WebDriver object used for navigation.

    Returns:
        A single list with the names from pages 1 to 10, in page order.
    """
    collected = []
    for page in range(1, 11):
        collected.extend(get_most_active_users_one_page(chrome, page))
    return collected

In [None]:
# Collect the names from the monthly ranking; relies on the `chrome` browser
# object created in the example-use cell above.
active_500_users = get_500_most_active_users(chrome)

# For every ranked user: open their film page, scrape it and pickle the result.
for user in active_500_users:
    get_user_page(chrome, user)
    print(f"Displaying results for user {user}")
    # NOTE(review): get_ratings is called once, so only the page loaded by
    # get_user_page is scraped here (get_all_user_ratings is not used) —
    # confirm that a single page per user is intended.
    ratings = get_ratings(chrome, user)
    # the file is written to the current working directory, even when the
    # list of ratings is empty
    with open(f"{user}_full_data.pickle", 'wb') as handle:
        pickle.dump(ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)