In [None]:
# Imports
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import sleep
import random
import boto3
import json

In [None]:
# Set pandas options
# Show up to 200 characters per cell when a dataframe is displayed. The full option
# name is used because abbreviated names are only resolved by pattern matching.
pd.set_option('display.max_colwidth', 200)

In [None]:
# Build the empty dataframe that receives one row per scraped movie.
# The column order below is the order in which values are appended while scraping.
df_cols = [
    'Title',
    'YTS URL',
    'YTS Rating',
    'Thumbnail Image URL',
    'Movie Image URL',
    'Categories',
    'Description',
    'Likes',
    'IMDb Rating',
    'Best Rating',
    'Rating Count',
    'IMDb URL',
    'Downloads',
]
movie_data = pd.DataFrame(columns=df_cols)

In [None]:
# Extract the data for all the movies on the website
# There are 434 pages of movies

# The browse page URL
base_url = "https://yts.am/browse-movies"
url_to_scrape = base_url

total_pages = 434
total_movies = 8671

# Seconds to wait for a response before treating a request as failed
request_timeout = 30

processed_count = 0

# Track pages that result in errors
urls_not_scraped = []
# (url, error) pairs explaining why each URL above was skipped
scrape_errors = []

for i in range(1, total_pages + 1):
    # Each subsequent url is structured like this: https://yts.am/browse-movies?page=2
    url_to_scrape = base_url if i == 1 else "{}?page={}".format(base_url, i)

    # Retrieve the listing page. A failed request or an HTTP error status is recorded
    # and skipped so that one bad page does not abort the whole scrape.
    try:
        response = requests.get(url_to_scrape, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        urls_not_scraped.append(url_to_scrape)
        scrape_errors.append((url_to_scrape, repr(e)))
        sleep(random.uniform(2.1, 3.9))
        continue

    # Create the soup
    soup = BeautifulSoup(response.content, 'lxml')

    for movie in soup('div', class_='browse-movie-wrap col-xs-10 col-sm-4 col-md-5 col-lg-4'):
        # Reset per movie so that a failure never reports the previous movie's URL
        yts_url = None
        try:
            # Extract the movie info from the listing entry
            title_link = movie.find_all('a')[1]
            movie_title = title_link.text
            yts_url = title_link.get('href')
            yts_rating = movie.h4.text.split(" / ")[0]
            movie_thumbnail_url = movie.img.get('src')

            # Get the movie detail page & create the soup
            details_response = requests.get(yts_url, timeout=request_timeout)
            details_response.raise_for_status()
            md_soup = BeautifulSoup(details_response.content, 'lxml')

            # Extract the movie data from the details page
            movie_image_url = md_soup.find("img", {"class": "img-responsive"}).get('src')
            movie_categories = md_soup.find("div", {"id": "movie-info"}).find_all('h2')[1].text.split(" / ")
            movie_description = md_soup('p', class_='hidden-sm hidden-md hidden-lg')[0].text.strip()
            movie_likes = md_soup.find("span", {"id": "movie-likes"}).text
            imdb_rating = md_soup.find("span", {"itemprop": "ratingValue"}).text
            best_rating = md_soup.find("span", {"itemprop": "bestRating"}).text
            rating_count = md_soup.find("span", {"itemprop": "ratingCount"}).text
            imdb_url = md_soup.find("a", {"title": "IMDb Rating"}).get('href')
            downloads = md_soup.find_all("em")[2].text.split(" ")[1]

            # Add to the movie data dataframe. Rows are appended as they arrive so that
            # an interrupted run still leaves the movies scraped so far in movie_data.
            movie_data.loc[len(movie_data)] = [movie_title, yts_url, yts_rating, movie_thumbnail_url,
                                               movie_image_url, movie_categories, movie_description, movie_likes,
                                               imdb_rating, best_rating, rating_count, imdb_url, downloads]
        except Exception as e:
            # Best-effort scrape: remember what failed (and why) and move on.
            # If the movie URL itself could not be read, fall back to the listing page URL.
            failed_url = yts_url if yts_url is not None else url_to_scrape
            urls_not_scraped.append(failed_url)
            scrape_errors.append((failed_url, repr(e)))
        else:
            processed_count += 1

        # Show our progress
        print('Processing page {} of {}'.format(i, total_pages),
              '{}% Complete'.format(round(processed_count/total_movies*100,2)),
              '{} errors'.format(len(urls_not_scraped)), end="\r")

        # Sleep a bit before the next request; this also runs after a failure so that
        # errors do not turn into back-to-back requests
        sleep(random.uniform(2.1,3.9))

print("Scrape complete!")

In [None]:
# Save the data to a CSV File
# (the dataframe index is written as the first, unnamed column)
movie_data.to_csv("yts_data.csv", encoding='utf-8')

In [None]:
# Show the URLs we didn't scrape
# (entries are added by the error handler of the scraping loop)
urls_not_scraped

In [None]:
# Work on an independent copy so the scraped dataframe stays untouched
# (DataFrame.copy() is a deep copy by default)
md2 = movie_data.copy()

In [None]:
# How many movies did we get?
# (the scraping loop adds one dataframe row per successfully scraped movie)
len(md2)

## Amazon Comprehend

In [None]:
# Create the Amazon Comprehend client used by the get_* helper functions below.
# NOTE(review): credentials are presumably resolved by boto3 from the environment / AWS config — confirm
comprehend = boto3.client(service_name='comprehend', region_name='us-east-1')

In [None]:
# Dominant Language
def get_language(text):
    """Detect the dominant language(s) of ``text`` with Amazon Comprehend.

    Args:
        text: The text to analyse.

    Returns:
        A ``(languages, language_count)`` tuple, where ``languages`` is the
        ``Languages`` list of the ``detect_dominant_language`` response and
        ``language_count`` is its length. ``None`` is returned, without calling
        the service, when ``text`` is not a non-empty string (for example
        ``""``, ``None`` or ``NaN``).
    """
    # Missing values and empty strings cannot be analysed; skip the API call
    if not isinstance(text, str) or text == "":
        return None

    language_resp = comprehend.detect_dominant_language(Text=text)
    languages = language_resp['Languages']
    return languages, len(languages)

In [None]:
# Named Entities
def get_named_entities(text, language_code='en'):
    """Detect the named entities in ``text`` with Amazon Comprehend.

    Args:
        text: The text to analyse.
        language_code: Language code passed to Comprehend as ``LanguageCode``
            (defaults to ``'en'``).

    Returns:
        An ``(entities, entity_count)`` tuple, where ``entities`` is the
        ``Entities`` list of the ``detect_entities`` response and
        ``entity_count`` is its length. ``None`` is returned, without calling
        the service, when ``text`` is not a non-empty string (for example
        ``""``, ``None`` or ``NaN``).
    """
    # Missing values and empty strings cannot be analysed; skip the API call
    if not isinstance(text, str) or text == "":
        return None

    entity_resp = comprehend.detect_entities(Text=text, LanguageCode=language_code)
    entities = entity_resp['Entities']
    return entities, len(entities)

In [None]:
# Key Phrases
def get_key_phrases(text, language_code='en'):
    """Detect the key phrases in ``text`` with Amazon Comprehend.

    Args:
        text: The text to analyse.
        language_code: Language code passed to Comprehend as ``LanguageCode``
            (defaults to ``'en'``).

    Returns:
        A ``(key_phrases, key_phrase_count)`` tuple, where ``key_phrases`` is
        the ``KeyPhrases`` list of the ``detect_key_phrases`` response and
        ``key_phrase_count`` is its length. ``None`` is returned, without
        calling the service, when ``text`` is not a non-empty string (for
        example ``""``, ``None`` or ``NaN``).
    """
    # Missing values and empty strings cannot be analysed; skip the API call
    if not isinstance(text, str) or text == "":
        return None

    kp_resp = comprehend.detect_key_phrases(Text=text, LanguageCode=language_code)
    key_phrases = kp_resp['KeyPhrases']
    return key_phrases, len(key_phrases)

In [None]:
# Sentiment
def get_sentiment(text, language_code='en'):
    """Detect the sentiment of ``text`` with Amazon Comprehend.

    Args:
        text: The text to analyse.
        language_code: Language code passed to Comprehend as ``LanguageCode``
            (defaults to ``'en'``).

    Returns:
        A 5-tuple ``(sentiment, mixed, negative, neutral, positive)`` built
        from the ``Sentiment`` value and the ``SentimentScore`` entries of the
        ``detect_sentiment`` response. ``None`` is returned, without calling
        the service, when ``text`` is not a non-empty string (for example
        ``""``, ``None`` or ``NaN``).
    """
    # Missing values and empty strings cannot be analysed; skip the API call
    if not isinstance(text, str) or text == "":
        return None

    sentiment_resp = comprehend.detect_sentiment(Text=text, LanguageCode=language_code)
    scores = sentiment_resp['SentimentScore']
    return (
        sentiment_resp['Sentiment'],
        scores['Mixed'],
        scores['Negative'],
        scores['Neutral'],
        scores['Positive'],
    )

In [None]:
# Get the language
# get_language is called once per description; each cell of the new column holds
# its (languages, language_count) tuple, or None for an empty description
md2['Language'] = md2['Description'].map(get_language)
# Write the dataframe, including the new column, to its own CSV file
md2.to_csv("yts_data_with_language.csv", encoding='utf-8')

In [None]:
# Get the named entities
# get_named_entities is called once per description; each cell of the new column holds
# its (entities, entity_count) tuple, or None for an empty description
md2['Entities'] = md2['Description'].map(get_named_entities)
# Write the dataframe, including the new column, to its own CSV file
md2.to_csv("yts_data_with_entities.csv", encoding='utf-8')

In [None]:
# Get the key phrases
# get_key_phrases is called once per description; each cell of the new column holds
# its (key_phrases, key_phrase_count) tuple, or None for an empty description
md2['Key Phrases'] = md2['Description'].map(get_key_phrases)
# Write the dataframe, including the new column, to its own CSV file
md2.to_csv("yts_data_with_key_phrases.csv", encoding='utf-8')

In [None]:
# Get the sentiment
# get_sentiment is called once per description; each cell of the new column holds
# its (sentiment, mixed, negative, neutral, positive) tuple, or None for an empty description
md2['Sentiment'] = md2['Description'].map(get_sentiment)
# Write the dataframe with all four enrichment columns to CSV
md2.to_csv("yts_data_fully_enriched.csv", encoding='utf-8')

In [None]:
# Inspect the first row: round-trip it through JSON so it can be pretty-printed
record = json.loads(md2.iloc[:1].to_json())
print(json.dumps(record, indent=2))

In [None]:
# Parse the language data and append to the dataframe
def _first_language(item):
    """Return ``(LanguageCode, Score)`` of the first language entry in ``item``.

    ``item`` is expected to be a ``(languages, language_count)`` tuple. When it is
    missing or has no usable first entry, ``(None, None)`` is returned so that the
    code and score columns always stay aligned row by row.
    """
    try:
        first = item[0][0]
        return first['LanguageCode'], first['Score']
    except (TypeError, IndexError, KeyError):
        # None / NaN cell, empty language list, or an entry without the expected keys
        return None, None


language_data = md2['Language'].tolist()
first_languages = [_first_language(item) for item in language_data]

lang_codes = [code for code, _ in first_languages]
lang_scores = [score for _, score in first_languages]

md2['Language Code'] = lang_codes
md2['Language Score'] = lang_scores

In [None]:
# Parse the sentiment data
def _split_sentiment(item):
    """Return the five sentiment fields of ``item`` as a tuple.

    ``item`` is expected to be a 5-element tuple/list of
    ``(sentiment, mixed, negative, neutral, positive)``. Anything else (a missing
    value, or a plain string left over from an earlier run of this cell) yields five
    ``None`` values, so the five output columns always stay aligned row by row.
    """
    if isinstance(item, (tuple, list)) and len(item) == 5:
        return tuple(item)
    return (None,) * 5


sentiment_data = md2['Sentiment'].tolist()
sentiment_rows = [_split_sentiment(item) for item in sentiment_data]

sentiments = [row[0] for row in sentiment_rows]
mixed_scores = [row[1] for row in sentiment_rows]
negative_scores = [row[2] for row in sentiment_rows]
neutral_scores = [row[3] for row in sentiment_rows]
positive_scores = [row[4] for row in sentiment_rows]

# Remove the raw tuple column first so the sentiment label can take over its name;
# the label and score columns are then appended at the end of the dataframe
del md2['Sentiment']
md2['Sentiment'] = sentiments
md2['Sentiment Mixed Score'] = mixed_scores
md2['Sentiment Negative Score'] = negative_scores
md2['Sentiment Neutral Score'] = neutral_scores
md2['Sentiment Positive Score'] = positive_scores

In [None]:
# Inspect the first row again, now with the parsed language and sentiment columns
record = json.loads(md2.iloc[:1].to_json())
print(json.dumps(record, indent=2))

## Convert the columns to the correct data types

In [None]:
# Show the current data type of every column
md2.dtypes

In [None]:
# Clean up the Downloads column for float conversion
def remove_comma(s):
    """Return ``s`` with every comma removed.

    Args:
        s: The value to clean, normally a string such as ``"1,234"``.

    Returns:
        The string without commas. Values that do not support a string-style
        ``replace`` (for example ``None`` or numbers) are returned unchanged.
    """
    try:
        return s.replace(",", "")
    except (AttributeError, TypeError):
        # Not a string-like value: there is nothing to strip
        return s

In [None]:
# Convert to floats
cols_to_convert_to_float = ['YTS Rating', 'Likes', 'IMDb Rating', 'Best Rating', 'Rating Count', 'Downloads']

for col in cols_to_convert_to_float:
    # Strip commas from the scraped text, then cast. astype() returns a new Series
    # and accepts no ``inplace`` argument, so the result is assigned back to the column.
    md2[col] = md2[col].apply(remove_comma).astype('float64')

In [None]:
# Show the data types again to check the result of the float conversion
md2.dtypes

## Save the Final Result

In [None]:
# Save the final data file
# (index=False leaves the dataframe index out of the file)
md2.to_csv("yts_data_fully_enriched_clean.csv", index=False, encoding="utf-8")