In [25]:
# Import packages
import re
import time
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from datetime import datetime
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

In [62]:
class EtlSteam:

    def __init__(self, url: str):
        self.url = url
        self.html_content = str()
        self.appIDs = list()
        self.data_info = pd.DataFrame()
        self.data_links = pd.DataFrame()
        self.data_prices = pd.DataFrame()
        self.data_reviews = pd.DataFrame()
        self.total_counts = str()

    def get_summary_data(self) -> None:
        """
        Get summary data by scrap all search game steam list

        Args:
            url (str): url of api search game steam list
            start (int): number of start search steam list
            interval (int): range of return json of the game steam list
        
        Return:
            Processing....
        """
        response = requests.get(self.url).json()
        self.html_content = BeautifulSoup(response['results_html'],'html.parser')
        self.data_info, self.data_links = self.get_info()
        self.data_prices = self.get_prices()
        self.data_reviews = self.get_reviews()
        self.total_counts = response["total_count"]
        return None


    def verify_KeyError(self, tag):
        appIDs, tagIDs, links = '','',''
        try:
            appIDs = tag.attrs['data-ds-appid']
        except KeyError:
            appIDs = None
        try:
            tagIDs = tag.attrs['data-ds-tagids']
        except KeyError:
            tagIDs = None
        try:
            links = tag.attrs['href']
        except KeyError:
            links = None
        return appIDs, tagIDs, links


    def get_info(self) -> tuple:
        """
        Get info: appID, Title, Tag_id and links.

        Returns: 
            tuple: dataframe contain appID,Title,Tag_id , dataframe contain appID and links. 
        """
        tagIDs, titles, links = [],[],[]
        content = self.html_content  
        for tag in content.find_all('a', {'class':'search_result_row ds_collapse_flag'}):
                self.appIDs.append(self.verify_KeyError(tag)[0])
                tagIDs.append(self.verify_KeyError(tag)[1])
                links.append(self.verify_KeyError(tag)[2])
                # self.appIDs.append(tag.attrs['data-ds-appid'])
                # tagIDs.append(tag.attrs['data-ds-tagids'])
                # links.append(tag.attrs['href'])
        for tag in content.find_all('span', {'class':'title'}):
            titles.append(tag.get_text())           
        data_info = pd.DataFrame({
            'id_steam': self.appIDs,
            'title':titles,
            'tagid_steam':tagIDs
            })
        data_links = pd.DataFrame({
            'id_steam': self.appIDs,
            'links': links 
        })
        return data_info, data_links
    
    
    def get_prices(self) -> pd.DataFrame:
        """
        Get prices: Release_data, prices, discount, dateview

        Return:
            pandas.DataFrame() : dataframe contain Release_data, prices, discount and dataview
        """
        releases_data = []
        prices = []
        discounts = []
        content = self.html_content
        for tag in content.find_all('div', {'class':'col search_released responsive_secondrow'}):
            releases_data.append(tag.get_text())
        for tag in content.find_all('div', {'class':'col search_price_discount_combined responsive_secondrow'}):
            try:
                free1 = tag.find('div', {'class':'col search_price responsive_secondrow'}).get_text()
                free = free1.replace(' ','').replace('\r\n','')
            except AttributeError:
                free = None
            price = tag.attrs['data-price-final']
            if free == 'FreetoPlay':
                prices.append('freetoplay')
            else:
                price = list(price)
                price.insert(-2,'.')
                prices.append(''.join(price))
            try:
                discounts.append(tag.find('span').get_text())
            except AttributeError:
                discounts.append(None)
        data_prices = pd.DataFrame({
            'id_steam': self.appIDs,
            'release_date': releases_data,
            'price_real': prices,
            'discount': discounts 
            })
        data_prices['data_view'] = datetime.now().strftime('%d-%m-%Y')
        return data_prices


    def get_reviews(self) -> pd.DataFrame:
        """
        Get Total review and the % of positive review

        Return:
            pandas.DataFrame() : dataframe with the ID, Total Review, %positive review
        """
        reviews = []
        positive_rev = []
        content = self.html_content
        for tag in content.find_all('a', {'class': 'search_result_row ds_collapse_flag'}):
            try:
                tag = tag.find('span', {'class': 'search_review_summary positive'})
                tag = tag.attrs['data-tooltip-html']
                pattern_total_rev = r'the\s+(\d[\d,]*)\s+user'
                match_total_rev = re.search(pattern_total_rev, tag)
                if match_total_rev:
                    number_total = match_total_rev.group(1).replace(',', '')
                    reviews.append(number_total)
                pattern_percent_rev = r'(\d[\d,]*)%'
                match_percent_rev = re.search(pattern_percent_rev, tag)
                if match_percent_rev:
                    number_percent = match_percent_rev.group(1).replace(',', '')
                    positive_rev.append(number_percent)
            except (AttributeError, TypeError):
                reviews.append(None)
                positive_rev.append(None)
        data_reviews = pd.DataFrame({
            'id_steam': self.appIDs,
            'total_reviews': reviews,
            'percent_positive_reviews': positive_rev
            })
        return data_reviews

In [35]:
def get_group_data(start:int) -> str:
    url = "https://store.steampowered.com/search/results/?query&start="+str(start)+\
       "&count=50&dynamic_data=&sort_by=_ASC&supportedlang=english&snr=1_7_7_230_7&infinite=1"
    etl_steam = EtlSteam(url)
    etl_steam.get_summary_data()
    return f'Group:{start} -- ok!'

## etl_steam

In [36]:
# Connect to Postgres DB
# Criando Engine SQLalchemy 
database = 'etl-steam'
user = 'docker' 
password = 'docker'
host = 'localhost'
engine = create_engine('postgresql+psycopg2://'+user+':'+password+'@'+host+'/'+database)

In [63]:
def get_group_data(start:int) -> str:
    start = start*50
    url = "https://store.steampowered.com/search/results/?query&start="+str(start)+\
       "&count=50&dynamic_data=&sort_by=_ASC&supportedlang=english&snr=1_7_7_230_7&infinite=1"
    return url


def save_data_DB(obj: object, first_commit: bool) -> str:
    if first_commit == True:
        obj.data_info.to_sql('info', engine, if_exists = 'replace', index = False)
        obj.data_prices.to_sql('prices', engine, if_exists = 'replace', index = False)
        obj.data_links.to_sql('links', engine, if_exists='replace', index=False )
        obj.data_reviews.to_sql('reviews', engine, if_exists='replace', index=False)
    else:
        obj.data_info.to_sql('info', engine, if_exists = 'append', index = False)
        obj.data_prices.to_sql('prices', engine, if_exists = 'append', index = False)
        obj.data_links.to_sql('links', engine, if_exists='append', index=False )
        obj.data_reviews.to_sql('reviews', engine, if_exists='append', index=False)
    return 'Saved'


def scroll_all_games(total_groups):
    for group in tqdm(range(total_groups)):
        etl_steam = EtlSteam(get_group_data(group))
        etl_steam.get_summary_data()
        if group == 0: 
            save_data_DB(etl_steam, True)
        else:
            save_data_DB(etl_steam,False)
        time.sleep(np.random.random())
    return 'Finalizado'
    

In [64]:
obj = EtlSteam(get_group_data(0))
obj.get_summary_data()
obj.total_counts

141108

In [65]:
total_groups = obj.total_counts//50
scroll_all_games(total_groups)

100%|██████████| 2822/2822 [1:02:59<00:00,  1.34s/it]


'Finalizado'