# Web Scraping Project

In [2]:
#importing required libraries

import requests
from bs4 import BeautifulSoup
import os
import re
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

In [3]:
# importing html content from the url through request library
url = 'https://store.steampowered.com/games/'
response = requests.get(url)

In [4]:
# checking the response code
response

<Response [200]>

In [21]:
#creating a folder
folder = 'steam_games'
if not os.path.exists(folder):
    os.mkdir(folder)

In [24]:
#writing contents of the URL in a file
with open (os.path.join(folder, r'steam_games.html'), mode='wb') as f:
    f.write(response.content)

In [25]:
os.listdir(folder)

['steam_games.html']

In [29]:
# listing the titles and links for the top 5 games
with open(os.path.join(folder,r'steam_games.html'), encoding='UTF-8') as f:
    soup = BeautifulSoup(f, 'lxml')
    a = soup.find('div',id='NewReleasesRows').find_all('a', limit=5, recursive=False)
df = []
for each in a:
    name = each.find('div', class_='tab_item_name').contents[0]
    link = each['href']
    df.append({'title': name,
                    'link' : link
                   })    
df

[{'title': 'Here Comes Niko!',
  'link': 'https://store.steampowered.com/app/925950/Here_Comes_Niko/?snr=1_1452_4__103'},
 {'title': 'The Ramp',
  'link': 'https://store.steampowered.com/app/1506510/The_Ramp/?snr=1_1452_4__103'},
 {'title': "Hunter's Arena: Legends",
  'link': 'https://store.steampowered.com/app/1061100/Hunters_Arena_Legends/?snr=1_1452_4__103'},
 {'title': 'Olaguna Chronicles',
  'link': 'https://store.steampowered.com/app/985650/Olaguna_Chronicles/?snr=1_1452_4__103'},
 {'title': 'Who Stole My Beard?',
  'link': 'https://store.steampowered.com/app/1141270/Who_Stole_My_Beard/?snr=1_1452_4__103'}]

In [32]:
#getting number of positive reviews, developer, publisher name, and system requirements for each game
data = []
for each in df:
    response = requests.get(each['link'])
    soup = BeautifulSoup(response.content, 'lxml')
    
    #count of positive reviews
    positive_reviews = ''
    try:
        positive_reviews = soup.find_all('div', class_='summary column')[0].find('span', class_='responsive_hidden').contents[0].strip()[1:-1]
    except:
        pass
    
    #name of the developer
    developer = ''
    try:
        developer = soup.find_all('div', class_='dev_row')[0].find('a').contents[0]
    except:
        pass
    
    #name of the publisher
    publisher = ''
    try:
        publisher = soup.find_all('div', class_='dev_row')[1].find('a').contents[0]
    except:
        pass
   
    #minimum system requirements
    system_requirements = {}
    count = 1
    for li in soup.find('div', {"class":['game_area_sys_req_full', 'game_area_sys_req_leftCol']}).find('ul', class_='bb_ul').find_all('li'):
        try:
            key, value = li.text.split(':')
        except:
            key = 'other_system_requirements'+str(count)
            value = li.text
        system_requirements[key] = value
        
    #putting all together
    summary = {
        'title' : each['title'],
        'review_count': positive_reviews,
        'developer_name': developer,
        'publisher_name': publisher,
        'link': each['link']}
   
    appended = {**summary, **system_requirements}
   
    data.append(appended)

In [43]:
chrome_options = Options()
chrome_options.add_argument("--headless")

#getting reviews for each game
for x in data:
    #chrome driver path
    driver = webdriver.Chrome("C:\\Users\\Rohan\\Downloads\\chromedriver_win32 (1)\\chromedriver.exe", options=chrome_options)
    
    driver.get(each['link'])                        #importing the contents    
    elements = driver.find_element_by_class_name("review_box")    #getting the elements 
    action = ActionChains(driver)                   #chrome action
    action.move_to_element(elements).perform()       #executing the jquery script
    time.sleep(3)                                   #waiting for 3 reviews to get loaded
    
    reviews = driver.find_elements_by_css_selector(".review_box .content")    #getting all reviews

    top_reviews = ''
    for review in reviews:
        try:
            top_reviews = top_reviews + '\n' + review.text
        except:
            pass

    driver.quit()    
    x['reviews'] = top_reviews         #adding reviews to each game 

In [47]:
#creating a dataframe
df = pd.DataFrame(data)
df

Unnamed: 0,title,review_count,developer_name,publisher_name,link,other_system_requirements1,OS,Processor,Memory,Storage,reviews,Graphics,Sound Card,DirectX,Network,Additional Notes
0,Here Comes Niko!,97,Frog Vibes,Gears for Breakfast,https://store.steampowered.com/app/925950/Here...,Requires a 64-bit processor and operating system,Windows 10,3.0 GHz processor,4 GB RAM,2 GB available space,\nFun game with great artwork and bubbly music...,,,,,
1,The Ramp,144,Hyperparadise,Hyperparadise,https://store.steampowered.com/app/1506510/The...,Requires a 64-bit processor and operating system,Windows 10,2.7 GHz Duo Core,4 GB RAM,900 MB available space,\nFun game with great artwork and bubbly music...,2 GB VRAM,You don't really need one. Just humming your ...,,,
2,Hunter's Arena: Legends,44,Mantisco,Mantisco,https://store.steampowered.com/app/1061100/Hun...,,"64-bit Windows 7, Windows 8.1, Windows 10",Intel Core i5-4430 / AMD FX-6300,8 GB RAM,30 GB available space,\nFun game with great artwork and bubbly music...,NVIDIA GeForce GTX 770 / AMD Radeon R7 370 2GB,,Version 11,Broadband Internet connection,
3,Olaguna Chronicles,43,SELeft Studio,Lycian Studio,https://store.steampowered.com/app/985650/Olag...,,WIN7/WIN10,Pentium Dual Core级以上,2 GB RAM,700 MB available space,\nFun game with great artwork and bubbly music...,Geforce FX5600级/ATI Radeon9600以上 (支持Shader 2....,DirectX 可互换声卡,Version 9.0c,,WINDOWS XP以下不能运行（含XP)
4,Who Stole My Beard?,12,Cleardot Games,Cleardot Games,https://store.steampowered.com/app/1141270/Who...,,7,Intel 3 or higher,1024 MB RAM,1024 MB available space,\nFun game with great artwork and bubbly music...,Basic (2GB or higher),16-bit,,,


In [48]:
# exporting to a csv file
df.to_csv(os.path.join(folder, r'steam_games_details.csv'), index=False)