# LinkedIn Learning: Python Automation

#### 1. [Automating Data Manipulation and Validation](#auto_data)
#### 2. [Web Scraping with BeautifulSoup](#web_scrap)
#### 3. [Web Browsing with Selenium](#selenium)
#### 4. [API](#API)

## 1. Automating Data Manipulation and Validation
<a id="auto_data"></a>

In [None]:
# Read File
# Print every record of inputFile.txt whose third whitespace-separated field is "P".
# The context manager closes the file even if an exception is raised mid-loop.
with open("inputFile.txt", "r") as input_file:
    for line in input_file:
        line_split = line.split()
        # Blank or malformed lines have fewer than three fields; skip them
        # instead of failing with IndexError on line_split[2].
        if len(line_split) >= 3 and line_split[2] == "P":
            # `line` still ends with its newline, so print() leaves a blank
            # line after each record.
            print(line)

Mary 25 P

John 32 P

Hailey 26 P

Iris 23 P

Jacob 29 P

Jamie 26 P

Chloe 21 P

Natalie 29 P

David 23 P

Mario 51 P

Josh 39 P

Kayla 28 P

Hunter 61 P

Erica 16 P

Kyle 19 P

Rosanna 45 P

Joy 28 P

Jim 67 P

Sansa 28 P

Juan 73 P

Colin 59 P

Kate 58 P

Jade 26 P

River 29 P

Chris 31 P



In [3]:
# Write File
# Split the records of inputFile.txt into two files according to the third
# whitespace-separated field: "P" goes to passFile.txt, anything else goes to
# failFile.txt. Both output files are truncated when opened.
#
# All three handles are managed by one `with` statement so they are closed
# (and the output flushed) even if the loop raises.
with open("inputFile.txt", "r") as input_file, \
        open("passFile.txt", "w") as pass_file, \
        open("failFile.txt", "w") as fail_file:
    # Data extraction using for loop
    for line in input_file:
        line_split = line.split()
        # Blank or malformed lines have no third field; skip them rather than
        # crashing with IndexError.
        if len(line_split) < 3:
            continue
        if line_split[2] == "P":
            pass_file.write(line)
        else:
            fail_file.write(line)


In [4]:
# Command line interface
import subprocess
import sys

# Launch example.py five times with the interpreter that is running this
# notebook; check_call raises CalledProcessError if a run exits non-zero.
command = [sys.executable, "example.py"]
for _ in range(5):
    subprocess.check_call(command)


In [5]:
# file organizer
import os
from pathlib import Path

# define the categories and file types
SUBDIRECTORIES = {
    "DOCUMENTS": ['.pdf','.rtf','.txt'],
    "AUDIOS": ['.m4a','.m4b','.mp3'],
    "VIDEOS": ['.mov','.avi','.mp4'],
    "IMAGES": ['.jpg','.jpeg','.png']
}

# define a function return the category based on the file types
def pickDirectory(filetype):
    """Return the category folder name for a file suffix.

    Args:
        filetype: File suffix including the leading dot, e.g. ".pdf".
            The comparison is exact, so callers should lower-case it first.

    Returns:
        The key of SUBDIRECTORIES whose suffix list contains `filetype`,
        or "MISC" when no category lists it.
    """
    for category, suffixes in SUBDIRECTORIES.items():
        if filetype in suffixes:
            return category
    # Fall back only after *every* category has been checked. Returning from
    # inside the loop would classify everything outside the first category
    # (e.g. ".mp3") as MISC.
    return "MISC"
print(pickDirectory(".pdf"))

# define a function to organize the file in the corresponding filetype
def organizeDirectory():
    """Move every file in the current working directory into a category folder.

    Each non-directory entry is moved to the folder chosen by
    pickDirectory() from its lower-cased suffix; the folder is created on
    demand. Existing sub-directories are left untouched.

    Note: every non-directory entry is moved, so anything that lives in the
    working directory (input files, scripts, ...) ends up in a sub-folder.
    """
    # Take a snapshot of the listing and close the scandir iterator before
    # renaming anything, so the directory is not modified while it is being
    # iterated and the OS handle is released promptly.
    with os.scandir() as entries:
        files = [Path(entry.name) for entry in entries if not entry.is_dir()]

    for filepath in files:
        directorypath = Path(pickDirectory(filepath.suffix.lower()))
        # exist_ok=True replaces the separate "is it already there?" check.
        directorypath.mkdir(exist_ok=True)
        filepath.rename(directorypath / filepath.name)

organizeDirectory()


DOCUMENTS


In [6]:
# Parse data

# text
# Read the grocery list as one string, then break it into
# whitespace-separated tokens (punctuation stays attached to each token).
file_path = "DOCUMENTS/groceries.txt"

with open(file_path, "r") as groceries_txt:
    data = groceries_txt.read()

data_split = data.split()
print(data_split)

# CSV
import csv

filepath = "MISC/groceries.csv"

# newline="" hands line-ending handling to the csv module, as its
# documentation requires; without it, newlines embedded in quoted fields are
# not interpreted correctly.
with open(filepath, "r", newline="") as file:
    csv_reader = csv.reader(file)
    # Consume the header row so it is not converted below; the default keeps
    # an empty file from raising StopIteration.
    headers = next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # csv.reader yields [] for blank lines; there is no quantity to convert.
            continue
        row[1] = int(row[1]) #make column 2 integer
        print(row)

# JSON
import json

file_path = "MISC/groceries.json"

# Decode the document straight from the open file handle.
with open(file_path, "r") as json_file:
    JSON_data = json.load(json_file)

# Look up a single entry by key.
print("ginger Q:", JSON_data["ginger"])

# XML
import xml.etree.ElementTree as ET

filepath = "MISC/groceries.xml"

tree = ET.parse(filepath)
root = tree.getroot()

# select item whose price is above 6
# A single pass over the <grocery_item> elements is enough; an extra
# traversal that only reads name/price and discards them adds nothing.
item_above_six = []

for item in root.findall("grocery_item"):
    # findtext() returns None when the child element is absent, which avoids
    # the AttributeError that `.find(...).text` raises on a missing child.
    name = item.findtext("name")
    price = item.findtext("price")
    if name is None or not price:
        # Incomplete record: nothing to compare, skip it.
        continue
    if float(price) > 6.00:
        item_above_six.append(name)

print(item_above_six)
    

['apples,', 'bananas,', 'carrots,', 'durians,', 'eggplants,', 'ginger,', 'hazelnuts']
['apples', 2]
['bananas', 6]
['carrots', 4]
['durians', 3]
['eggplants', 5]
['ginger', 1]
['hazelnuts', 8]
ginger Q: 1
['Avocados (per bag)', 'Coffee (per pound)', 'Almonds (per pound)', 'Avocado Oil', 'Truffle Oil', 'Saffron (per gram)']


In [7]:
# Extract data with regular expression
import re

example = "The number is 123-456-7890."

# Pattern: three digits, dash, three digits, dash, four digits.
phoneNumRegex = re.compile(r"\d\d\d-\d\d\d-\d\d\d\d")
result = phoneNumRegex.search(example)

# search() returns None when nothing matches, so only report on a hit.
if result is not None:
    phone_number = result.group()
    print("Phone found:", phone_number)
    # The area code is the first three characters of the matched text.
    print("Area code:", phone_number[:3])
 

Phone found: 123-456-7890
Area code: 123


### Install a package 

In [8]:
# Install package
import sys
!{sys.executable} -m pip install pyinputplus

Collecting pyinputplus
  Using cached PyInputPlus-0.2.12-py3-none-any.whl
Collecting pysimplevalidate>=0.2.7 (from pyinputplus)
  Using cached PySimpleValidate-0.2.12-py3-none-any.whl
Collecting stdiomask>=0.0.3 (from pyinputplus)
  Using cached stdiomask-0.0.6-py3-none-any.whl
Installing collected packages: stdiomask, pysimplevalidate, pyinputplus
Successfully installed pyinputplus-0.2.12 pysimplevalidate-0.2.12 stdiomask-0.0.6



[notice] A new release of pip is available: 23.3.1 -> 25.0.1
[notice] To update, run: C:\Users\desti\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [9]:
# Input validation
import pyinputplus as pyip

# Each pyinputplus helper keeps prompting until the entry is valid.

# Numerical: whole numbers between 0 and 10 inclusive
print("\nNumerical example")
lucky_number = pyip.inputInt("Enter your lucky number from 0 to 10", min=0, max=10)
print("\nYour lucky number is ", lucky_number)

# Multiple choice: options are labelled with letters, not numbers
print("\nMulti. Choice example")
athlete_choices = ["Ronaldo", "LBJ", "Elon"]
favourite_athlete = pyip.inputMenu(athlete_choices, lettered=True, numbered=False)
print("\nYour fav athlete is ", favourite_athlete)

# Email: must look like an email address
print("\nEmail example")
email_address = pyip.inputEmail("Enter your email:")
print("\nYour email is ", email_address)


Numerical example
Enter your lucky number from 0 to 10


Your lucky number is  1

Multi. Choice example
Please select one of the following:
A. Ronaldo
B. LBJ
C. Elon
'2' is not a valid choice.
Please select one of the following:
A. Ronaldo
B. LBJ
C. Elon

Your fav athlete is  Ronaldo

Email example
Enter your email:
Your email is  dsa@dd.cc


In [10]:
# Error handling

# Check for computing error
# Two things can go wrong: the text is not an integer (ValueError) or the
# integer is zero (ZeroDivisionError). The success message lives in `else`
# so it only runs when neither happened.
try:
    num = int(input("Enter a number: "))
    result = int(10 / num)
except ValueError:
    print("Plz enter a number")
except ZeroDivisionError:
    print("Plz dun enter 0")
else:
    print("10 divided by the number you entered is ", result)

# Check for logical error (no error message but incorrect result)
# The sequence is called `numbers`, not `list`: rebinding the name `list`
# would hide the built-in type for every later cell in the kernel.
numbers = [1, 2, 3]
numbers.reverse()
print(numbers)
try:
    # Invariant: the sequence is expected to be in ascending order.
    assert numbers[0] <= numbers[-1]
except AssertionError:
    print("error spotted")
    # Repair the data so the invariant holds again.
    numbers.sort()
# Re-check after the repair; this time it must pass.
assert numbers[0] <= numbers[-1]
print("fine")

10 divided by the number you entered is  10
[3, 2, 1]
error spotted
fine


## 2. Web Scraping using BeautifulSoup
<a id="web_scrap"></a>

In [11]:
# Install package
import sys
# !{sys.executable} -m pip install numpy

In [12]:
# Update package
# import sys
# !{sys.executable} -m pip uninstall numpy pandas -y
# !{sys.executable} -m pip install numpy pandas

In [13]:
# Web scrapping - 1 
# import from HTML

import requests
from bs4 import BeautifulSoup
import pandas

url="https://maec.hkust.edu.hk/curriculum-structure-2024-25"

# Get the html data from the url.
# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() stops here on 4xx/5xx instead of saving an error page.
response = requests.get(url, headers={"Accept": "text/html"}, timeout=30)
response.raise_for_status()

# Parse the text data using beautifulsoup
parsed_response = BeautifulSoup(response.text, "html.parser")

# format the html text (prettify() already returns a str)
formatted_response = parsed_response.prettify()

# write the response to a HTML file
with open("MAEC_Curriculum_scrapped.html", "w", encoding="utf-8") as file:
    file.write(formatted_response)

# print(parsed_response)

# Parse the specified text: every <tr> whose class is "field_item"
table_subject_area = parsed_response.find_all("tr", class_="field_item")

# print(table_subject_area)


In [14]:
import sys
# !{sys.executable} -m pip install pandas

In [15]:
# Web scrapping-2
# Scrap specific data

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://maec.hkust.edu.hk/curriculum-structure-2024-25"

# Scrap data from the web.
# timeout: do not hang forever on an unresponsive server.
# raise_for_status: fail loudly on 4xx/5xx instead of parsing an error page.
response = requests.get(url, headers={"Accept": "text/html"}, timeout=30)
response.raise_for_status()
parsed_response = BeautifulSoup(response.text, "html.parser")

# search for the specific data
MAEC_curriculum_div = parsed_response.find_all("div", class_="field field--name-field-mod-long-fmt field--type-text-long field--label-visually_hidden mtpc-textarea mtpc-block-textare-block-textarea")

# list to store scrapped data
data = []

for div in MAEC_curriculum_div:
    table = div.find("table")
    if table:
        rows = table.find_all("tr")[1:] #skip the header
        for row in rows:
            cols = row.find_all("td")
            # Rows with fewer than two cells cannot hold a (subject, count) pair.
            if len(cols) >= 2:
                subject_area = cols[0].text.strip()
                no_of_courses = cols[1].text.strip()
                data.append([subject_area, no_of_courses])

        # Convert data into dataframe, then HTML
        df = pd.DataFrame(data, columns=["Subject Area", "No. of Courses"])
        df.to_html("MAEC_curriculum_subject_courses.html", index=False)
        # Only the first div that contains a table is exported.
        break

In [16]:
# Web scrapping-3
# Scrap through multi page

import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

# for loop to iterate over the range of pages you want to scrape from
# Each entry is the suffix appended to the base curriculum URL.
list_of_curriculum = ["","0","1","2024-25"]

# class attribute of the <div> that wraps the curriculum table
CURRICULUM_DIV_CLASS = "field field--name-field-mod-long-fmt field--type-text-long field--label-visually_hidden mtpc-textarea mtpc-block-textare-block-textarea"

for academic_year in list_of_curriculum:
    url = "https://maec.hkust.edu.hk/curriculum-structure-" + academic_year

    # Scrap data ("text/html" is the valid media type; timeout avoids hanging)
    response = requests.get(url, headers={"Accept": "text/html"}, timeout=30)
    if not response.ok:
        # One missing page should not abort the remaining pages.
        print("Skipping", url, "- HTTP status", response.status_code)
        continue
    parsed_response = BeautifulSoup(response.text, "html.parser")

    #Search for the subject and number of courses from the text
    MAEC_parsed_div = parsed_response.find_all("div", class_=CURRICULUM_DIV_CLASS)

    # create an empty list to store data
    data = []

    # Iterate the divs of *this* page. Using a result list left over from an
    # earlier cell would export the same table for every academic year.
    for div in MAEC_parsed_div:
        table = div.find("table")
        if table:
            rows = table.find_all("tr")[1:] #remove the header
            for row in rows:
                cols = row.find_all("td")
                # Rows with fewer than two cells cannot hold a (subject, count) pair.
                if len(cols) < 2:
                    continue
                subject_area = cols[0].text.strip()
                num_of_course = cols[1].text.strip()
                data.append([subject_area, num_of_course])
            df = pd.DataFrame(data, columns=["Subjects of Area", "No. of Courses"])
            file_name = "MAEC_Curriculum_subject_courses-" + academic_year + ".html"
            df.to_html(file_name, index=False)
            # Only the first div that contains a table is exported per page.
            break


## 3. Auto Web Browsing with Selenium
<a id="selenium"></a>

In [3]:
# Install package
import sys
!{sys.executable} -m pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.0.1 webdriver_manager-4.0.2



[notice] A new release of pip is available: 23.3.1 -> 25.0.1
[notice] To update, run: C:\Users\desti\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [11]:
# Basic browser interaction

# import relevant libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from time import sleep

# Define URL
url = "https://ecommerce-playground.lambdatest.io/index.php?route=account/register"

# open a browser on chrome
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
# try/finally guarantees the browser is shut down even when an element lookup
# or click raises; otherwise a failed run leaves a Chrome process behind.
try:
    driver.maximize_window()
    driver.get(url)

    # enter the first name field
    first_name = driver.find_element(By.XPATH, '//*[@id="input-firstname"]')
    first_name.send_keys("Email") #fill out data

    # enter the last name field
    last_name = driver.find_element(By.XPATH, '//*[@id="input-lastname"]')
    last_name.send_keys("Kam") #fill out data

    # enter the email field
    email = driver.find_element(By.XPATH, '//*[@id="input-email"]')
    email.send_keys("kam@email.on.dog") #fill out data

    # enter the telephone field
    # Own variable and a numeric placeholder: reusing `first_name` and typing
    # "Email" here was a copy-paste slip from the first-name step.
    telephone = driver.find_element(By.XPATH, '//*[@id="input-telephone"]')
    telephone.send_keys("0123456789") #fill out data

    # enter the password field
    password = driver.find_element(By.XPATH, '//*[@id="input-password"]')
    password.send_keys("1234") #fill out data

    # enter the password confirmed field
    password_confirm = driver.find_element(By.XPATH, '//*[@id="input-confirm"]')
    password_confirm.send_keys("1234") #fill out data

    # click to the newsletter subscribe button
    newsletter_subscribe = driver.find_element(By.XPATH, '//*[@id="content"]/form/fieldset[3]/div/div/div[2]/label')
    newsletter_subscribe.click()

    # click agree button
    agree = driver.find_element(By.XPATH, '//*[@id="content"]/form/div/div/div/label')
    agree.click()

    # click continue button (submits the form)
    continue_button = driver.find_element(By.XPATH, '//*[@id="content"]/form/div/div/input')
    continue_button.click()

    #scroll down by 200 units to view the lower part of the page
    driver.execute_script("window.scrollTo(0, window.scrollY + 200)")

    # pause for 5 sec to view the result
    sleep(5)
finally:
    # close the driver
    driver.quit()


In [13]:
# drag and drop

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep

# Define URL
url = "http://dhtmlgoodies.com/scripts/drag-drop-custom/demo-drag-drop-3.html"

# open a browser on chrome
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
# try/finally guarantees the browser is shut down even if a lookup or the
# drag action raises.
try:
    driver.maximize_window()
    driver.get(url)

    # find the source for drag
    source = driver.find_element(By.XPATH, '//*[@id="box3"]')

    # find the destination for drag
    destination = driver.find_element(By.XPATH, '//*[@id="box103"]')

    # perform the drag and drop; nothing happens until perform() is called
    actions = ActionChains(driver)
    actions.drag_and_drop(source, destination).perform()

    # pause for 5 sec to view the result
    sleep(5)
finally:
    # close the driver
    driver.quit()

In [17]:
# Explicit wait

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep

# Define URL
url = "https://the-internet.herokuapp.com/dynamic_controls"

# open a browser on chrome
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
# try/finally guarantees the browser is shut down even if a lookup, click or
# wait timeout raises.
try:
    driver.maximize_window()
    driver.get(url)

    # define a wait: poll for up to 10 seconds before raising TimeoutException
    wait = WebDriverWait(driver, 10)

    # find the Enable button and click
    enable_button = driver.find_element(By.XPATH, '//*[@id="input-example"]/button')
    enable_button.click()
    sleep(3)

    # wait until the same button is clickable again, then click it a second time
    disable_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input-example"]/button')))
    disable_button.click()
    sleep(3)

    # find the remove button and click
    remove_button = driver.find_element(By.XPATH, '//*[@id="checkbox-example"]/button')
    remove_button.click()
    sleep(3)

    # wait until the same button is clickable again, then click it (add);
    # an explicit wait does not depend on the fixed sleep being long enough
    add_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="checkbox-example"]/button')))
    add_button.click()
    sleep(3)

    # click the checkbox once it is clickable
    checkbox = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="checkbox"]')))
    checkbox.click()
    sleep(3)
finally:
    # close the driver
    driver.quit()

## 4. API
<a id="API"></a>


In [1]:
# API request
import requests

# Define URL
url = "https://api.upcitemdb.com/prod/trial/lookup"

# define parameters
parameters = {"upc": "025000044908"}

# make API request, passing in base URL and parameters;
# requests encodes `params` into the query string, and the timeout keeps the
# cell from hanging forever if the server does not answer
response = requests.get(url, params=parameters, timeout=30)

# print out the response URL (base URL plus the encoded query string)
print(response.url)


https://api.upcitemdb.com/prod/trial/lookup?upc=025000044908


In [7]:
# Parse API response in Python with JSON
# API request
import requests
import json

# Look up several products through the same endpoint. The request/parse logic
# lives in one helper instead of being copy-pasted per product.
UPC_LOOKUP_URL = "https://api.upcitemdb.com/prod/trial/lookup"

# UPC codes to look up, in display order
EXAMPLE_UPCS = ["025000044908", "028400516686"]


def lookup_product(upc):
    """Return the first item the UPC lookup API reports for `upc`.

    Args:
        upc: UPC code as a string.

    Returns:
        The first element of the response's "items" list (a dict), or None
        when the response contains no items.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    # make API request, passing in base URL and parameters
    response = requests.get(UPC_LOOKUP_URL, params={"upc": upc}, timeout=30)
    # Surface HTTP errors here rather than as a confusing KeyError below.
    response.raise_for_status()

    # parse the text from the API response using JSON schema
    info = json.loads(response.text)
    items = info.get("items") or []
    return items[0] if items else None


for example_number, upc in enumerate(EXAMPLE_UPCS, start=1):
    print(f"\nProduct example {example_number}\n")
    item = lookup_product(upc)
    if item is None:
        print("No product found for UPC", upc)
        continue

    #print the title and brand of the product
    title = item["title"]
    brand = item["brand"]
    print("title:", title)
    print("brand:", brand)


Product example 1

title: Simply Lemonade w/ Raspberry Bottle, 52 fl oz
brand: SIMPLY

Product example 2

title: Ruffles Potato Chips Original Snack Chips  8.5 Ounce Bag
brand: Ruffles


In [19]:
# Use API key 
import requests

# Define URL (https, so the API key is not sent in clear text)
import os

url = "https://api.openweathermap.org/data/2.5/forecast"

# define parameters
# The API key is read from the environment instead of being hard-coded, so it
# is not stored in the notebook. Set OPENWEATHERMAP_API_KEY before running;
# a missing variable raises KeyError. Any key that was previously committed
# in source should be treated as leaked and revoked.
parameters = {"q":"Hong Kong,HK", "appid": os.environ["OPENWEATHERMAP_API_KEY"]}

# make API request, passing in base URL and parameters
response = requests.get(url, params=parameters, timeout=30)

# print out the raw response body
print(response.text)


{"cod":"200","message":0,"cnt":40,"list":[{"dt":1739620800,"main":{"temp":294.25,"feels_like":294.5,"temp_min":292.02,"temp_max":294.25,"pressure":1014,"sea_level":1014,"grnd_level":1009,"humidity":80,"temp_kf":2.23},"weather":[{"id":800,"main":"Clear","description":"clear sky","icon":"01n"}],"clouds":{"all":1},"wind":{"speed":2.06,"deg":166,"gust":2.1},"visibility":10000,"pop":0,"sys":{"pod":"n"},"dt_txt":"2025-02-15 12:00:00"},{"dt":1739631600,"main":{"temp":293.07,"feels_like":293.34,"temp_min":291.92,"temp_max":293.07,"pressure":1015,"sea_level":1015,"grnd_level":1011,"humidity":85,"temp_kf":1.15},"weather":[{"id":800,"main":"Clear","description":"clear sky","icon":"01n"}],"clouds":{"all":2},"wind":{"speed":0.49,"deg":208,"gust":0.7},"visibility":10000,"pop":0,"sys":{"pod":"n"},"dt_txt":"2025-02-15 15:00:00"},{"dt":1739642400,"main":{"temp":291.76,"feels_like":291.97,"temp_min":291.76,"temp_max":291.76,"pressure":1016,"sea_level":1016,"grnd_level":1010,"humidity":88,"temp_kf":0},"w

In [None]:
# Connect API
import requests

# Define URL (https, so the API key is not sent in clear text)
import os

url = "https://api.openweathermap.org/data/2.5/forecast"

# define parameters
# The API key is read from the environment instead of being hard-coded, so it
# is not stored in the notebook. Set OPENWEATHERMAP_API_KEY before running;
# a missing variable raises KeyError.
parameters = {"q":"Hong Kong,HK", "appid": os.environ["OPENWEATHERMAP_API_KEY"]}

# make API request, passing in base URL and parameters
response = requests.get(url, params=parameters, timeout=30)


