In [10]:
import os
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import IntegerType,DoubleType
from pyspark.sql.functions import col


In [32]:
input = "input_data"
output = "output_data"
input_path = os.path.join(os.curdir, input)
output_path = os.path.join(os.curdir, output)
if not os.path.exists(input_path):
    os.mkdir(input_path)
    print(f"Directory created: {input}")

if not os.path.exists(output_path):
    os.mkdir(output_path)
    print(f"Directory created: {output}")

Directory created: input_data
Directory created: output_data


In [85]:
def get_category(url, file_name):
    product_list = []

    pages = 150

    for page in range(1, pages):
        resp = requests.get(url+f"?page={page}")
        content = BeautifulSoup(resp.content, "html.parser")
        for product in content.select("div.product__content"):
            try:
                data = {
                    "Title": product.select("span.product__title")[0].get_text().strip(),
                    "Location": product.select("p.product__location")[0].get_text().strip(),
                    "description": product.select("p.product__description")[0].get_text().strip(),
                    "Condition": product.select("div.product__tags span")[0].get_text().strip()
                }
            except IndexError:
                continue
        
            product_list.append(data)
        time.sleep(2)
    dataframe = pd.DataFrame(product_list)
    print(dataframe)
    dataframe.to_csv(file_name)

url ="https://tonaton.com/c_vehicles"

get_category(url, f"pro.csv")

           Title                      Location  \
0     GH₵ 22,000  Ashanti, Kumasi Metropolitan   
1        GH₵ 450   Greater Accra, Abossey Okai   
2        GH₵ 160        Greater Accra, Spintex   
3        GH₵ 350   Greater Accra, Abossey Okai   
4      GH₵ 2,200       Greater Accra, Darkuman   
...          ...                           ...   
2634   GH₵ 1,800   Greater Accra, Abossey Okai   
2635     GH₵ 600   Greater Accra, Abossey Okai   
2636   GH₵ 1,200   Greater Accra, Abossey Okai   
2637     GH₵ 500   Greater Accra, Abossey Okai   
2638     GH₵ 750   Greater Accra, Abossey Okai   

                                            description      Condition  
0         Toyota Corolla 1.6 Sedan Automatic 2002 Black  Ghanaian Used  
1                 Car Mp5 Player With Bluetooth and GPS      Brand New  
2                                        Foyu Cam Coder      Brand New  
3                                            Sun Shades           Used  
4                            Origi

In [6]:

conf = SparkConf() \
    .setAppName("jiji-scraper-etl") \
    .setMaster("local") \
    .set("spark.driver.extraClassPath","c:/pyspark/*")
sc = SparkContext.getOrCreate(conf=conf)
etl = SparkSession(sc)

In [41]:
df=etl.read.option("header",True).csv("pro.csv")

In [42]:
df.toPandas()

Unnamed: 0,_c0,Title,Location,description,Condition
0,0,"GH₵ 22,000","Ashanti, Kumasi Metropolitan",Toyota Corolla 1.6 Sedan Automatic 2002 Black,Ghanaian Used
1,1,GH₵ 450,"Greater Accra, Abossey Okai",Car Mp5 Player With Bluetooth and GPS,Brand New
2,2,GH₵ 160,"Greater Accra, Spintex",Foyu Cam Coder,Brand New
3,3,GH₵ 350,"Greater Accra, Abossey Okai",Sun Shades,Used
4,4,"GH₵ 2,200","Greater Accra, Darkuman",Original Rims All in Sizes,Brand New
...,...,...,...,...,...
2637,2634,"GH₵ 1,800","Greater Accra, Abossey Okai",Highlander 2018 Boot Light,Brand New
2638,2635,GH₵ 600,"Greater Accra, Abossey Okai",Accent 2019 Front Grille,Brand New
2639,2636,"GH₵ 1,200","Greater Accra, Abossey Okai",Jeep/ Grand Cherokee Front Upper Arm,Brand New
2640,2637,GH₵ 500,"Greater Accra, Abossey Okai",Kia Canivan New Type,Used


In [43]:
df=df.select(
    "description",
    "Condition",
    "title",
    "Location"
).withColumnRenamed('title', 'Price').withColumnRenamed('description', 'Description').\
    withColumn('Price', regexp_replace('Price', r'[GH₵]', '')).withColumn("Price",col("Price").cast(DoubleType()))

In [44]:
df.printSchema()

root
 |-- Description: string (nullable = true)
 |-- Condition: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Location: string (nullable = true)



In [45]:
df = df.toPandas()
df.to_csv(output_path+'/jiji_output.csv')