In [115]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import IntegerType,DoubleType
from pyspark.sql.functions import col
from pyspark.sql import functions as F
import sys
import os


In [118]:
# Make the Spark workers and the driver use the same interpreter that runs this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Parenthesised chain instead of backslash continuations: a trailing "\" followed by a
# blank line is easy to break when a setting is added or removed.
conf = (
    SparkConf()
    .setAppName("app")
    .setMaster("local")
    .set("spark.driver.extraClassPath", "c:/pyspark/*")
    .set("spark.driver.memory", "5g")
)

sc = SparkContext.getOrCreate(conf=conf)
# Use the public builder API rather than the SparkSession(sc) constructor so that
# re-running this cell reuses the existing session instead of creating another one.
etl = SparkSession.builder.config(conf=conf).getOrCreate()
etl

In [119]:
# Load the scraped listings; the first CSV row holds the column names.
df = etl.read.csv("./output_data/jiji_output.csv", header=True)

In [120]:
# Project the frame down to the five listing fields used below, then print its schema.
df = df.select('id', 'Description', 'Condition', 'Price', 'Location')
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Condition: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Location: string (nullable = true)



In [121]:
# Preview the first 20 rows with long cell values truncated (the defaults, spelled out).
df.show(n=20, truncate=True)

+---+--------------------+------------+------+--------------------+
| id|         Description|   Condition| Price|            Location|
+---+--------------------+------------+------+--------------------+
|  0|Mercedes-Benz E30...|Foreign Used|355000|Greater Accra, Ea...|
|  1|Toyota Ignition Coil|        Used|   150|Greater Accra, Ab...|
|  2|        Toyota Tyres|   Brand New|   250|Greater Accra, Ab...|
|  3|     Ford F150 Tires|   Brand New|  1100|Greater Accra, Ab...|
|  4|Jack and Wheel Sp...|   Brand New|   200|Greater Accra, Ab...|
|  5|       Seat Covers20|   Brand New|  1000|Greater Accra, Ab...|
|  6|Luxury Black 9D S...|   Brand New|  1000|Greater Accra, Ab...|
|  7|All Kinds of Body...|   Brand New|   350|Greater Accra, Ac...|
|  8|  Lower Arm Bushings|   Brand New|   100|Greater Accra, Ab...|
|  9|(Corolla-2020)All...|        Used| 20000|Greater Accra, Ab...|
| 10|Rim 15 Ring Wheel...|   Brand New|   300|Greater Accra, Ab...|
| 11|   Gucci Seat Covers|   Brand New|   600|Gr

In [123]:
# Number of listings per Condition value, smallest group first.
df.groupBy('Condition').count().orderBy('count').show()

+-------------+-----+
|    Condition|count|
+-------------+-----+
|          New|    1|
| Foreign Used|    6|
|Ghanaian Used|    7|
|         Used|  611|
|    Brand New| 1899|
+-------------+-----+



In [124]:
from pyspark.sql.functions import isnan, when, count, col

# Count missing values per column.
# Every column here is a string read from CSV, so a missing cell is a SQL NULL, not a
# floating-point NaN. isnan() never matches NULL, which made the previous check report 0
# for every column regardless of the data. Test for NULL explicitly instead.
# when(..., 1) yields 1 for NULL cells and NULL otherwise; count() tallies the non-NULL 1s.
df.select([count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]).toPandas().head()


Unnamed: 0,id,Description,Condition,Price,Location
0,0,0,0,0,0


In [129]:
def udf_region(location):
    """Map a raw listing location string to a coarse region label.

    Args:
        location: Location text (e.g. "Greater Accra, ...") or None.

    Returns:
        "Greater Accra region", "Ashanti region" or "Eastern region" when
        ``location`` starts with "Greater", "Ashanti" or "Eastern" respectively.
        None when ``location`` is None or matches none of those prefixes.
    """
    # A NULL cell reaches a Python UDF as None; without this guard the
    # .startswith call below raises AttributeError and fails the whole job.
    if location is None:
        return None

    prefix_to_region = (
        ("Greater", "Greater Accra region"),
        ("Ashanti", "Ashanti region"),
        ("Eastern", "Eastern region"),
    )
    for prefix, region in prefix_to_region:
        if location.startswith(prefix):
            return region

    # Unrecognised locations stay None (shown as null in Spark), as before.
    return None

# Quick sanity check of the mapping on a plain Python string.
udf_region(location="Greater Accra")

'Greater Accra region'

In [136]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap udf_region as a Spark UDF returning a string column.
# It is bound to its own name: assigning the result back to `udf` would shadow the
# imported `udf` factory, so any later `udf(...)` call would hit the wrapped function.
region_udf = udf(udf_region, StringType())

# Derive the coarse region from Location, then drop the raw Location column.
# NOTE: this reassigns `df`, so the cell cannot be re-run on its own once Location is gone.
df = df.withColumn('Regions', region_udf('Location')).drop('Location')

In [138]:
# Preview the first 20 rows with long cell values truncated (the defaults, spelled out).
df.show(n=20, truncate=True)

+---+--------------------+------------+------+--------------------+--------------------+
| id|         Description|   Condition| Price|            Location|             Regions|
+---+--------------------+------------+------+--------------------+--------------------+
|  0|Mercedes-Benz E30...|Foreign Used|355000|Greater Accra, Ea...|Greater Accra region|
|  1|Toyota Ignition Coil|        Used|   150|Greater Accra, Ab...|Greater Accra region|
|  2|        Toyota Tyres|   Brand New|   250|Greater Accra, Ab...|Greater Accra region|
|  3|     Ford F150 Tires|   Brand New|  1100|Greater Accra, Ab...|Greater Accra region|
|  4|Jack and Wheel Sp...|   Brand New|   200|Greater Accra, Ab...|Greater Accra region|
|  5|       Seat Covers20|   Brand New|  1000|Greater Accra, Ab...|Greater Accra region|
|  6|Luxury Black 9D S...|   Brand New|  1000|Greater Accra, Ab...|Greater Accra region|
|  7|All Kinds of Body...|   Brand New|   350|Greater Accra, Ac...|Greater Accra region|
|  8|  Lower Arm Bush

In [139]:
# Number of listings per derived region, smallest group first.
df.groupBy('Regions').count().orderBy('count').show()

+--------------------+-----+
|             Regions|count|
+--------------------+-----+
|      Ashanti region|    2|
|                null|    6|
|Greater Accra region| 2516|
+--------------------+-----+

