In [10]:
import json
from datetime import date, datetime

import numpy
import requests
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, monotonically_increasing_id, current_date, row_number, udf
from pyspark.sql.types import IntegerType, StringType, DateType, DoubleType, LongType, StructType, StructField

In [11]:
# Today's date formatted as a DD/MM/YYYY string.
today = date.today().strftime('%d/%m/%Y')

# Spark session for the ETL, running against a local master.
spark = (
    SparkSession.builder
    .appName("Pokemon Go ETL")
    .master("local")
    .getOrCreate()
)

# Handle on the session's underlying SparkContext.
context = spark.sparkContext

In [12]:
# Download the fast-moves catalogue and bind the parsed JSON body to `dataset`.
# The timeout stops the cell from hanging forever on an unresponsive server,
# and raise_for_status() surfaces an HTTP error here instead of letting an
# error payload reach the DataFrame construction further down.
response = requests.get('https://pogoapi.net/api/v1/fast_moves.json', timeout=30)
response.raise_for_status()
dataset = response.json()

In [17]:
# Schema applied to the downloaded records. The `id`, `total_damage` and
# `created_at` columns are (re)computed right after the DataFrame is built.
# `id` is declared as long so the schema matches the column produced below.
pokemon_schema = StructType([
    StructField('id', LongType(), True),
    StructField('stamina_loss_scaler', DoubleType(), True),
    StructField('name', StringType(), True),
    StructField('power', LongType(), True),
    StructField('duration', LongType(), True),
    StructField('energy_delta', LongType(), True),
    StructField('type', StringType(), True),
    StructField('total_damage', LongType(), True),
    StructField('created_at', DateType(), True)
])

dataframe = spark.createDataFrame(dataset, pokemon_schema)

# monotonically_increasing_id() is unique and increasing but NOT consecutive
# once the data spans more than one partition. Numbering the rows over a window
# ordered by that id gives a gap-free 1..N sequence regardless of partitioning.
# The un-partitioned window pulls all rows into one partition, which is fine
# for a dataset this small.
id_window = Window.orderBy(monotonically_increasing_id())

dataframe = (
    dataframe
    # row_number() starts at 1 and yields an int; cast keeps the column long.
    .withColumn('id', row_number().over(id_window).cast('long'))
    .withColumn('total_damage', col('power') * col('duration'))
    .withColumn('created_at', current_date())
)

dataframe.show(5)

+---+-------------------+-------------+-----+--------+------------+------+------------+----------+
| id|stamina_loss_scaler|         name|power|duration|energy_delta|  type|total_damage|created_at|
+---+-------------------+-------------+-----+--------+------------+------+------------+----------+
|  1|               0.01|  Fury Cutter|    3|     400|           6|   Bug|        1200|2020-09-03|
|  2|               0.01|     Bug Bite|    5|     500|           6|   Bug|        2500|2020-09-03|
|  3|               0.01|         Bite|    6|     500|           4|  Dark|        3000|2020-09-03|
|  4|               0.01| Sucker Punch|    7|     700|           8|  Dark|        4900|2020-09-03|
|  5|               0.01|Dragon Breath|    6|     500|           4|Dragon|        3000|2020-09-03|
+---+-------------------+-------------+-----+--------+------------+------+------------+----------+
only showing top 5 rows



In [18]:
# Print the column names, types and nullability of the transformed DataFrame.
dataframe.printSchema()

root
 |-- id: long (nullable = false)
 |-- stamina_loss_scaler: double (nullable = true)
 |-- name: string (nullable = true)
 |-- power: long (nullable = true)
 |-- duration: long (nullable = true)
 |-- energy_delta: long (nullable = true)
 |-- type: string (nullable = true)
 |-- total_damage: long (nullable = true)
 |-- created_at: date (nullable = false)



In [19]:
# Convert the Spark DataFrame to pandas, then write it to a single CSV file
# without the pandas index column.
pandas_frame = dataframe.toPandas()
pandas_frame.to_csv('pokemon.csv', index=False)

In [20]:
# Read the exported CSV back into Spark. Without an explicit schema every
# column is loaded as a string, so the schema defined for the ETL is passed
# to keep the numeric and date types intact across the round trip.
df = spark.read.csv('pokemon.csv', header=True, schema=pokemon_schema)

In [21]:
# Preview the first five rows of the DataFrame loaded from the CSV file.
df.show(5)

+---+-------------------+-------------+-----+--------+------------+------+------------+----------+
| id|stamina_loss_scaler|         name|power|duration|energy_delta|  type|total_damage|created_at|
+---+-------------------+-------------+-----+--------+------------+------+------------+----------+
|  1|               0.01|  Fury Cutter|    3|     400|           6|   Bug|        1200|2020-09-03|
|  2|               0.01|     Bug Bite|    5|     500|           6|   Bug|        2500|2020-09-03|
|  3|               0.01|         Bite|    6|     500|           4|  Dark|        3000|2020-09-03|
|  4|               0.01| Sucker Punch|    7|     700|           8|  Dark|        4900|2020-09-03|
|  5|               0.01|Dragon Breath|    6|     500|           4|Dragon|        3000|2020-09-03|
+---+-------------------+-------------+-----+--------+------------+------+------------+----------+
only showing top 5 rows

