In [1]:
import requests
import json

from pyspark.sql.types import IntegerType, StringType, StructType, StructField
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

In [73]:
# Create (or reuse, if one already exists in this kernel) the SparkSession
# used by every later cell. It runs against a local master under the
# application name "Pokemon Go ELT".
spark = (
    SparkSession.builder
    .appName("Pokemon Go ELT")
    .master("local")
    .getOrCreate()
)

In [74]:
# Download the Pokemon base stats from the PoGo API.
# The timeout (seconds) keeps this cell from hanging forever when the server
# accepts the connection but never answers.
response = requests.get('https://pogoapi.net/api/v1/pokemon_stats.json', timeout=30)

# Keep the explicit, user-facing message for the "server down" case.
if response.status_code == 500:
    print('O servidor não está acessível, erro 500.')

# Stop here on ANY HTTP error (4xx/5xx, including the 500 reported above)
# instead of falling through and trying to parse an error page as JSON.
response.raise_for_status()

# Parsed payload: a list of dicts, one per (pokemon, form) combination.
pokemon_stats = response.json()

print(json.dumps(pokemon_stats, indent=4))

[
    {
        "base_attack": 118,
        "base_defense": 111,
        "base_stamina": 128,
        "form": "Fall_2019",
        "pokemon_id": 1,
        "pokemon_name": "Bulbasaur"
    },
    {
        "base_attack": 118,
        "base_defense": 111,
        "base_stamina": 128,
        "form": "Normal",
        "pokemon_id": 1,
        "pokemon_name": "Bulbasaur"
    },
    {
        "base_attack": 118,
        "base_defense": 111,
        "base_stamina": 128,
        "form": "Purified",
        "pokemon_id": 1,
        "pokemon_name": "Bulbasaur"
    },
    {
        "base_attack": 118,
        "base_defense": 111,
        "base_stamina": 128,
        "form": "Shadow",
        "pokemon_id": 1,
        "pokemon_name": "Bulbasaur"
    },
    {
        "base_attack": 151,
        "base_defense": 143,
        "base_stamina": 155,
        "form": "Normal",
        "pokemon_id": 2,
        "pokemon_name": "Ivysaur"
    },
    {
        "base_attack": 151,
        "base_defense": 143,
  

In [75]:
# The schema lists only the two fields this notebook keeps from each API
# record; both are declared nullable.
pokemon_stats_schema = StructType([
    StructField('pokemon_id', IntegerType(), True),
    StructField('pokemon_name', StringType(), True)
])

pokemon_stats_dataframe = spark.createDataFrame(pokemon_stats, pokemon_stats_schema)

# The API returns one record per form (Normal, Shadow, ...), so the same
# (pokemon_id, pokemon_name) pair repeats; keep a single row per pair and
# attach a 1-based sequential surrogate key named `id`.
#
# Columns are selected by name rather than through `col(...)`: `col` is not
# imported anywhere in this notebook, so the previous form raised NameError
# on a fresh kernel.
#
# NOTE(review): `id` follows the row order left after dropDuplicates, which is
# arbitrary (see the output below: 138, 354, 404, ...). If stable ids matter,
# order the window by `pokemon_id` instead.
result = pokemon_stats_dataframe \
    .dropDuplicates(['pokemon_id', 'pokemon_name']) \
    .select((row_number().over(Window.orderBy(monotonically_increasing_id()))).alias('id'),
            'pokemon_id',
            'pokemon_name')

result.show(5)

+---+----------+------------+
| id|pokemon_id|pokemon_name|
+---+----------+------------+
|  1|       138|     Omanyte|
|  2|       354|     Banette|
|  3|       404|       Luxio|
|  4|       469|     Yanmega|
|  5|         8|   Wartortle|
+---+----------+------------+
only showing top 5 rows



In [76]:
# Print the column names, types and nullability of the de-duplicated result.
result.printSchema()

root
 |-- id: integer (nullable = true)
 |-- pokemon_id: integer (nullable = true)
 |-- pokemon_name: string (nullable = true)



In [77]:
# Convert the Spark result to a pandas DataFrame and write it as a single CSV
# file named 'pokemon_stats.csv'; index=False leaves the pandas index out of
# the file.
result.toPandas().to_csv('pokemon_stats.csv', index=False)

In [78]:
# Read the exported CSV back with Spark, treating the first line as the header.
# NOTE(review): no schema or inferSchema is passed here, so the column types of
# `proof` may differ from those of `result` -- confirm with proof.printSchema()
# if types matter.
proof = spark.read.csv('pokemon_stats.csv', header=True)

In [79]:
# Display the first 5 rows read back from the CSV to compare with `result`.
proof.show(5)

+---+----------+------------+
| id|pokemon_id|pokemon_name|
+---+----------+------------+
|  1|       138|     Omanyte|
|  2|       354|     Banette|
|  3|       404|       Luxio|
|  4|       469|     Yanmega|
|  5|         8|   Wartortle|
+---+----------+------------+
only showing top 5 rows

