In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("PokemonAnalysis")
    .getOrCreate()
)

# Read the Pokémon CSV: the first row provides the column names and the
# column types are inferred from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/app/Pokemon.csv')
)

# Preview the first rows of the loaded dataset
df.show()


+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|
|  4|          Charmander|  Fire|  NULL|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon|  Fire|  NULL|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Cha

In [2]:
from pyspark.sql.functions import col

# Top five non-legendary Pokémon, ranked by Total stats (highest first).
# "Legendary" is loaded with an inferred schema and displays as true/false,
# so the column is negated directly instead of being compared with the
# string "False", which only matched through an implicit string-to-boolean
# cast.
top_5_strongest = (
    df.filter(~col("Legendary"))
    .sort(col("Total").desc())
    .limit(5)
)
top_5_strongest.show()


+---+--------------------+------+-------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1| Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+-------+-----+---+------+-------+-------+-------+-----+----------+---------+
|248|TyranitarMega Tyr...|  Rock|   Dark|  700|100|   164|    150|     95|    120|   71|         2|    false|
|373|SalamenceMega Sal...|Dragon| Flying|  700| 95|   145|    130|    120|     90|  120|         3|    false|
|376|MetagrossMega Met...| Steel|Psychic|  700| 80|   145|    150|    105|    110|  110|         3|    false|
|445|GarchompMega Garc...|Dragon| Ground|  700|108|   170|    115|    120|     95|   92|         4|    false|
|289|             Slaking|Normal|   NULL|  670|150|   160|    100|     95|     65|  100|         3|    false|
+---+--------------------+------+-------+-----+---+------+-------+-------+-------+-----+----------+---------+



In [3]:
from pyspark.sql.functions import avg

# Average HP per primary type ("Type 1"), ordered from highest to lowest;
# only the top-ranked type is kept
highest_avg_hp = (
    df.groupBy("Type 1")
    .agg(avg("HP").alias("Avg_HP"))
    .orderBy(col("Avg_HP").desc())
    .limit(1)
)
highest_avg_hp.show()


+------+-------+
|Type 1| Avg_HP|
+------+-------+
|Dragon|83.3125|
+------+-------+



In [8]:
from pyspark.sql.functions import col

# "Sp. Atk" is treated here as the stat column of the dataset, not a move.
# The column name contains a dot, so it is wrapped in backticks to be
# referenced as a single column name.
# Count how many rows share each Sp. Atk value and keep the most frequent one.
most_common_sp_atk = (
    df.groupBy("`Sp. Atk`")
    .count()
    .orderBy(col("count").desc())
    .limit(1)
)
most_common_sp_atk.show()


+-------+-----+
|Sp. Atk|count|
+-------+-----+
|     60|   51|
+-------+-----+

