In [2]:
pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 42 kB/s s eta 0:00:01     |█████████████████████████▍      | 223.6 MB 37.4 MB/s eta 0:00:02
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 33.6 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845513 sha256=0a55c881afac086d37decae8a5edf62a0e5d25031b8a43e647fe7980cbbd7947
  Stored in directory: /home/emr-notebook/.cache/pip/wheels/42/59/f5/79a5bf931714dcd201b26025347785f087370a10a3329a899c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1
Note: you may need to restart the

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import os
import configparser
import pyspark.sql.functions as F


In [2]:
# The hadoop-aws artifact must match the Hadoop version of the jars Spark runs
# with. Pairing hadoop-aws 2.7.x with Hadoop 3.x jars is a known cause of
# `IllegalArgumentException: For input string: "64M"`: the Hadoop 3 defaults
# express S3A sizes with a unit suffix that the 2.7 connector cannot parse.
# NOTE(review): 3.3.2 is assumed to be the Hadoop version bundled with the
# pip-installed pyspark 3.3.1 - confirm against the jars of the installation.
# `spark.jars.packages` only takes effect when the JVM is launched, so restart
# the kernel before re-running this cell with a different value.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.2")
    .getOrCreate()
)

In [5]:
# One day's event log file (2018-11-13), addressed through the s3a connector.
EVENT_DATA = "s3a://udacity-dend/log_data/2018/11/2018-11-13-events.json"

In [6]:
# Explicit schema for the event-log JSON records, so Spark does not have to
# infer column types. Every field is nullable.
json_schema = StructType([
    StructField("artist", StringType(), True),
    StructField("auth", StringType(), True),
    StructField("firstName", StringType(), True),
    StructField("gender", StringType(), True),
    # IntegerType rather than ByteType: a byte only holds -128..127, which a
    # per-session item counter can exceed in a long session.
    StructField("itemInSession", IntegerType(), True),
    StructField("lastName", StringType(), True),
    StructField("length", DoubleType(), True),
    StructField("level", StringType(), True),
    StructField("location", StringType(), True),
    StructField("method", StringType(), True),
    StructField("page", StringType(), True),
    StructField("registration", DoubleType(), True),
    StructField("sessionId", IntegerType(), True),
    StructField("song", StringType(), True),
    StructField("status", IntegerType(), True),
    # Epoch time in milliseconds (it is divided by 1000 when converted later).
    StructField("ts", LongType(), True),
    StructField("userAgent", StringType(), True),
    StructField("userId", StringType(), True),
])

In [7]:
# Load the event log with the explicit schema rather than inferring one.
df_events = spark.read.schema(json_schema).json(EVENT_DATA)

IllegalArgumentException: For input string: "64M"

# Debugging 64M issue

### Trying a simple csv file.  This came from the Udacity notes and has been suggested by the mentors

In [8]:
# Smoke test against a small semicolon-separated CSV, letting Spark infer types.
df = (
    spark.read
    .options(sep=";", inferSchema=True, header=True)
    .csv("s3a://udacity-dend/pagila/payment/payment.csv")
)

IllegalArgumentException: For input string: "64M"

### Trying the same without inferring the schema

In [9]:
# Same CSV again, this time without schema inference.
df = (
    spark.read
    .option("sep", ";")
    .option("header", True)
    .csv("s3a://udacity-dend/pagila/payment/payment.csv")
)

IllegalArgumentException: For input string: "64M"

### Trying a newer version of hadoop-aws (as per [this stack post](https://stackoverflow.com/questions/74017684/illegalargumentexception-creating-spark-session))
I am not totally sure how this works.  Looking [here](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop.html) I think the latest version of hadoop-aws compatible with emr-5.36.0 (which I am using) is 2.10.1.  **I have tried that**.  I have also tried various sub-versions of 3.3.  **None of this makes any difference**.

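Note that I have also tried the nonsense value `org.apache.hadoop:hadoop-aws:999.999.999` - which **does not** bomb out but just gives the same 64M error.  So I am not certain that altering this value is actually doing anything anyway.  A likely explanation: `getOrCreate()` returns the session that is already running, and `spark.jars.packages` is only resolved when the JVM is first launched, so changing the value without restarting the kernel has no effect.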

In [15]:
# getOrCreate() hands back the session that is already running, so a changed
# `spark.jars.packages` has no effect - which is why even a non-existent
# version such as 999.999.999 neither failed to resolve nor changed the error.
# Stop the old session first and request a hadoop-aws build that matches the
# Hadoop jars Spark runs with.
# NOTE(review): 3.3.2 is assumed to match the Hadoop bundled with pyspark
# 3.3.1 - confirm. Packages are resolved when the JVM starts, so if the error
# persists, restart the kernel so the first session built picks up this value.
spark.stop()
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.2")
    .getOrCreate()
)

df = spark.read.csv("s3a://udacity-dend/pagila/payment/payment.csv", sep=";", header=True)

IllegalArgumentException: For input string: "64M"

### Cluster  Setup - see repo for screenshots

In [None]:
# Number of event rows loaded from EVENT_DATA.
df_events.count()

In [None]:
# Print the column names and types of the event DataFrame.
df_events.printSchema()

In [None]:
# Preview two rows as a pandas DataFrame for readable notebook display.
df_events.limit(2).toPandas()

### Limit df to song plays

In [None]:
# Keep only the rows whose page is 'NextSong'.
df_events = df_events.where(F.col("page") == "NextSong")

### Convert ts to timestamp

In [None]:
# with reference to https://stackoverflow.com/questions/53537226/pyspark-from-unixtime-not-showing-the-correct-datetime
#
# `ts` is in epoch milliseconds. from_unixtime() works on whole seconds and
# returns a formatted string, so it drops the millisecond part and does not
# produce a timestamp column. Casting the seconds value (as a double) to
# timestamp keeps the fractional seconds and yields a real TimestampType.
df_events = df_events.withColumn(
    "timeStamp",
    (F.col("ts") / 1000).cast("timestamp"),
)

In [None]:
# Preview two rows to check the newly added timeStamp column.
df_events.limit(2).toPandas()

# Import Artists

In [7]:
# Location of the song files; only the A/A/A prefix of song_data is read.
SONG_DATA = "s3a://udacity-dend/song_data/A/A/A/"

In [8]:
# Read the song files with an inferred schema to explore their structure.
# The event-log `json_schema` does not describe these records, and a read that
# used it was immediately overwritten, so that redundant read has been dropped.
df_artists = spark.read.json(SONG_DATA)

IllegalArgumentException: For input string: "64M"

In [8]:
# Number of rows read from SONG_DATA.
df_artists.count()

NameError: name 'df_artists' is not defined

In [None]:
# Print the column names and types of the song-file DataFrame.
df_artists.printSchema()

In [None]:
# Preview five rows as a pandas DataFrame.
df_artists.limit(5).toPandas()

In [None]:
# Schema listing every field of a song-file record (artist and song columns).
# All fields are nullable.
# NOTE(review): later cells rebind `artist_json_schema` to a narrower field list.
artist_json_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("artist_id", StringType()),
        ("artist_latitude", DoubleType()),
        ("artist_location", StringType()),
        ("artist_longitude", DoubleType()),
        ("artist_name", StringType()),
        ("duration", DoubleType()),
        ("num_songs", IntegerType()),
        ("song_id", StringType()),
        ("title", StringType()),
        ("year", IntegerType()),
    )
])

In [11]:
# Schema restricted to the artist fields of a song-file record; all nullable.
artist_json_schema = (
    StructType()
    .add("artist_id", StringType(), True)
    .add("artist_latitude", DoubleType(), True)
    .add("artist_location", StringType(), True)
    .add("artist_longitude", DoubleType(), True)
    .add("artist_name", StringType(), True)
)

In [11]:
# Schema restricted to the artist fields of a song-file record; all nullable.
# Latitude and longitude are numeric coordinates, so they are typed as doubles
# (consistent with the other artist schema definitions in this notebook)
# instead of strings.
artist_json_schema = StructType([
    StructField("artist_id", StringType(), True),
    StructField("artist_latitude", DoubleType(), True),
    StructField("artist_location", StringType(), True),
    StructField("artist_longitude", DoubleType(), True),
    StructField("artist_name", StringType(), True),
])

In [12]:
# Read the song files, keeping only the fields named in the artist schema.
df_artists = spark.read.schema(artist_json_schema).json(SONG_DATA)

IllegalArgumentException: For input string: "64M"

In [None]:
# Number of artist rows before duplicates are removed.
df_artists.count()

In [None]:
# De-duplicate the artist rows. This previously read from `df_songs`, which is
# not defined until a later cell and holds song columns, not artist columns.
df_artists = df_artists.drop_duplicates()

In [None]:
# Row count after de-duplication, for comparison with the count taken before.
df_artists.count()

In [None]:
# Print the column names and types of the artist DataFrame.
df_artists.printSchema()

In [None]:
# Preview five artist rows as a pandas DataFrame.
df_artists.limit(5).toPandas()

# Import Songs

In [9]:
# Schema restricted to the song fields of a song-file record; all nullable.
song_json_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("duration", DoubleType()),
        ("num_songs", IntegerType()),
        ("song_id", StringType()),
        ("title", StringType()),
        ("year", IntegerType()),
    )
])

In [10]:
# Read the song files, keeping only the fields named in the song schema.
df_songs = spark.read.schema(song_json_schema).json(SONG_DATA)

IllegalArgumentException: For input string: "64M"

In [None]:
# Number of song rows before duplicates are removed.
df_songs.count()

In [None]:
# Remove rows that are exact duplicates across all song columns.
df_songs = df_songs.dropDuplicates()

In [None]:
# Row count after de-duplication, for comparison with the count taken before.
df_songs.count()

In [None]:
# Print the column names and types of the song DataFrame.
df_songs.printSchema()

In [None]:
# Preview five song rows as a pandas DataFrame.
df_songs.limit(5).toPandas()