# Part I. ETL Pipeline for Pre-Processing the Files

## PLEASE RUN THE FOLLOWING CODE FOR PRE-PROCESSING THE FILES

In [None]:
import csv
import glob
import json
import os
import re
from pathlib import Path

import cassandra
import numpy as np
import pandas as pd

#### Creating list of filepaths to process original event csv data files

In [None]:
# Collect every event CSV below event_data/, nested sub-directories included.
dataset_path = Path.cwd() / 'event_data'
# Start from an empty list so the name is always bound, even when the directory
# is missing and os.walk yields nothing.
dataset_files = []
for root, _, _ in os.walk(dataset_path):
    # Accumulate per directory; re-assigning here would keep only the files of
    # the last directory that os.walk happens to visit.
    dataset_files.extend(Path(root).glob('*.csv'))
# One global sort gives a deterministic processing order across directories.
dataset_files.sort()

#### Processing the files to create the data file csv that will be used for Apache Cassandra tables

In [None]:
# Step 1: read every event file and keep its data rows (header excluded) in memory.
full_data_rows_list = []
for f in dataset_files:
    with open(f, 'r', encoding='utf8', newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        # The None default keeps a completely empty file from raising StopIteration.
        next(csv_reader, None)  # skips CSV header line
        full_data_rows_list.extend(csv_reader)

# Positions of the raw-event columns that are copied to the combined file, in
# output order; they line up one-to-one with the header written below.
source_columns = (0, 2, 3, 4, 5, 6, 7, 8, 12, 13, 16)

csv.register_dialect('events', quoting=csv.QUOTE_ALL, skipinitialspace=True)
# Step 2: join all complete rows from the separate CSVs into one single CSV file.
with open('event_datafile_new.csv', 'w', encoding='utf8', newline='') as f:
    writer = csv.writer(f, dialect='events')
    writer.writerow([
        'artist',
        'firstName',
        'gender',
        'itemInSession',
        'lastName',
        'length',
        'level',
        'location',
        'sessionId',
        'song',
        'userId',
    ])
    for row in full_data_rows_list:
        # csv.reader yields [] for a blank line; those rows, and rows whose
        # first column (artist) is empty, are left out of the combined file.
        if not row or row[0] == '':
            continue
        writer.writerow([row[index] for index in source_columns])


In [None]:
# Sanity check: print the number of lines in the combined CSV file.
with open('event_datafile_new.csv', 'r', encoding='utf8') as f:
    print(len(f.readlines()))

# Part II. Complete the Apache Cassandra coding portion of your project. 

## Now you are ready to work with the CSV file titled <font color=red>event_datafile_new.csv</font>, located within the Workspace directory.  The event_datafile_new.csv contains the following columns: 
- artist 
- firstName of user
- gender of user
- item number in session
- last name of user
- length of the song
- level (the user's subscription plan: paid or free)
- location of the user
- sessionId
- song title
- userId

The image below is a screenshot of what the denormalized data should look like in the <font color=red>**event_datafile_new.csv**</font> after the code above is run:<br>

<img src="images/image_event_datafile_new.jpg">

In [None]:
from cassandra.cluster import Cluster

# Cluster() is created without arguments, so the driver's default contact point is
# used (presumably a Cassandra node on the local machine — confirm for your setup).
cluster = Cluster()
# The session returned by connect() is the handle used for every CQL statement
# in the rest of this notebook.
session = cluster.connect()

In [None]:
# Create the "sparkify" keyspace unless it already exists. It uses SimpleStrategy
# with a replication factor of 1.
# NOTE(review): that looks intended for a single-node development setup — revisit
# the replication settings before pointing this at a multi-node cluster.
create_keyspace_query = """
CREATE KEYSPACE IF NOT EXISTS sparkify
    WITH REPLICATION = { 
        'class' : 'SimpleStrategy', 
        'replication_factor' : 1 
    };
"""
session.execute(create_keyspace_query)

In [None]:
# Make "sparkify" the session's keyspace; the statements that follow refer to
# their tables without a keyspace prefix.
session.set_keyspace('sparkify')


In [None]:
# Table 01 is partitioned by session_id; rows inside a session are clustered by
# session_item and then user_id. First and last name are stored separately.
create_table_01 = """
CREATE TABLE IF NOT EXISTS songplay_events_by_session (
    session_id INT,
    session_item INT,
    artist_name TEXT,
    song_title TEXT,
    song_length FLOAT,
    user_id INT,
    user_first_name TEXT,
    user_last_name TEXT,
    user_gender TEXT,
    user_location TEXT,
    user_plan TEXT,
    PRIMARY KEY ((session_id), session_item, user_id)
);
"""

# Table 02 is partitioned by user_id; rows are clustered by session_id and then
# session_item. The user's name is stored as a single user_full_name column.
create_table_02 = """
CREATE TABLE IF NOT EXISTS songplay_events_by_user (
    session_id INT,
    session_item INT,
    artist_name TEXT,
    song_title TEXT,
    song_length FLOAT,
    user_id INT,
    user_full_name TEXT,
    user_gender TEXT,
    user_location TEXT,
    user_plan TEXT,
    PRIMARY KEY ((user_id), session_id, session_item)
);
"""

# Table 03 has the same columns as table 02 but is partitioned by song_title,
# with session_id, session_item and user_id as clustering columns.
create_table_03 = """
CREATE TABLE IF NOT EXISTS songplay_events_by_song (
    session_id INT,
    session_item INT,
    artist_name TEXT,
    song_title TEXT,
    song_length FLOAT,
    user_id INT,
    user_full_name TEXT,
    user_gender TEXT,
    user_location TEXT,
    user_plan TEXT,
    PRIMARY KEY ((song_title), session_id, session_item, user_id)
);
"""

# Run the three DDL statements in their original order.
for ddl in (create_table_01, create_table_02, create_table_03):
    session.execute(ddl)

In [None]:
# Combined CSV file whose rows are loaded into the tables.
filename = 'event_datafile_new.csv'

# INSERT for songplay_events_by_session: eleven bind markers, with the user's
# first and last name going into separate columns.
insert_query = """
INSERT INTO songplay_events_by_session (
    session_id, 
    session_item, 
    artist_name, 
    song_title, 
    song_length, 
    user_id, 
    user_first_name, 
    user_last_name, 
    user_gender, 
    user_location, 
    user_plan
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
"""
# Prepared once here so it can be executed repeatedly with different values.
insert_stmt_table_01 = session.prepare(insert_query)

# INSERT template for the tables that store a single user_full_name column: ten
# bind markers. The {} placeholder is filled with a hard-coded table name below.
insert_query = """
INSERT INTO {} (
    session_id, 
    session_item, 
    artist_name, 
    song_title, 
    song_length, 
    user_id, 
    user_full_name, 
    user_gender, 
    user_location, 
    user_plan
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
"""
# Same template, one prepared statement per target table.
insert_stmt_table_02 = session.prepare(insert_query.format('songplay_events_by_user'))
insert_stmt_table_03 = session.prepare(insert_query.format('songplay_events_by_song'))

# Load every row of the combined CSV into the three tables.
# newline='' is what the csv module asks for when a file object is handed to
# csv.reader (and is how the other CSV files in this notebook are opened); it keeps
# line breaks embedded in quoted fields from being altered before parsing.
with open(filename, encoding='utf8', newline='') as f:
    csv_reader = csv.reader(f)
    next(csv_reader) # skips CSV header line
    for line in csv_reader:
        # Convert each field once. Column positions follow the CSV header:
        # artist, firstName, gender, itemInSession, lastName, length, level,
        # location, sessionId, song, userId.
        session_id = int(line[8])
        session_item = int(line[3])
        artist_name = line[0]
        song_title = line[9]
        song_length = float(line[5])
        user_id = int(line[10])
        first_name = line[1]
        last_name = line[4]
        gender = line[2]
        location = line[7]
        plan = line[6]

        # Table 01 keeps first and last name in separate columns.
        row = [
            session_id,
            session_item,
            artist_name,
            song_title,
            song_length,
            user_id,
            first_name,
            last_name,
            gender,
            location,
            plan,
        ]
        session.execute(insert_stmt_table_01, row)

        # tables 02 and 03 have a slightly different schema
        row = [
            session_id,
            session_item,
            artist_name,
            song_title,
            song_length,
            user_id,
            '{} {}'.format(first_name, last_name), # first and last name
            gender,
            location,
            plan,
        ]
        session.execute(insert_stmt_table_02, row)
        session.execute(insert_stmt_table_03, row)

In [None]:
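# NOTE: the row-count checks below are intentionally left commented out. A COUNT(*)
# without a partition key in the WHERE clause has to read the whole table, which is
# discouraged in Cassandra; enable them only for ad-hoc verification on small data.

# result = session.execute('SELECT COUNT(*) FROM songplay_events_by_session;') # bad: full table scan
# print(result.one().count)

# result = session.execute('SELECT COUNT(*) FROM songplay_events_by_user;') # bad: full table scan
# print(result.one().count)

# result = session.execute('SELECT COUNT(*) FROM songplay_events_by_song;') # bad: full table scan
# print(result.one().count)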


In [None]:
# Query 1: artist, song title and song length for one (session_id, session_item) pair.
query_01 = """
SELECT artist_name, song_title, song_length
FROM songplay_events_by_session
WHERE session_id = ? AND session_item = ?;
"""
query_01_stmt = session.prepare(query_01)
result = session.execute(query_01_stmt, [338, 4])
row = result.one()
# one() gives back None when the query matched nothing; say so instead of failing
# with an AttributeError on the attribute lookups.
if row is None:
    print('no matching songplay found\n')
else:
    print('artist: {}\nsong: {}\nlength: {:.2f}\n'.format(row.artist_name, row.song_title, row.song_length))


In [None]:
# Query 2: artist, song title and user's full name for every songplay of one
# user within one session.
query_02 = """
SELECT artist_name, song_title, user_full_name
FROM songplay_events_by_user
WHERE user_id = ? AND session_id = ?;
"""
query_02_stmt = session.prepare(query_02)
# Bind values, in marker order: user_id, session_id.
query_02_params = [10, 182]
result = session.execute(query_02_stmt, query_02_params)
rows = result.all()
# One three-line entry per songplay, each followed by an empty line.
entry_format = 'artist: {}\nsong: {}\nuser: {}\n'
for row in rows:
    entry = entry_format.format(row.artist_name, row.song_title, row.user_full_name)
    print(entry)

In [None]:
# Query 3: full name of every user recorded as having played the given song.
query_03 = """
SELECT user_full_name 
FROM songplay_events_by_song
WHERE song_title = ?;
"""
query_03_stmt = session.prepare(query_03)
query_03_params = ['All Hands Against His Own']
result = session.execute(query_03_stmt, query_03_params)
rows = result.all()
# One name per line, each followed by an empty line.
for row in rows:
    listener = row.user_full_name
    print('{}\n'.format(listener))

In [None]:
# NOTE: This is an alternative version of table 03 using a materialized view (experimental)
create_view = """
CREATE MATERIALIZED VIEW IF NOT EXISTS songplay_events_by_song_mv AS 
    SELECT song_title, artist_name, user_id, user_full_name, session_id, session_item 
    FROM songplay_events_by_user 
    WHERE song_title IS NOT NULL AND session_id IS NOT NULL AND session_item IS NOT NULL AND user_id IS NOT NULL
    PRIMARY KEY ((song_title), user_id, session_id, session_item);
"""
session.execute(create_view)                

In [None]:
# Query 3 (alternative): the same lookup by song title, read from the
# materialized view instead of the dedicated table.
query_03_alt = """
SELECT user_full_name 
FROM songplay_events_by_song_mv
WHERE song_title = ?;
"""
# The query_03_stmt name is deliberately rebound to the view-based statement.
query_03_stmt = session.prepare(query_03_alt)
query_03_alt_params = ['All Hands Against His Own']
result = session.execute(query_03_stmt, query_03_alt_params)
rows = result.all()
# One name per line, each followed by an empty line.
for row in rows:
    listener = row.user_full_name
    print('{}\n'.format(listener))

In [36]:
# Clean-up: remove everything this notebook created in the keyspace.
# The materialized view is listed before songplay_events_by_user, the table it
# selects from, so the view is gone by the time its base table is dropped
# (presumably required by Cassandra — keep this order).
drop_stmt = [
    'DROP TABLE IF EXISTS songplay_events_by_session;',
    'DROP TABLE IF EXISTS songplay_events_by_song;',
    'DROP MATERIALIZED VIEW IF EXISTS songplay_events_by_song_mv;',
    'DROP TABLE IF EXISTS songplay_events_by_user;'
]
# Statements are executed one at a time, in list order.
for stmt in drop_stmt:
    session.execute(stmt)

In [37]:
# Close the session first, then the cluster object it was created from.
session.shutdown()
cluster.shutdown()