In [None]:
import json
import boto3
import pandas as pd
from datetime import datetime
from io import StringIO

def album(all_info):
    """Extract album metadata from playlist item entries.

    Args:
        all_info: Iterable of item dicts. Each item holds a ``'track'`` dict
            whose ``'album'`` dict provides ``id``, ``name``,
            ``release_date``, ``total_tracks`` and
            ``external_urls['spotify']``.

    Returns:
        A list with one dict per usable item, keyed by ``album_id``,
        ``album_name``, ``release_date``, ``total_tracks`` and ``url``.
        Items whose ``'track'`` is ``None`` are skipped. Duplicates are not
        removed here.

    Raises:
        KeyError: If an item or its album lacks one of the expected keys.
    """
    album_info = []
    for entry in all_info:
        track = entry['track']
        if track is None:
            # No track payload means there is no album to extract; skipping
            # keeps one unusable entry from aborting the whole batch.
            continue
        details = track['album']
        album_info.append({
            'album_id': details['id'],
            'album_name': details['name'],
            'release_date': details['release_date'],
            'total_tracks': details['total_tracks'],
            'url': details['external_urls']['spotify'],
        })
    return album_info

def artist(all_info):
    """Extract the first-listed artist of each playlist item entry.

    Args:
        all_info: Iterable of item dicts. Each item holds a ``'track'`` dict
            whose ``'artists'`` list contains dicts with ``id``, ``name``
            and ``external_urls['spotify']``.

    Returns:
        A list with one dict per usable item, keyed by ``artist_id``,
        ``artist_name`` and ``external_url``. Only ``artists[0]`` is used;
        any further artists on the track are ignored. Items whose
        ``'track'`` is ``None`` are skipped.

    Raises:
        KeyError: If an item or its artist lacks one of the expected keys.
        IndexError: If a track has an empty ``'artists'`` list.
    """
    artist_info = []
    for entry in all_info:
        track = entry['track']
        if track is None:
            # No track payload means there is no artist to extract.
            continue
        lead = track['artists'][0]
        artist_info.append({
            'artist_id': lead['id'],
            'artist_name': lead['name'],
            'external_url': lead['external_urls']['spotify'],
        })
    return artist_info

def song(all_info):
    """Extract song-level fields from playlist item entries.

    Args:
        all_info: Iterable of item dicts. Each item holds ``'added_at'`` and
            a ``'track'`` dict providing ``id``, ``name``, ``duration_ms``,
            ``href``, ``popularity``, ``album['id']`` and
            ``artists[0]['id']``.

    Returns:
        A list with one dict per usable item, keyed by ``song_id``,
        ``song_name``, ``song_duration`` (taken from ``duration_ms``),
        ``song_url`` (taken from ``href``), ``song_popularity``,
        ``song_added``, ``album_id`` and ``artist_id`` (first-listed artist
        only). Items whose ``'track'`` is ``None`` are skipped.

    Raises:
        KeyError: If an item or its track lacks one of the expected keys.
        IndexError: If a track has an empty ``'artists'`` list.
    """
    song_info = []
    for entry in all_info:
        track = entry['track']
        if track is None:
            # No track payload means there is nothing to describe.
            continue
        song_info.append({
            'song_id': track['id'],
            'song_name': track['name'],
            'song_duration': track['duration_ms'],
            'song_url': track['href'],
            'song_popularity': track['popularity'],
            'song_added': entry['added_at'],
            'album_id': track['album']['id'],
            'artist_id': track['artists'][0]['id'],
        })
    return song_info

def _upload_csv(s3, bucket, key, frame):
    """Serialize *frame* to CSV without the index column and upload it to S3."""
    buffer = StringIO()
    frame.to_csv(buffer, index=False)
    s3.put_object(Bucket=bucket, Key=key, Body=buffer.getvalue())


def lambda_handler(event, context):
    """Transform raw playlist JSON files in S3 into album/artist/song CSVs.

    Steps:
      1. Read every ``.json`` object under ``raw_data/unprocessed_data/``.
      2. For each file, flatten its ``'items'`` with ``album``, ``artist``
         and ``song``, de-duplicate by id, parse the date columns, and
         write three CSVs under ``transformed_data/``.
      3. Move every source file to ``raw_data/processed_data/``
         (copy, then delete the original).

    The move happens only after all files were transformed, so a failure
    during step 2 leaves every raw file in place for a retry.

    Args:
        event: Lambda invocation event (not used).
        context: Lambda runtime context (not used).

    Returns:
        None.
    """
    s3 = boto3.client('s3')
    bucket = 'spotify-etl-pipeline-sudeshna'
    prefix = 'raw_data/unprocessed_data/'

    # Keep each key together with its parsed payload so the two can never
    # drift out of step. Listing is paginated because a single list call
    # returns at most 1000 keys.
    raw_files = []
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # 'Contents' is absent from the response when nothing matches the
        # prefix; treat that as "no files" instead of raising KeyError.
        for obj in page.get('Contents', []):
            file_key = obj['Key']
            if not file_key.endswith('.json'):
                continue
            response = s3.get_object(Bucket=bucket, Key=file_key)
            raw_files.append((file_key, json.loads(response['Body'].read())))

    for file_key, data in raw_files:
        # Base name up to the first '.', e.g. '.../spotify_raw_1.json' ->
        # 'spotify_raw_1'; used as the suffix of each output file name.
        stem = file_key.split('/')[-1].split('.')[0]
        items = data['items']

        album_df = pd.DataFrame(album(items)).drop_duplicates(subset=['album_id'])
        artist_df = pd.DataFrame(artist(items)).drop_duplicates(subset=['artist_id'])
        song_df = pd.DataFrame(song(items)).drop_duplicates(subset=['song_id'])

        # NOTE(review): release_date values of differing precision (e.g.
        # year-only next to full dates) may not parse uniformly, depending on
        # the pandas version deployed - confirm.
        album_df['release_date'] = pd.to_datetime(album_df['release_date'])
        song_df['song_added'] = pd.to_datetime(song_df['song_added'])

        _upload_csv(
            s3, bucket,
            "transformed_data/album_data/album_info_" + stem + ".csv",
            album_df,
        )
        _upload_csv(
            s3, bucket,
            "transformed_data/artist_data/artist_info_" + stem + ".csv",
            artist_df,
        )
        _upload_csv(
            s3, bucket,
            "transformed_data/song_data/song_info_" + stem + ".csv",
            song_df,
        )

    # S3 has no rename operation, so "move" is copy followed by delete.
    for file_key, _ in raw_files:
        s3.copy_object(
            CopySource={'Bucket': bucket, 'Key': file_key},
            Bucket=bucket,
            Key='raw_data/processed_data/' + file_key.split('/')[-1],
        )
        s3.delete_object(Bucket=bucket, Key=file_key)

    

    


