In [102]:
# Dependencies
import csv
import pandas as pd
import os
import numpy as np
from datetime import datetime

In [103]:
# Read csv
# The path is relative, so it resolves against the working directory the
# notebook kernel was started in.
csvpath = os.path.join('Desktop', 'Project-2', 'spotify_data.csv')

# Decode the file as ISO-8859-1 (Latin-1) while loading it into a dataframe.
spotify_df = pd.read_csv(csvpath, encoding="ISO-8859-1")

In [104]:
# Normalize every chart row by the population of its region: streams divided by
# population, scaled to "per one million people" and rounded to two decimals.
spotify_df["Streams per 1 million"] = (
    spotify_df["Streams"]
    .div(spotify_df["Population"])
    .mul(1000000)
    .round(2)
)

In [105]:
# Drop the obsolete "Stream % of Pop" column.
# errors='ignore' keeps this cell safe to re-run: a plain `del` raises KeyError
# on the second execution because the column has already been removed.
spotify_df = spotify_df.drop(columns=['Stream % of Pop'], errors='ignore')

In [106]:
# Reserve an empty (all-NaN) "Number One" column intended to flag number one hits.
spotify_df = spotify_df.assign(**{'Number One': np.nan})

In [107]:
# Ensure both the chart date and the track release date are stored as pandas
# datetime values rather than whatever type the CSV load produced.
for date_column in ('Date', 'RELEASE'):
    spotify_df[date_column] = pd.to_datetime(spotify_df[date_column])

In [108]:
# Add weekday and weeknumber columns.
# `Series.dt.weekday_name` was removed in pandas 1.0 and `Series.dt.week` in
# pandas 2.0; `day_name()` and `isocalendar().week` are their replacements.
spotify_df["Weekday"] = spotify_df["Date"].dt.day_name()
# ISO-8601 week number: dates in early January can fall in the last week of the
# previous ISO year (2017-01-01 is reported as week 52).
spotify_df["Weeknumber"] = spotify_df["Date"].dt.isocalendar().week

In [109]:
# Preview the first five rows to check the columns added above.
spotify_df.head(n=5)

Unnamed: 0,Date,Position,Streams,Track Name,Artist,Region,URL,RELEASE,BPM,ENERGY,...,VALENCE,LENGTH,ACOUSTIC,POP.,Population,Country Primary Language,Streams per 1 million,Number One,Weekday,Weeknumber
0,2017-01-01,1,19272,Reggaetn Lento (Bailemos),CNCO,Ecuador,https://open.spotify.com/track/3AEZUABDXNtecAO...,2016-08-26,94,84,...,71,0.154861,40,81,16290913,Spanish,1182.99,,Sunday,52
1,2017-01-01,2,19270,Chantaje,Shakira,Ecuador,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,2017-05-26,102,77,...,91,0.136111,19,83,16290913,Spanish,1182.87,,Sunday,52
2,2017-01-01,3,15761,Otra Vez (feat. J Balvin),Zion & Lennox,Ecuador,https://open.spotify.com/track/3QwBODjSEzelZyV...,2016-09-30,96,77,...,70,0.145139,6,81,16290913,Spanish,967.47,,Sunday,52
3,2017-01-01,4,14954,Vente Pa' Ca,Ricky Martin,Ecuador,https://open.spotify.com/track/7DM4BPaS7uofFul...,2016-09-23,100,92,...,53,0.179861,0,80,16290913,Spanish,917.94,,Sunday,52
4,2017-01-01,5,14269,Safari,J Balvin,Ecuador,https://open.spotify.com/track/6rQSrBHf7HlZjtc...,2016-06-24,180,69,...,63,0.143056,53,73,16290913,Spanish,875.89,,Sunday,52


In [110]:
# Chile, Estonia, and Latvia are missing dates in their data, so build a new
# dataframe that leaves those three regions out.
regions_to_exclude = ['Chile', 'Estonia', 'Latvia']
spotify_reduced_df = spotify_df.loc[~spotify_df['Region'].isin(regions_to_exclude)]

In [111]:
# Order the rows by date, then region, then chart position.
sort_keys = ['Date', 'Region', 'Position']
spotify_sorted_df = spotify_reduced_df.sort_values(by=sort_keys)

In [112]:
# Discard the index labels left over from before the sort and renumber the
# rows from 0 in their sorted order.
spotify_week_day_df = (
    spotify_sorted_df
    .reset_index(drop=True)
)

In [113]:
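# TODO: Remove dates that give us a rollover of week numbers
# (for example, 2017-01-01 is reported as week 52).
# The draft below is disabled and cannot run as written: `datetime` is the class
# imported with `from datetime import datetime`, so `datetime.date(year=...)` does
# not build a date, and the dataframe is indexed by row number rather than by date.
# spotify_week_day_df.loc[datetime.date(year=2017,month=1,day=2):datetime.date(year=2017,month=12,day=31)]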

In [114]:
# Preview the first five rows of the sorted, re-indexed dataframe.
spotify_week_day_df.head(n=5)

Unnamed: 0,Date,Position,Streams,Track Name,Artist,Region,URL,RELEASE,BPM,ENERGY,...,VALENCE,LENGTH,ACOUSTIC,POP.,Population,Country Primary Language,Streams per 1 million,Number One,Weekday,Weeknumber
0,2017-01-01,1,253019,Chantaje,Shakira,Argentina,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,2017-05-26,102,77,...,91,0.136111,19,83,44293293,Spanish,5712.35,,Sunday,52
1,2017-01-01,2,223988,Vente Pa' Ca,Ricky Martin,Argentina,https://open.spotify.com/track/7DM4BPaS7uofFul...,2016-09-23,100,92,...,53,0.179861,0,80,44293293,Spanish,5056.93,,Sunday,52
2,2017-01-01,3,210943,Reggaetn Lento (Bailemos),CNCO,Argentina,https://open.spotify.com/track/3AEZUABDXNtecAO...,2016-08-26,94,84,...,71,0.154861,40,81,44293293,Spanish,4762.41,,Sunday,52
3,2017-01-01,4,173865,Safari,J Balvin,Argentina,https://open.spotify.com/track/6rQSrBHf7HlZjtc...,2016-06-24,180,69,...,63,0.143056,53,73,44293293,Spanish,3925.31,,Sunday,52
4,2017-01-01,5,153956,Shaky Shaky,Daddy Yankee,Argentina,https://open.spotify.com/track/58IL315gMSTD37D...,2016-04-08,88,63,...,86,0.1625,6,68,44293293,Spanish,3475.83,,Sunday,52


In [115]:
# Total the raw streams and the population-normalized streams for each day.
# Weekday and Weeknumber are both derived from Date, so listing them as group
# keys does not split the groups further; it only carries them into the result.
# Both aggregations are named with the string 'sum': passing the builtin `sum`
# callable to a groupby aggregation is deprecated in recent pandas releases.
# The list form is kept on purpose so the two-level column header is unchanged.
spotify_by_day_df = (
    spotify_week_day_df
    .groupby(["Date", "Weekday", "Weeknumber"])
    .agg({'Streams': ['sum'], 'Streams per 1 million': ['sum']})
    .reset_index()
)

In [116]:
# Save the per-day summary table as "spotify_summary.csv".
# The path is assembled with os.path.join, the same way the input CSV path is
# built, and is relative to the notebook's working directory.
summary_path = os.path.join('Desktop', 'Project-2', 'spotify_summary.csv')
spotify_by_day_df.to_csv(summary_path, index=False, header=True)