In [1]:
# Add the dependencies
import numpy as np
import pandas as pd
import os
from datetime import datetime

In [2]:
# Path to the combined EDC set-times CSV, built relative to the working directory
edc_data_raw = os.path.join(
    "Resources",
    "EDC LA_LV 2009-2024 Set Times - Combined V2.csv",
)

In [3]:
# Read the raw set-times CSV into a dataframe, then preview the first ten rows
edc_data_df = pd.read_csv(filepath_or_buffer=edc_data_raw)
edc_data_df.head(n=10)

Unnamed: 0,Year,Date,Day,Location,Stage,Time,Artist
0,2009,"June 26, 2009",Friday,Los Angeles,kineticFIELD,3pm-4pm,Joaquin Bamaca
1,2009,"June 26, 2009",Friday,Los Angeles,kineticFIELD,4pm-6pm,Aly & Fila
2,2009,"June 26, 2009",Friday,Los Angeles,kineticFIELD,6pm-8pm,Fedde Le Grand
3,2009,"June 26, 2009",Friday,Los Angeles,kineticFIELD,8pm-9:30pm,Eddie Halliwell
4,2009,"June 26, 2009",Friday,Los Angeles,kineticFIELD,9:45pm-10:45pm,Thievery Corporation
5,2009,"June 26, 2009",Friday,Los Angeles,kineticFIELD,11pm-12:30am,ATB
6,2009,"June 26, 2009",Friday,Los Angeles,kineticFIELD,12:30am-2am,Paul Oakenfold
7,2009,"June 26, 2009",Friday,Los Angeles,circuitGROUNDS,4pm-5:30pm,Manufactured Superstars
8,2009,"June 26, 2009",Friday,Los Angeles,circuitGROUNDS,5:30pm-7:15pm,Patricio
9,2009,"June 26, 2009",Friday,Los Angeles,circuitGROUNDS,7:15pm-8:15pm,Daedalus


In [4]:
# Sanity-check the load: nulls per column, non-null cells per column, column dtypes
print(f"Nulls:\n{edc_data_df.isnull().sum()} \n") # no nulls
# count() reports non-null cells per column; with no nulls this equals the row count
print(f"Row count:\n{edc_data_df.count()} \n") # 3111 rows
print(f"Data Types:\n{edc_data_df.dtypes}")

Nulls:
Year        0
Date        0
Day         0
Location    0
Stage       0
Time        0
Artist      0
dtype: int64 

Row count:
Year        3111
Date        3111
Day         3111
Location    3111
Stage       3111
Time        3111
Artist      3111
dtype: int64 

Data Types:
Year         int64
Date        object
Day         object
Location    object
Stage       object
Time        object
Artist      object
dtype: object


In [6]:
# Print the distinct values of each categorical column to spot inconsistencies.
# Days and Locations showed no issues on inspection.
columns_to_inspect = [
    ("Years", "Year"),
    ("Dates", "Date"),
    ("Days", "Day"),
    ("Locations", "Location"),
    ("Stages", "Stage"),
]
for label, column in columns_to_inspect:
    print(f"{label}:\n{set(edc_data_df[column])} \n")

Years:
{2016, 2017, 2018, 2019, 2021, 2022, 2023, 2024, 2009, 2010, 2011, 2012, 2013, 2014, 2015} 

Dates:
{'October 22, 2021', 'June 21, 2014', 'May 17, 2024', 'May 20, 2018', 'October 24, 2021', 'June 22, 2014', 'June 8, 2012', 'June 24, 2011', 'June 17, 2017', 'May 19, 2024', 'May 18, 2019', 'June 23, 2013', 'June 22, 2013', 'June 20, 2014', 'May 18, 2024', 'June 19, 2015', 'May 19, 2019', 'June 26, 2010', 'May 20, 2022', 'May 21, 2023', 'June 26, 2011', 'June 19, 2016', 'May 19, 2018', 'June 18, 2017', 'May 17, 2019', 'June 9, 2012', 'June 20, 2015', 'June 18, 2016', 'May 21, 2022', 'June 25, 2011', 'June 21, 2013', 'June 25, 2010', 'June 21, 2015', 'May 20, 2023', 'June 17, 2016', 'May 19, 2023', 'May 18, 2018', 'June 26, 2009', 'June 27, 2009', 'June 16, 2017', 'May 22, 2022', 'June 10, 2012', 'October 23, 2021'} 

Days:
{'Saturday', 'Sunday', 'Friday'} 

Locations:
{'Las Vegas', 'Los Angeles'} 

Stages:
{'neonGarden hosted by La La Land', 'neonGARDEN', 'stage 7', 'stage7', 'neon

In [7]:
# Work on an independent (deep) copy so the loaded dataframe stays untouched
edc_data_dfV2 = edc_data_df.copy(deep=True)

In [8]:
# Split each Stage value at its first space: the leading word becomes 'stage_name'
# and whatever follows (if anything) becomes 'stage_name_ext'.
stage_parts = edc_data_dfV2['Stage'].str.split(" ", n=1, expand=True)
edc_data_dfV2[['stage_name', 'stage_name_ext']] = stage_parts

# List the distinct leading words alphabetically, one per line
edc_stages_unique_list = sorted(set(edc_data_dfV2['stage_name']))
for stage in edc_stages_unique_list:
    print(stage)

Q-dance
bassCON
bassPOD
bassPod
bionicJungle
circuitGROUNDS
circuitGrounds
cosmicMEADOW
cosmicMeadow
discoveryPROJECT
discoverySTAGE
funkHOUSE
heinekenDOMES
kalliopeARTCAR
kineticFIELD
kineticField
neonGARDEN
neonGarden
quantumVALLEY
quantumValley
stage
stage7
stereoBloom
upsidedownHOUSE
upsidedownHouse
wasteLAND
wasteLand
wide
wideawakeARTCAR


In [None]:
# Function to convert date
def mdy_to_ymd(row):
    """Reformat a row's 'Date' value from 'Month D, YYYY' to 'YYYY-MM-DD'.

    Parameters
    ----------
    row : mapping-like (e.g. a dataframe row)
        Must expose a 'Date' entry such as 'June 26, 2009'.

    Returns
    -------
    str
        The same date formatted as 'YYYY-MM-DD'.

    Raises
    ------
    ValueError
        If the 'Date' text does not match the '%B %d, %Y' pattern.
    """
    parsed = datetime.strptime(row['Date'], '%B %d, %Y')
    return parsed.strftime('%Y-%m-%d')

# print(f"Dates After Cleaning:\n{sorted(set(edc_data_dfV2.apply(mdy_to_ymd, axis=1)))}")

# Function to clean Stages
def clean_stages(row):
    """Return the canonical spelling for a row's 'stage_name' value.

    Known casing variants (e.g. 'bassPod') and truncated first words
    ('stage', 'wide') are mapped to their canonical stage names; any other
    value is returned unchanged.

    Parameters
    ----------
    row : mapping-like (e.g. a dataframe row)
        Must expose a 'stage_name' entry.

    Returns
    -------
    The corrected stage name, or the original value when no correction applies.
    """
    # Variant spelling -> canonical spelling
    corrections = {
        'bassPod': 'bassPOD',
        'circuitGrounds': 'circuitGROUNDS',
        'cosmicMeadow': 'cosmicMEADOW',
        'kineticField': 'kineticFIELD',
        'neonGarden': 'neonGARDEN',
        'quantumValley': 'quantumVALLEY',
        'stereoBloom': 'stereoBLOOM',
        'upsidedownHouse': 'upsidedownHOUSE',
        'wasteLand': 'wasteLAND',
        'stage': 'stage7',
        'wide': 'wideawakeARTCAR',
    }
    name = row['stage_name']
    # Fall back to the value itself when there is no known correction
    return corrections.get(name, name)

In [None]:
# # check the list again 
# sorted(set(edc_data_dfV2.apply(clean_stages, axis=1)))

In [None]:
# edc_data_df