In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [18]:
# Load the raw artist and song tables.
artists = pd.read_csv('../data/artist.csv')
songs = pd.read_csv('../data/songs.csv')

# Discard the first row of each table.
# NOTE(review): presumably a placeholder/duplicate record in the raw files — confirm.
# .copy() makes each result an independent frame, so the columns added to
# `artists` in later cells are written to it directly instead of to a slice of
# the frame returned by read_csv (the situation pandas reports with
# SettingWithCopyWarning).
artists = artists.iloc[1:, :].copy()
songs = songs.iloc[1:, :].copy()

In [19]:
# (rows, columns) of the artists table after the first row was dropped
artists.shape

(135, 4)

In [20]:
# (rows, columns) of the songs table after the first row was dropped
songs.shape

(2010, 7)

# Artists

In [21]:
# Preview the first rows of the artists table
artists.head()

Unnamed: 0,artist,description,images_url,followers_count
1,Post Malone,"Austin Richard Post (born July 4, 1995), bette...",https://images.genius.com/1010194fa644be099aa2...,4616.0
2,The Weeknd,"Abel Makkonen Tesfaye (born February 16, 1990 ...",https://images.genius.com/f0813e600d43b8b43c94...,8358.0
3,Roddy Ricch,"Rodrick Wayne Moore, Jr. (b. October 22, 1998)...",https://images.genius.com/78cf8142bcc24004ee49...,751.0
4,Rod Wave,"Rodarius Marcell Green (born August 27, 1998),...",https://images.genius.com/4a800d74ab2eb5821156...,265.0
5,Doja Cat,Amala Ratna Zandile Dlamini (born on October 2...,https://images.genius.com/2686a106bac2a8a7c1e9...,1156.0


# Songs

In [22]:
# Preview the first rows of the songs table
songs.head()

Unnamed: 0,song_name,artist,lyrics,description,accepted_annotations,contributors,pageviews
1,​​rockstar,Post Malone,"[Intro: Post Malone] Hahahahaha Tank God Ayy, ...","On “rockstar,” Post Malone compares his habits...",18.0,440.0,6884752.0
2,White Iverson,Post Malone,[Intro] Double OT I'm a new three [Chorus] Sau...,"Virtually unknown before this track, Post Malo...",31.0,222.0,4145011.0
3,Congratulations,Post Malone,"[Intro: Post Malone] Mm-mmm Yeah, yeah Mm-mmm ...","On “Congratulations,“ Post Malone and Quavo ce...",13.0,222.0,3634846.0
4,Psycho,Post Malone,"[Chorus: Post Malone] Damn, my AP goin' psycho...",“Psycho” is the third single from Post Malone’...,21.0,230.0,3082328.0
5,I Fall Apart,Post Malone,"[Intro] Ooh, I fall apart Ooh, yeah, mmm, yeah...","On “I Fall Apart,” Post reminisces about a hea...",7.0,126.0,2640796.0


# Data Cleaning: Artist Gender

In [23]:
# Functions to count how many gendered words appear in a description
def count_male(row):
    """Count male-gendered words in a row's ``description`` text.

    Matching is case-insensitive and done on whole words, so a word is counted
    wherever it appears: at the start or end of the text, next to punctuation
    ("him."), or repeated back to back. Words that merely contain one of the
    targets ("the", "she", "woman") are not counted.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'description'`` (e.g. a DataFrame row).

    Returns
    -------
    int
        Number of occurrences of: his, he, him, himself, man, actor.
        0 when the description is not a string (e.g. a missing value).
    """
    text = row['description']
    if not isinstance(text, str):
        # A missing description has no words to count; calling .lower() on a
        # non-string such as NaN would raise instead.
        return 0
    return len(re.findall(r"\b(?:his|he|him|himself|man|actor)\b", text.lower()))

def count_female(row):
    """Count female-gendered words in a row's ``description`` text.

    Matching is case-insensitive and done on whole words, so a word is counted
    wherever it appears: at the start or end of the text, next to punctuation
    ("her,"), or repeated back to back. Words that merely contain one of the
    targets ("here", "there") are not counted.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'description'`` (e.g. a DataFrame row).

    Returns
    -------
    int
        Number of occurrences of: her, hers, she, herself, woman, actress.
        0 when the description is not a string (e.g. a missing value).
    """
    text = row['description']
    if not isinstance(text, str):
        # A missing description has no words to count; calling .lower() on a
        # non-string such as NaN would raise instead.
        return 0
    return len(re.findall(r"\b(?:her|hers|she|herself|woman|actress)\b", text.lower()))

# Add one column per gender holding the per-artist gendered-word counts
artists = artists.assign(
    male_count=artists.apply(count_male, axis=1),
    female_count=artists.apply(count_female, axis=1),
)

In [24]:
# Derive an artist's gender label from the gendered-word counts
def get_gender(row):
    """Return "Male" or "Female" for whichever word count is larger.

    Reads ``row['male_count']`` and ``row['female_count']``. Returns None when
    neither count is strictly greater than the other.
    """
    n_male = row['male_count']
    n_female = row['female_count']
    if n_male > n_female:
        return "Male"
    if n_female > n_male:
        return "Female"
    return None
    
# Label every artist row with the gender inferred from its word counts
artists = artists.assign(gender=artists.apply(get_gender, axis=1))

In [25]:
# Manual gender labels for artists whose gender was missed
missed = {
    'Gabby Barrett': 'Female',
    'Sam Smith': 'Non Binary',
    'Demi Lovato': 'Non Binary',
}
# Names that have a manual label (the keys of `missed`, in the same order)
art_miss = list(missed)
# Look up each artist's manual label by name
artists["missed"] = artists["artist"].map(missed)

In [26]:
# Correcting mis-gendered artists
def correct_gender(row):
    """Return the manual gender label for a row when it has one.

    Reads ``row['missed']`` (the manual label) and ``row['gender']`` (the
    inferred label). The manual label wins whenever it is present; a missing
    value (NaN/None) in ``missed`` means there is no manual label, so the
    inferred gender is returned unchanged.

    The decision is made from the row itself rather than from a separate list
    of artist names, so it cannot fall out of sync with the label mapping.
    """
    override = row['missed']
    if pd.notna(override):
        return override
    return row['gender']
    
# Overwrite gender with the corrected value, then remove the helper columns
artists = (
    artists
    .assign(gender=artists.apply(correct_gender, axis=1))
    .drop(columns=['male_count', 'female_count', 'missed'])
)

# Data Cleaning: Artist Age

Most birth years are recorded like this: '(b. December 6, 1995)' towards the beginning of the description.

I am looking for the first occurrence of the pattern ' ####)' to get the artist's birth year.

In [27]:
# Get the first year that occurs in the text and check that it is not too recent
def birth_year(row):
    """Extract a candidate birth year from a row's ``description`` text.

    Takes the first standalone four-digit number in the description and
    returns it as an int if it is earlier than 2004.

    Returns None when:
    - the description is not a string (e.g. a missing value),
    - it contains no standalone four-digit number, or
    - the first such number is 2004 or later (later numbers are not tried).
    """
    text = row['description']
    if not isinstance(text, str):
        # re cannot search a non-string such as NaN; treat it as "no year".
        return None
    # The lookarounds require exactly four digits with no digit on either
    # side, so the leading digits of a longer number (e.g. "1500000") are not
    # mistaken for a year.
    found = re.search(r"(?<!\d)\d{4}(?!\d)", text)
    if found is None:
        return None
    year = int(found.group())
    return year if year < 2004 else None
    
# Extract a birth year for every artist row
artists = artists.assign(birth_year=artists.apply(birth_year, axis=1))

In [28]:
# Manual birth years for artists whose year was missed
missed_year = {
    'Taylor Swift': 1989,
    'Luke Combs': 1990,
    'Chris Brown': 1989,
    'Selena Gomez': 1992,
    'Maren Morris': 1990,
    'Halsey': 1994,
    'Tones and I': 2000,
    'Ed Sheeran': 1991,
    'Gabby Barrett': 2000,
    'Lady Gaga': 1986,
    'SAINt JHN': 1986,
    'Trevor Daniel': 1994,
    'Jason Aldean': 1977,
    'Arizona Zervas': 1995,
    '\u200bblackbear': 1990,
    'Moneybagg Yo': 1999,
    'Machine Gun Kelly': 1990,
    'NF': 1991,
    'Kenny Chesney': 1968,
    'Lana Del Rey': 1985,
    'Jason Derulo': 1989,
    'Ashe': 1993,
}

# Names that have a manual year (the keys of `missed_year`, in the same order)
year_miss = list(missed_year)

# Look up each artist's manual birth year by name
artists["missed_year"] = artists["artist"].map(missed_year)

In [29]:
# Correcting missed or wrong birth years
def correct_year(row):
    """Return the manual birth year for a row when it has one.

    Reads ``row['missed_year']`` (the manual year) and ``row['birth_year']``
    (the extracted year). The manual year wins whenever it is present; a
    missing value (NaN/None) in ``missed_year`` means there is no manual year,
    so the extracted year is returned unchanged.

    The decision is made from the row itself rather than from a separate list
    of artist names, so it cannot fall out of sync with the year mapping.
    """
    override = row['missed_year']
    if pd.notna(override):
        return override
    return row['birth_year']
    
# Overwrite birth_year with the corrected value, then remove the helper column
artists = (
    artists
    .assign(birth_year=artists.apply(correct_year, axis=1))
    .drop(columns=['missed_year'])
)

In [30]:
# Getting decade born
def decade(row):
    """Map ``row['birth_year']`` to a decade label such as "1990s".

    Labels cover the 1960s through the 2000s; the 2000s bucket stops at 2004.
    Any other value (including a missing year) yields None.
    """
    year = row['birth_year']
    for start in (1960, 1970, 1980, 1990, 2000):
        # Each bucket spans ten years, except the last, which ends at 2004.
        stop = min(start + 10, 2005)
        if year in range(start, stop):
            return "{}s".format(start)
    return None
    
# Label every artist row with the decade of birth
artists = artists.assign(birth_decade=artists.apply(decade, axis=1))

In [31]:
# Making buckets by 5 years
def five_years(row):
    """Map ``row['birth_year']`` to a five-year bucket label such as "1995-1999".

    Buckets run from "1960-1964" through "2000-2004". Any other value
    (including a missing year) yields None.
    """
    year = row['birth_year']
    for first in range(1960, 2005, 5):
        if year in range(first, first + 5):
            return "{}-{}".format(first, first + 4)
    return None
    
# Label every artist row with its five-year birth bucket
artists = artists.assign(birth_5_yr=artists.apply(five_years, axis=1))

# Data Cleaning: Genre

# Exporting Cleaned Data

In [32]:
# Write both tables to CSV without the index column
for out_path, frame in (
    ('../data/artist_cleaned.csv', artists),
    ('../data/songs_cleaned.csv', songs),
):
    frame.to_csv(out_path, index=False)