# This notebook contains code that applies initial cleaning to the dataset

In [2]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

# Raise pandas' display limits so long and wide frames are shown without
# being truncated in cell output.
pd.options.display.max_rows = 20000
pd.options.display.max_columns = 500

### Clean the randomly picked songs - random songs from the same time period used in analysis


In [3]:
# Load the randomly picked songs from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='random_picked_songs.csv')

In [4]:
# Keep only the songs that did not appear on Billboard (bb_appear equal to 0).
df = df.loc[df['bb_appear'].eq(0)]

In [44]:
# Eliminating discrepancies in the release date field.
def _normalize_release_date(raw):
    """Rewrite an 'm/d/yy' release date as 'm/d/20yy'.

    Returns None for any value that cannot be normalized:
      * non-string values (e.g. NaN produced by an empty CSV cell),
      * strings of four characters or fewer (presumably year-only values),
      * strings with fewer than three '/'-separated parts,
      * dates whose third part is not an integer below 20.
    """
    if not isinstance(raw, str) or len(raw) <= 4:
        return None
    parts = raw.split('/')
    if len(parts) < 3:
        return None
    try:
        year_suffix = int(parts[2])
    except ValueError:
        return None
    if year_suffix >= 20:
        return None
    return parts[0] + '/' + parts[1] + '/20' + parts[2]


# Normalize every date once, then drop the rows that could not be normalized.
# Building a new frame with assign() avoids writing into a filtered slice, and
# filtering after the rewrite keeps placeholder values away from the
# "%m/%d/%Y" parsing done when days-since-release is calculated.
df = df.assign(release_date=df['release_date'].apply(_normalize_release_date))
df = df[df['release_date'].notna()]

In [47]:
# calculating days since release
def calc_days_since_release(x, today=None):
    """Return the number of whole calendar days between a release date and today.

    Parameters
    ----------
    x : str
        Release date formatted as "%m/%d/%Y" (e.g. "01/31/2018").
    today : datetime.datetime or datetime.date, optional
        Reference day to measure against. Defaults to the current local date.
        Passing it explicitly makes the result reproducible.

    Returns
    -------
    int
        Absolute difference in calendar days; a release date in the future
        therefore also yields a positive number.

    Raises
    ------
    ValueError
        If `x` does not match the "%m/%d/%Y" format.
    """
    reference_day = datetime.today() if today is None else today
    # datetime is a subclass of date, so strip the time-of-day when present.
    if isinstance(reference_day, datetime):
        reference_day = reference_day.date()
    release_day = datetime.strptime(x, "%m/%d/%Y").date()
    # Compare dates, not datetimes: a negative timedelta's .days is floored
    # (-2.5 days -> -3), which overstated past releases by one day.
    return abs((reference_day - release_day).days)

# One value per row, computed element-wise from the release date string.
df['days_since_release'] = df['release_date'].map(calc_days_since_release)

In [54]:
# Narrow the frame to the required columns, in this exact order.
# NOTE(review): 'release_date' is not kept here, yet a later cell reads
# row['release_date'] from `df` — confirm the intended cell order.
df = df[[
    'acousticness',
    'album',
    'artist',
    'artist_followers',
    'artist_popularity',
    'danceability',
    'duration',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'n_markets',
    'speechiness',
    'tempo',
    'time_signature',
    'track_name',
    'popularity',
    'valence',
    'n_weeks',
    'days_since_release',
    'bb_appear',
]]

In [194]:
# Keep only the songs whose release date mentions 2015, 2016, 2017 or 2018.
# A single vectorized mask replaces the row-by-row DataFrame.append loop:
# append was removed in pandas 2.0, was quadratic in the number of rows, and
# upcast dtypes. The result keeps df's columns and dtypes even when no row
# matches; non-string dates count as "no match".
# NOTE(review): `df` must still contain 'release_date' at this point; the
# column-selection cell earlier in the notebook does not list it — confirm.
release_years = ('2015', '2016', '2017', '2018')
released_in_window = df['release_date'].str.contains('|'.join(release_years), na=False)
# .copy() so later column assignments on df_t do not write into a view of df.
df_t = df[released_in_window].copy()

In [198]:
# Calculating days since release for the filtered songs.
df_t['days_since_release'] = df_t['release_date'].map(calc_days_since_release)

In [200]:
# Remove identifier and raw-metadata columns that are not needed any further.
df_t = df_t.drop(columns=[
    'Unnamed: 0',
    'analysis_url',
    'album_id',
    'artist_id',
    'duration_ms',
    'record_label',
    'release_date',
    'track_id',
])

In [201]:
# Flag as the string '1' every row whose 'weeks' value differs from 0
# (missing values also compare as different), and '0' otherwise.
# NOTE(review): 'weeks' is only created by the rename applied to `df` in the
# final cell; the earlier column selection lists 'n_weeks' — confirm that
# df_t really has a 'weeks' column here.
df_t['bb_appear'] = df_t['weeks'].ne(0).map({True: '1', False: '0'})

In [202]:
# Give three features their final names, then write the frame back to disk.
# NOTE(review): this overwrites the same file the notebook loads at the top,
# now with renamed columns and the index written as an extra unnamed column;
# it also saves `df`, not the filtered `df_t` — confirm both are intended.
df = df.rename(columns=dict(
    n_weeks='weeks',
    album='album_name',
    popularity='track_popularity',
))
df.to_csv(path_or_buf='random_picked_songs.csv')

(4812, 25)