### Part 2
Perform a data quality analysis on the Beatles data collected in the data collection phase.

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Lift pandas' display limits so printed frames show every row and column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)



In [21]:
# Load the merged Beatles song table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; this presumably reads a
# file written by the earlier data-collection notebook -- confirm it is trusted.
merged_pickle_path = 'BeatlesSongsMerged.pkl'
df = pd.read_pickle(merged_pickle_path)
# Last expression of the cell: display the column index.
df.columns


Index(['song', 'core_catalogue_releases', 'songwriters', 'lead_vocals', 'year',
       'SongKey', 'first_release_date', 'max_key', 'mode', 'avg_danceability',
       'avg_energy', 'avg_loudness', 'avg_speechiness', 'avg_acousticness',
       'avg_instrumentalness', 'avg_liveness', 'avg_valence', 'avg_tempo',
       'avg_duration_ms', 'avg_time_signature', 'lyrics', 'avg_popularity',
       'BB_name', 'BB_artist', 'BB_debut_date', 'BB_peak_position',
       'BB_peak_date', 'BB_weeks_on_chart', 'RollingStonePosition',
       'RobSegment', 'LauraSegment', 'EmilySegment', 'OliviaSegment',
       'BrianSegment', 'JackieSegment', 'Streams', 'Daily', 'Canonical_album',
       'first_vocalist', 'Paul_Song', 'John_Song', 'George_Song', 'Ringo_Song',
       'cover', 'Era'],
      dtype='object')

## Do some basic data quality scans

In [22]:
# Preview the first five rows. to_string() renders the whole slice as plain
# text rather than using the default DataFrame repr.
preview = df.head(5)
print(preview.to_string())


                  song core_catalogue_releases                     songwriters              lead_vocals  year            SongKey first_release_date  max_key  mode  avg_danceability  avg_energy  avg_loudness  avg_speechiness  avg_acousticness  avg_instrumentalness  avg_liveness  avg_valence   avg_tempo  avg_duration_ms  avg_time_signature                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [23]:

# 1. Check for missing data
# Count the null entries in every column.
missing = df.isna().sum()
print("Missing values:\n", missing)



Missing values:
 song                         0
core_catalogue_releases      0
songwriters                  0
lead_vocals                  0
year                         0
SongKey                      0
first_release_date           0
max_key                     11
mode                        11
avg_danceability            11
avg_energy                  11
avg_loudness                11
avg_speechiness             11
avg_acousticness            11
avg_instrumentalness        11
avg_liveness                11
avg_valence                 11
avg_tempo                   11
avg_duration_ms             11
avg_time_signature          11
lyrics                      12
avg_popularity               4
BB_name                    156
BB_artist                  156
BB_debut_date              156
BB_peak_position           156
BB_peak_date               156
BB_weeks_on_chart          156
RollingStonePosition       189
RobSegment                   0
LauraSegment                 0
EmilySegment          

In [24]:
# List the rows whose avg_danceability value is null.
danceability_is_null = df['avg_danceability'].isna()
missing_avg_danceability = df.loc[danceability_is_null]
print(missing_avg_danceability)


                                                  song  \
16                                             Bad Boy   
31                                         Day Tripper   
78                                    I Call Your Name   
97                                            I'm Down   
110                       Kansas City/Hey-Hey-Hey-Hey!   
111  Komm, gib mir deine Hand  (German version of I...   
147                                 Please Mr. Postman   
168  Sie liebt dich  (German version of She Loves You)   
188                                 We Can Work It Out   
202                                          Yes It Is   
205              You Know My Name (Look Up the Number)   

                               core_catalogue_releases  \
16                                        Past Masters   
31   Past Masters (Double A-side with We Can Work I...   
78                                        Past Masters   
97                      Past Masters (B-side of Help!)   
110          

In [25]:
# List the rows whose lyrics value is null.
lyrics_is_null = df['lyrics'].isna()
missing_lyrics = df.loc[lyrics_is_null]
print(missing_lyrics)




                                                  song  \
36                                    Dizzy Miss Lizzy   
50                                              Flying   
98                    I'm Happy Just to Dance with You   
110                       Kansas City/Hey-Hey-Hey-Hey!   
111  Komm, gib mir deine Hand  (German version of I...   
147                                 Please Mr. Postman   
150                                    P.S. I Love You   
153                                       Revolution 1   
154                                       Revolution 9   
168  Sie liebt dich  (German version of She Loves You)   
178                                    There's a Place   
180                                 Think for Yourself   

                               core_catalogue_releases  \
36                                               Help!   
50                                Magical Mystery Tour   
98                                  A Hard Day's Night   
110          

In [26]:
# List the rows whose avg_popularity value is null.
# These will be dropped along with the songs with missing danceability.
popularity_is_null = df['avg_popularity'].isna()
missing_avg_popularity = df.loc[popularity_is_null]
print(missing_avg_popularity)

                                                  song  \
110                       Kansas City/Hey-Hey-Hey-Hey!   
111  Komm, gib mir deine Hand  (German version of I...   
147                                 Please Mr. Postman   
168  Sie liebt dich  (German version of She Loves You)   

                               core_catalogue_releases  \
110                                   Beatles for Sale   
111                                       Past Masters   
147                                   With the Beatles   
168  Past Masters (B-side of Komm, gib mir deine Hand)   

                                           songwriters       lead_vocals  \
110     Jerry Leiber Mike Stoller / Richard Penniman †         McCartney   
111        Lennon McCartney Jean Nicolas Heinz Hellmer  Lennon McCartney   
147  Georgia Dobbins William Garrett Brian Holland ...            Lennon   
168         Lennon McCartney Jean Nicolas Lee Montogue  Lennon McCartney   

     year                            

In [27]:
# Drop every row that is missing Spotify song metadata (avg_danceability) or
# lyrics. All of these songs are minor, so removing them shouldn't affect the
# analysis and it's not worth hunting down a better solution.
df = df.dropna(subset=['avg_danceability', 'lyrics'])


In [28]:

# 2. Check for duplicate rows
# duplicated() flags rows that repeat an earlier row; summing counts them.
is_duplicate_row = df.duplicated()
duplicates = is_duplicate_row.sum()
print("\nDuplicate rows: ", duplicates)





Duplicate rows:  0


In [29]:
# 3. Descriptive statistics for numeric columns
# describe() summarises count, mean, std, min, quartiles and max per column.
statistics = df.describe()
statistics_header = "\nDescriptive Statistics:\n"
print(statistics_header, statistics)




Descriptive Statistics:
           max_key        mode  avg_danceability  avg_energy  avg_loudness  \
count  196.000000  196.000000        196.000000  196.000000    196.000000   
mean     5.989796    0.923469          0.532667    0.544361    -10.135418   
std      3.364962    0.266526          0.125867    0.173924      2.144649   
min      0.000000    0.000000          0.245500    0.153575    -21.674000   
25%      4.000000    1.000000          0.451414    0.425312    -11.332625   
50%      7.000000    1.000000          0.530200    0.553286     -9.835583   
75%      9.000000    1.000000          0.632750    0.680167     -8.752750   
max     11.000000    1.000000          0.806200    0.888000     -6.201000   

       avg_speechiness  avg_acousticness  avg_instrumentalness  avg_liveness  \
count       196.000000        196.000000            196.000000    196.000000   
mean          0.051194          0.368506              0.058036      0.238097   
std           0.031757          0.255087

In [30]:
# 4. Checking unique values in columns
# Iterating a DataFrame yields its column labels; report the number of
# distinct values found in each column.
for column in df:
    unique_count = df[column].nunique()
    print("\nUnique values in column ", column, ": ", unique_count)




Unique values in column  song :  196

Unique values in column  core_catalogue_releases :  30

Unique values in column  songwriters :  24

Unique values in column  lead_vocals :  13

Unique values in column  year :  11

Unique values in column  SongKey :  196

Unique values in column  first_release_date :  31

Unique values in column  max_key :  12

Unique values in column  mode :  2

Unique values in column  avg_danceability :  191

Unique values in column  avg_energy :  187

Unique values in column  avg_loudness :  196

Unique values in column  avg_speechiness :  181

Unique values in column  avg_acousticness :  192

Unique values in column  avg_instrumentalness :  135

Unique values in column  avg_liveness :  188

Unique values in column  avg_valence :  189

Unique values in column  avg_tempo :  196

Unique values in column  avg_duration_ms :  196

Unique values in column  avg_time_signature :  23

Unique values in column  lyrics :  196

Unique values in column  avg_popularity :  12

In [31]:
# 5. Checking the datatypes
# Capture the dtype of every column, then print the listing under a header.
datatypes = df.dtypes
datatypes_header = "\nData types:\n"
print(datatypes_header, datatypes)


Data types:
 song                               object
core_catalogue_releases            object
songwriters                        object
lead_vocals                        object
year                               object
SongKey                            object
first_release_date         datetime64[ns]
max_key                           float64
mode                              float64
avg_danceability                  float64
avg_energy                        float64
avg_loudness                      float64
avg_speechiness                   float64
avg_acousticness                  float64
avg_instrumentalness              float64
avg_liveness                      float64
avg_valence                       float64
avg_tempo                         float64
avg_duration_ms                   float64
avg_time_signature                float64
lyrics                             object
avg_popularity                    float64
BB_name                            object
BB_artist           

In [32]:
# Normalise column dtypes in a single pass instead of seven copy-pasted
# per-column assignments. assign()/astype() build a new frame, so nothing is
# written column-by-column onto the frame returned by the earlier dropna step,
# and re-running the cell gives the same result.
#
# - first_release_date is parsed with pd.to_datetime.
# - The six *Segment columns and year are cast to int64; astype raises if a
#   value cannot be converted, which surfaces bad rows here rather than later.
int64_columns = [
    'LauraSegment', 'RobSegment', 'EmilySegment',
    'OliviaSegment', 'BrianSegment', 'JackieSegment',
    'year',
]
df = (
    df
    .assign(first_release_date=pd.to_datetime(df['first_release_date']))
    .astype({column: 'int64' for column in int64_columns})
)






In [39]:
# Spot-check the row(s) for the song 'Real Love'.
is_real_love = df['song'] == 'Real Love'
real_love_df = df.loc[is_real_love]
# Last expression of the cell: display the selection.
real_love_df

Unnamed: 0,song,core_catalogue_releases,songwriters,lead_vocals,year,SongKey,first_release_date,max_key,mode,avg_danceability,avg_energy,avg_loudness,avg_speechiness,avg_acousticness,avg_instrumentalness,avg_liveness,avg_valence,avg_tempo,avg_duration_ms,avg_time_signature,lyrics,avg_popularity,BB_name,BB_artist,BB_debut_date,BB_peak_position,BB_peak_date,BB_weeks_on_chart,RollingStonePosition,RobSegment,LauraSegment,EmilySegment,OliviaSegment,BrianSegment,JackieSegment,Streams,Daily,Canonical_album,first_vocalist,Paul_Song,John_Song,George_Song,Ringo_Song,cover,Era
213,Real Love,Anthology 2,Lennon,Lennon,1996,reallove,1996-03-04,8.0,1.0,0.375,0.694,-7.334,0.031,0.0458,0.019,0.257,0.405,175.726,234053.0,4.0,"All my little plans and schemes Lost like some forgotten dream Seems like all I really was doing Was waiting for you Just like little girls and boys Playing with their little toys Seems like all they really were doing Was waiting for you Don't need to be alone No need to be alone It's real love It's real, yes it's real love It's real From this moment on I know Exactly where my life will go Seems that all I really was doing Was waiting for love Don't need to be afraid No need to be afraid It's real love It's real, yes it's real love It's real Thought I'd been in love before, But in my heart I wanted more Seems like all I really was doing Was waiting for you Don't need to be alone No need to be alone It's real love Yes it's real, yes it's real love It's real, yes it's real love...",53.0,Real Love,The Beatles,1996-03-23,11,1996-03-23,7,,4,3,3,3,3,3,12255131.0,6996.0,Anthology 2,Lennon,0,1,0,0,0,Group of Solo Artists


In [35]:

# Display the dtype of every column (last expression of the cell).
df.dtypes

song                               object
core_catalogue_releases            object
songwriters                        object
lead_vocals                        object
year                                int64
SongKey                            object
first_release_date         datetime64[ns]
max_key                           float64
mode                              float64
avg_danceability                  float64
avg_energy                        float64
avg_loudness                      float64
avg_speechiness                   float64
avg_acousticness                  float64
avg_instrumentalness              float64
avg_liveness                      float64
avg_valence                       float64
avg_tempo                         float64
avg_duration_ms                   float64
avg_time_signature                float64
lyrics                             object
avg_popularity                    float64
BB_name                            object
BB_artist                         

In [36]:
# Print the frame's shape as a (rows, columns) tuple.
print(df.shape)

(196, 45)


In [37]:

# Print the first ten rows as plain text via to_string().
first_ten_rows = df.iloc[:10]
print(first_ten_rows.to_string())


                     song core_catalogue_releases                     songwriters              lead_vocals  year             SongKey first_release_date  max_key  mode  avg_danceability  avg_energy  avg_loudness  avg_speechiness  avg_acousticness  avg_instrumentalness  avg_liveness  avg_valence   avg_tempo  avg_duration_ms  avg_time_signature                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [38]:
# Persist the cleaned frame as a pickle file.
df.to_pickle('BeatlesSongsQCd.pkl')