### Part 2
Perform Data Quality Analysis on Beatles data collected in data collection phase

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)



In [2]:
df = pd.read_pickle('BeatlesSongsMerged.pkl')
df.columns


Index(['song', 'core_catalogue_releases', 'songwriters', 'lead_vocals', 'year',
       'SongKey', 'first_release_date', 'max_key', 'mode', 'avg_danceability',
       'avg_energy', 'avg_loudness', 'avg_speechiness', 'avg_acousticness',
       'avg_instrumentalness', 'avg_liveness', 'avg_valence', 'avg_tempo',
       'avg_duration_ms', 'avg_time_signature', 'Song_y', 'lyrics',
       'avg_popularity', 'BB_name', 'BB_artist', 'BB_debut_date',
       'BB_peak_position', 'BB_peak_date', 'BB_weeks_on_chart',
       'RollingStonePosition', 'Top20_x', 'Next30_x', 'Bottom20_x',
       'RobSegment', 'Top20_y', 'Next30_y', 'Bottom20_y', 'LauraSegment',
       'Top20_x', 'Next30_x', 'Bottom20_x', 'EmilySegment', 'Top20_y',
       'Next30_y', 'Bottom20_y', 'OliviaSegment', 'Top20_x', 'Next30_x',
       'Bottom20_x', 'BrianSegment', 'Top20_y', 'Next30_y', 'Bottom20_y',
       'JackieSegment', 'Canonical_album', 'Year_y', 'first_vocalist',
       'Paul_Song', 'John_Song', 'George_Song', 'Ringo_Song',

## Do some basic data quality scans

In [3]:
# Display the first few rows of the DataFrame
##print(df.head())
print(df.head().to_string())


                  song core_catalogue_releases                     songwriters              lead_vocals  year            SongKey first_release_date  max_key  mode  avg_danceability  avg_energy  avg_loudness  avg_speechiness  avg_acousticness  avg_instrumentalness  avg_liveness  avg_valence   avg_tempo  avg_duration_ms  avg_time_signature               Song_y                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [4]:

# 1. Check for missing data
missing = df.isnull().sum()
print("Missing values:\n", missing)



Missing values:
 song                         0
core_catalogue_releases      0
songwriters                  0
lead_vocals                  0
year                         0
SongKey                      0
first_release_date           0
max_key                      9
mode                         9
avg_danceability             9
avg_energy                   9
avg_loudness                 9
avg_speechiness              9
avg_acousticness             9
avg_instrumentalness         9
avg_liveness                 9
avg_valence                  9
avg_tempo                    9
avg_duration_ms              9
avg_time_signature           9
Song_y                      12
lyrics                      12
avg_popularity               4
BB_name                    156
BB_artist                  156
BB_debut_date              156
BB_peak_position           156
BB_peak_date               156
BB_weeks_on_chart          156
RollingStonePosition       189
Top20_x                    195
Next30_x              

In [5]:
missing_avg_danceability = df[df['avg_danceability'].isnull()]
print(missing_avg_danceability)


                                                  song  \
16                                             Bad Boy   
78                                    I Call Your Name   
97                                            I'm Down   
110                       Kansas City/Hey-Hey-Hey-Hey!   
111  Komm, gib mir deine Hand  (German version of I...   
147                                 Please Mr. Postman   
168  Sie liebt dich  (German version of She Loves You)   
202                                          Yes It Is   
205              You Know My Name (Look Up the Number)   

                               core_catalogue_releases  \
16                                        Past Masters   
78                                        Past Masters   
97                      Past Masters (B-side of Help!)   
110                                   Beatles for Sale   
111                                       Past Masters   
147                                   With the Beatles   
168  Past Mas

In [6]:
missing_lyrics = df[df['lyrics'].isnull()]
print(missing_lyrics)




                                                  song  \
36                                    Dizzy Miss Lizzy   
50                                              Flying   
98                    I'm Happy Just to Dance with You   
110                       Kansas City/Hey-Hey-Hey-Hey!   
111  Komm, gib mir deine Hand  (German version of I...   
147                                 Please Mr. Postman   
150                                    P.S. I Love You   
153                                       Revolution 1   
154                                       Revolution 9   
168  Sie liebt dich  (German version of She Loves You)   
178                                    There's a Place   
180                                 Think for Yourself   

                               core_catalogue_releases  \
36                                               Help!   
50                                Magical Mystery Tour   
98                                  A Hard Day's Night   
110          

In [7]:
missing_avg_popularity = df[df['avg_popularity'].isnull()]
print(missing_avg_popularity)
## these will be dropped along with the songs with missing danceability

                                                  song  \
110                       Kansas City/Hey-Hey-Hey-Hey!   
111  Komm, gib mir deine Hand  (German version of I...   
147                                 Please Mr. Postman   
168  Sie liebt dich  (German version of She Loves You)   

                               core_catalogue_releases  \
110                                   Beatles for Sale   
111                                       Past Masters   
147                                   With the Beatles   
168  Past Masters (B-side of Komm, gib mir deine Hand)   

                                           songwriters       lead_vocals  \
110     Jerry Leiber Mike Stoller / Richard Penniman †         McCartney   
111        Lennon McCartney Jean Nicolas Heinz Hellmer  Lennon McCartney   
147  Georgia Dobbins William Garrett Brian Holland ...            Lennon   
168         Lennon McCartney Jean Nicolas Lee Montogue  Lennon McCartney   

     year                            

In [8]:
#drop rows where Spotify song metadata is missing. All of these songs are minor so it shouldn't affect the analysis and it's not worth hunting down a better solution
df = df.dropna(subset=['avg_danceability'])
df = df.dropna(subset=['lyrics'])


In [9]:

# 2. Check for duplicate rows
duplicates = df.duplicated().sum()
print("\nDuplicate rows: ", duplicates)





Duplicate rows:  0


In [10]:
# 3. Descriptive statistics for numeric columns
statistics = df.describe()
print("\nDescriptive Statistics:\n", statistics)




Descriptive Statistics:
           max_key        mode  avg_danceability  avg_energy  avg_loudness  \
count  198.000000  198.000000        198.000000  198.000000    198.000000   
mean     5.994949    0.919192          0.533346    0.549670     -9.991401   
std      3.371269    0.273231          0.125137    0.176141      2.185938   
min      0.000000    0.000000          0.245500    0.153575    -21.674000   
25%      4.000000    1.000000          0.450196    0.426229    -11.214562   
50%      7.000000    1.000000          0.530200    0.557250     -9.652250   
75%      9.000000    1.000000          0.634417    0.692250     -8.573000   
max     11.000000    1.000000          0.806200    0.888000     -6.201000   

       avg_speechiness  avg_acousticness  avg_instrumentalness  avg_liveness  \
count       198.000000        198.000000            198.000000    198.000000   
mean          0.050359          0.361892              0.056720      0.233173   
std           0.030674          0.253176

In [11]:
# 4. Checking unique values in columns
for column in df.columns:
    print("\nUnique values in column ", column, ": ", df[column].nunique())




Unique values in column  song :  198

Unique values in column  core_catalogue_releases :  32

Unique values in column  songwriters :  24

Unique values in column  lead_vocals :  13

Unique values in column  year :  11

Unique values in column  SongKey :  198

Unique values in column  first_release_date :  32

Unique values in column  max_key :  12

Unique values in column  mode :  2

Unique values in column  avg_danceability :  195

Unique values in column  avg_energy :  186

Unique values in column  avg_loudness :  198

Unique values in column  avg_speechiness :  184

Unique values in column  avg_acousticness :  195

Unique values in column  avg_instrumentalness :  138

Unique values in column  avg_liveness :  191

Unique values in column  avg_valence :  191

Unique values in column  avg_tempo :  198

Unique values in column  avg_duration_ms :  198

Unique values in column  avg_time_signature :  24

Unique values in column  Song_y :  198

Unique values in column  lyrics :  198

Uniqu

In [12]:
# 5. Checking the datatypes
datatypes = df.dtypes
print("\nData types:\n", datatypes)


Data types:
 song                               object
core_catalogue_releases            object
songwriters                        object
lead_vocals                        object
year                               object
SongKey                            object
first_release_date         datetime64[ns]
max_key                           float64
mode                              float64
avg_danceability                  float64
avg_energy                        float64
avg_loudness                      float64
avg_speechiness                   float64
avg_acousticness                  float64
avg_instrumentalness              float64
avg_liveness                      float64
avg_valence                       float64
avg_tempo                         float64
avg_duration_ms                   float64
avg_time_signature                float64
Song_y                             object
lyrics                             object
avg_popularity                    float64
BB_name             

In [13]:
df['first_release_date'] = pd.to_datetime(df['first_release_date'])
df['LauraSegment'] = df['LauraSegment'].astype('int64')
df['RobSegment'] = df['RobSegment'].astype('int64')
df['EmilySegment'] = df['EmilySegment'].astype('int64')
df['OliviaSegment'] = df['OliviaSegment'].astype('int64')
df['BrianSegment'] = df['BrianSegment'].astype('int64')
df['JackieSegment'] = df['JackieSegment'].astype('int64')
df['year'] = df['year'].astype('int64')






IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [14]:

df.dtypes

song                               object
core_catalogue_releases            object
songwriters                        object
lead_vocals                        object
year                               object
SongKey                            object
first_release_date         datetime64[ns]
max_key                           float64
mode                              float64
avg_danceability                  float64
avg_energy                        float64
avg_loudness                      float64
avg_speechiness                   float64
avg_acousticness                  float64
avg_instrumentalness              float64
avg_liveness                      float64
avg_valence                       float64
avg_tempo                         float64
avg_duration_ms                   float64
avg_time_signature                float64
Song_y                             object
lyrics                             object
avg_popularity                    float64
BB_name                           

In [15]:
print(df.shape)

(198, 63)


In [16]:

print(df.head(10).to_string())


                     song core_catalogue_releases                     songwriters              lead_vocals  year             SongKey first_release_date  max_key  mode  avg_danceability  avg_energy  avg_loudness  avg_speechiness  avg_acousticness  avg_instrumentalness  avg_liveness  avg_valence   avg_tempo  avg_duration_ms  avg_time_signature                  Song_y                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

In [17]:
#write out the pickle file after cleaning
pd.to_pickle(df, 'BeatlesSongsQCd.pkl')