# Spotify Top 50 Tracks Data Processing with NumPy and Pandas

## Loading the data using Pandas

In [1]:
import pandas as pd

# Load the dataset
spotify_df = pd.read_csv('spotifytoptracks.csv')
spotify_df = spotify_df.drop(columns=['Unnamed: 0'])

# Display the first few rows of the dataframe
spotify_df.head()

Unnamed: 0,artist,album,track_name,track_id,energy,danceability,key,loudness,acousticness,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,genre
0,The Weeknd,After Hours,Blinding Lights,0VjIjW4GlUZAMYd2vXMi3b,0.73,0.514,1,-5.934,0.00146,0.0598,9.5e-05,0.0897,0.334,171.005,200040,R&B/Soul
1,Tones And I,Dance Monkey,Dance Monkey,1rgnBhdG2JDFTbYkYRZAku,0.593,0.825,6,-6.401,0.688,0.0988,0.000161,0.17,0.54,98.078,209755,Alternative/Indie
2,Roddy Ricch,Please Excuse Me For Being Antisocial,The Box,0nbXyq5TXYPCO7pr3N8S4I,0.586,0.896,10,-6.687,0.104,0.0559,0.0,0.79,0.642,116.971,196653,Hip-Hop/Rap
3,SAINt JHN,Roses (Imanbek Remix),Roses - Imanbek Remix,2Wo6QQD1KMDWeFkkjLqwx5,0.721,0.785,8,-5.457,0.0149,0.0506,0.00432,0.285,0.894,121.962,176219,Dance/Electronic
4,Dua Lipa,Future Nostalgia,Don't Start Now,3PfIrDoz19wz7qK7tYeu62,0.793,0.793,11,-4.521,0.0123,0.083,0.0,0.0951,0.679,123.95,183290,Nu-disco


## Data Cleaning for Handling missing values

In [2]:
# Check for missing values
missing_values = spotify_df.isnull().sum()

# Display the missing values count per column
print("Missing values in each column:")
print(missing_values)

# Drop rows with missing values (if any) spotify_cleaned_df = spotify_df.dropna()
# Alternatively, you could fill them with an appropriate value using spotify_df.fillna()

spotify_df.head()

Missing values in each column:
artist              0
album               0
track_name          0
track_id            0
energy              0
danceability        0
key                 0
loudness            0
acousticness        0
speechiness         0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
genre               0
dtype: int64


Unnamed: 0,artist,album,track_name,track_id,energy,danceability,key,loudness,acousticness,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,genre
0,The Weeknd,After Hours,Blinding Lights,0VjIjW4GlUZAMYd2vXMi3b,0.73,0.514,1,-5.934,0.00146,0.0598,9.5e-05,0.0897,0.334,171.005,200040,R&B/Soul
1,Tones And I,Dance Monkey,Dance Monkey,1rgnBhdG2JDFTbYkYRZAku,0.593,0.825,6,-6.401,0.688,0.0988,0.000161,0.17,0.54,98.078,209755,Alternative/Indie
2,Roddy Ricch,Please Excuse Me For Being Antisocial,The Box,0nbXyq5TXYPCO7pr3N8S4I,0.586,0.896,10,-6.687,0.104,0.0559,0.0,0.79,0.642,116.971,196653,Hip-Hop/Rap
3,SAINt JHN,Roses (Imanbek Remix),Roses - Imanbek Remix,2Wo6QQD1KMDWeFkkjLqwx5,0.721,0.785,8,-5.457,0.0149,0.0506,0.00432,0.285,0.894,121.962,176219,Dance/Electronic
4,Dua Lipa,Future Nostalgia,Don't Start Now,3PfIrDoz19wz7qK7tYeu62,0.793,0.793,11,-4.521,0.0123,0.083,0.0,0.0951,0.679,123.95,183290,Nu-disco


## Removing duplicate samples and features

In [3]:
# Define the subset of columns to check for duplicate rows
row_subset = ['track_name', 'artist']

# Check for duplicate rows based on the subset of columns
duplicate_rows = spotify_df.duplicated(subset=row_subset).sum()
print(f"Number of duplicate rows based on subset {row_subset}: {duplicate_rows}")

#To remove duplicate rows
spotify_df = spotify_df.drop_duplicates(subset=row_subset) 

# Define the subset of columns to check for duplicate columns
column_subset = ['energy', 'danceability', 'tempo']

# Check for duplicate columns within the defined subset
duplicate_columns = spotify_df[column_subset].T.duplicated().sum()
print(f"Number of duplicate columns based on subset {column_subset}: {duplicate_columns}")

# If any duplicate columns exist, remove them
if duplicate_columns > 0:
    # Remove duplicate columns only from the subset and retain unique columns
    unique_column_subset = spotify_df[column_subset].T.drop_duplicates().T.columns
    
    # Update the original dataframe to keep the unique subset of columns and the rest of the columns
    # Select the unique columns from the subset and concatenate them with the remaining non-subset columns
    remaining_columns = spotify_df.columns.difference(column_subset)
    spotify_df = spotify_df[remaining_columns.union(unique_column_subset, sort=False)]

# Display the cleaned dataframe after removing duplicates
spotify_df.head()


Number of duplicate rows based on subset ['track_name', 'artist']: 0
Number of duplicate columns based on subset ['energy', 'danceability', 'tempo']: 0


Unnamed: 0,artist,album,track_name,track_id,energy,danceability,key,loudness,acousticness,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,genre
0,The Weeknd,After Hours,Blinding Lights,0VjIjW4GlUZAMYd2vXMi3b,0.73,0.514,1,-5.934,0.00146,0.0598,9.5e-05,0.0897,0.334,171.005,200040,R&B/Soul
1,Tones And I,Dance Monkey,Dance Monkey,1rgnBhdG2JDFTbYkYRZAku,0.593,0.825,6,-6.401,0.688,0.0988,0.000161,0.17,0.54,98.078,209755,Alternative/Indie
2,Roddy Ricch,Please Excuse Me For Being Antisocial,The Box,0nbXyq5TXYPCO7pr3N8S4I,0.586,0.896,10,-6.687,0.104,0.0559,0.0,0.79,0.642,116.971,196653,Hip-Hop/Rap
3,SAINt JHN,Roses (Imanbek Remix),Roses - Imanbek Remix,2Wo6QQD1KMDWeFkkjLqwx5,0.721,0.785,8,-5.457,0.0149,0.0506,0.00432,0.285,0.894,121.962,176219,Dance/Electronic
4,Dua Lipa,Future Nostalgia,Don't Start Now,3PfIrDoz19wz7qK7tYeu62,0.793,0.793,11,-4.521,0.0123,0.083,0.0,0.0951,0.679,123.95,183290,Nu-disco


## Treating the outliers

In [4]:
import numpy as np

# Identify numerical columns (excluding non-numeric data like song names)
numerical_cols = spotify_df.select_dtypes(include=[np.number]).columns

# Function to treat outliers using IQR method  
# The IQR method is chosen because it helps manage outliers by capping extreme values, 
# ensuring we don't disproportionately affect the analysis.
def treat_outliers(df, col):
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    
    # Define the outlier bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Remove outliers by capping them at the lower and upper bounds
    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])

# Apply outlier treatment for each numerical column
for col in numerical_cols:
    treat_outliers(spotify_df, col)

# Display the dataset after treating outliers
spotify_df.head()

Unnamed: 0,artist,album,track_name,track_id,energy,danceability,key,loudness,acousticness,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,genre
0,The Weeknd,After Hours,Blinding Lights,0VjIjW4GlUZAMYd2vXMi3b,0.73,0.514,1.0,-5.934,0.00146,0.0598,5.1e-05,0.0897,0.334,171.005,200040.0,R&B/Soul
1,Tones And I,Dance Monkey,Dance Monkey,1rgnBhdG2JDFTbYkYRZAku,0.593,0.825,6.0,-6.401,0.667675,0.0988,5.1e-05,0.17,0.54,98.078,209755.0,Alternative/Indie
2,Roddy Ricch,Please Excuse Me For Being Antisocial,The Box,0nbXyq5TXYPCO7pr3N8S4I,0.586,0.896,10.0,-6.687,0.104,0.0559,0.0,0.5372,0.642,116.971,196653.0,Hip-Hop/Rap
3,SAINt JHN,Roses (Imanbek Remix),Roses - Imanbek Remix,2Wo6QQD1KMDWeFkkjLqwx5,0.721,0.785,8.0,-5.457,0.0149,0.0506,5.1e-05,0.285,0.894,121.962,176219.0,Dance/Electronic
4,Dua Lipa,Future Nostalgia,Don't Start Now,3PfIrDoz19wz7qK7tYeu62,0.793,0.793,11.0,-4.521,0.0123,0.083,0.0,0.0951,0.679,123.95,183290.0,Nu-disco


## Number of obserations are there in this dataset

In [5]:
# Get the number of observations (rows) and features (columns)
num_observations, num_features = spotify_df.shape
print(f"Number of observations (rows) in the dataset: {num_observations}")

Number of observations (rows) in the dataset: 50


## Number of features in the dataset

In [6]:
# Get the number of features (columns)
num_observations, num_features = spotify_df.shape
print(f"Number of features (columns) in the dataset: {num_features}")

Number of features (columns) in the dataset: 16


## Categorial features

In [7]:
# Identify categorical columns (usually with data type 'object' or 'category')
categorical_columns = spotify_df.select_dtypes(include=['object', 'category']).columns
print("Categorical features in the dataset:", categorical_columns)

Categorical features in the dataset: Index(['artist', 'album', 'track_name', 'track_id', 'genre'], dtype='object')


## Numeric features

In [8]:
# Identify numeric columns (with data types 'int64' or 'float64')
numeric_columns = spotify_df.select_dtypes(include=['int64', 'float64']).columns
print("Numeric features in the dataset:", numeric_columns)

Numeric features in the dataset: Index(['energy', 'danceability', 'key', 'loudness', 'acousticness',
       'speechiness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms'],
      dtype='object')


## Artists more than one popular track

In [9]:
# Group by artist and count the number of tracks for each artist
artist_track_counts = spotify_df['artist'].value_counts()

# Filter artists with more than one track
artists_with_multiple_tracks = artist_track_counts[artist_track_counts > 1]
print("Artists with more than one popular track:", artists_with_multiple_tracks)

Artists with more than one popular track: artist
Billie Eilish    3
Dua Lipa         3
Travis Scott     3
Justin Bieber    2
Harry Styles     2
Lewis Capaldi    2
Post Malone      2
Name: count, dtype: int64


## Most Popular Artist

In [10]:
# Count the number of tracks for each artist
artist_track_counts = spotify_df['artist'].value_counts()

# Find the most popular artist (the one with the maximum number of tracks)
most_popular_artist = artist_track_counts.idxmax()
most_popular_artist_count = artist_track_counts.max()
print(f"The most popular artist is: {most_popular_artist}")
print(f"Number of popular tracks by this artist: {most_popular_artist_count}")


The most popular artist is: Billie Eilish
Number of popular tracks by this artist: 3


## Number of Artists in total have their songs in the top 50

In [11]:
# Count the number of unique artists
total_unique_artists = spotify_df['artist'].nunique()
print(f"Total number of unique artists with songs in the top 50: {total_unique_artists}")


Total number of unique artists with songs in the top 50: 40


## Albums that have more than 1 popular track

In [12]:
# Group by album and count the number of tracks for each album
album_track_counts = spotify_df['album'].value_counts()

# Filter albums with more than one track
albums_with_multiple_tracks = album_track_counts[album_track_counts > 1]
print("Albums with more than one popular track:", albums_with_multiple_tracks)

Albums with more than one popular track: album
Future Nostalgia        3
Hollywood's Bleeding    2
Fine Line               2
Changes                 2
Name: count, dtype: int64


## Number of Albums in total have their songs in the top 50

In [13]:
# Count the number of unique albums
total_unique_albums = spotify_df['album'].nunique()
print(f"Total number of unique albums with songs in the top 50: {total_unique_albums}")


Total number of unique albums with songs in the top 50: 45


## Tracks have a danceability score above 0.7

In [14]:
# Filter tracks with a danceability score above 0.7
tracks_high_danceability = spotify_df[spotify_df['danceability'] > 0.7]
print("Tracks with a danceability score above 0.7:")
print(tracks_high_danceability[['track_name', 'danceability']])

Tracks with a danceability score above 0.7:
                                       track_name  danceability
1                                    Dance Monkey         0.825
2                                         The Box         0.896
3                           Roses - Imanbek Remix         0.785
4                                 Don't Start Now         0.793
5                    ROCKSTAR (feat. Roddy Ricch)         0.746
7                death bed (coffee for your head)         0.726
8                                         Falling         0.784
10                                           Tusa         0.803
13                                Blueberry Faygo         0.774
14                       Intentions (feat. Quavo)         0.806
15                                   Toosie Slide         0.830
17                                         Say So         0.787
18                                       Memories         0.764
19                     Life Is Good (feat. Drake)         0.

## Tracks have a danceability score below 0.4

In [15]:
# Filter tracks with a danceability score below 0.4
tracks_low_danceability = spotify_df[spotify_df['danceability'] < 0.4]
print("Tracks with a danceability score below 0.4:")
print(tracks_low_danceability[['track_name', 'danceability']])

Tracks with a danceability score below 0.4:
Empty DataFrame
Columns: [track_name, danceability]
Index: []


## Tracks have their loudness above -5

In [16]:
# Filter tracks with a loudness score above -5 dB
tracks_high_loudness = spotify_df[spotify_df['loudness'] > -5]
print("Tracks with a loudness score above -5 dB:")
print(tracks_high_loudness[['track_name', 'loudness']])

Tracks with a loudness score above -5 dB:
                                       track_name  loudness
4                                 Don't Start Now    -4.521
6                                Watermelon Sugar    -4.209
10                                           Tusa    -3.280
12                                        Circles    -3.497
16                                  Before You Go    -4.858
17                                         Say So    -4.577
21                                      Adore You    -3.675
23                         Mood (feat. iann dior)    -3.558
31                                 Break My Heart    -3.434
32                                       Dynamite    -4.410
33               Supalonely (feat. Gus Dapperton)    -4.746
35                Rain On Me (with Ariana Grande)    -3.764
37  Sunflower - Spider-Man: Into the Spider-Verse    -4.368
38                                          Hawái    -3.454
39                                        Ride It    -4.25

## Tracks have their loudness below -8

In [17]:
# Filter tracks with a loudness score below -8 dB
tracks_low_loudness = spotify_df[spotify_df['loudness'] < -8]
print("Tracks with a loudness score below -8 dB:")
print(tracks_low_loudness[['track_name', 'loudness']])

Tracks with a loudness score below -8 dB:
                                        track_name  loudness
7                 death bed (coffee for your head)    -8.765
8                                          Falling    -8.756
15                                    Toosie Slide    -8.820
20                Savage Love (Laxed - Siren Beat)    -8.520
24                             everything i wanted   -12.453
26                                         bad guy   -10.965
36                             HIGHEST IN THE ROOM    -8.764
44                            lovely (with Khalid)   -10.109
47  If the World Was Ending - feat. Julia Michaels   -10.086


## Longest Track

In [18]:
# Find the track with the maximum duration
longest_track = spotify_df.loc[spotify_df['duration_ms'].idxmax()]
print("The longest track is:")
print(f"Track: {longest_track['track_name']}")

The longest track is:
Track: Safaera


## Shortest Track

In [19]:
# Find the track with the minimum duration
shortest_track = spotify_df.loc[spotify_df['duration_ms'].idxmin()]
print("The shortest track is:")
print(f"Track: {shortest_track['track_name']}")

The shortest track is:
Track: Mood (feat. iann dior)


## Most Popular genre 

In [20]:
# Count the number of tracks for each genre
genre_counts = spotify_df['genre'].value_counts()

# Find the most popular genre (the one with the maximum count)
most_popular_genre = genre_counts.idxmax()
most_popular_genre_count = genre_counts.max()
print(f"The most popular genre is: {most_popular_genre}")
print(f"Number of tracks in this genre: {most_popular_genre_count}")

The most popular genre is: Pop
Number of tracks in this genre: 14


## Genres have just one song on the top 50

In [21]:
# Count the number of tracks for each genre
genre_counts = spotify_df['genre'].value_counts()

# Filter genres with exactly one track
genres_with_one_track = genre_counts[genre_counts == 1]
print("Genres with exactly one song in the top 50:")
print(genres_with_one_track)

Genres with exactly one song in the top 50:
genre
Nu-disco                              1
R&B/Hip-Hop alternative               1
Pop/Soft Rock                         1
Pop rap                               1
Hip-Hop/Trap                          1
Dance-pop/Disco                       1
Disco-pop                             1
Dreampop/Hip-Hop/R&B                  1
Alternative/reggaeton/experimental    1
Chamber pop                           1
Name: count, dtype: int64


## Total genres that are represented in the top 50

In [22]:
# Count the number of unique genres
total_unique_genres = spotify_df['genre'].nunique()

print(f"Total number of unique genres in the top 50: {total_unique_genres}")

Total number of unique genres in the top 50: 16


## Strongly positively correlated features

In [23]:
# Select only numeric columns
numeric_df = spotify_df.select_dtypes(include=['number'])

# Compute the correlation matrix for numeric columns
correlation_matrix = numeric_df.corr()

# Setting a threshold for "strongly positive" correlation (e.g., > 0.7)
threshold = 0.7
strong_positive_correlations = correlation_matrix[(correlation_matrix > threshold) & (correlation_matrix < 1)]

print("Strongly positively correlated features:")
strong_positive_correlations.stack().reset_index()

Strongly positively correlated features:


Unnamed: 0,level_0,level_1,0
0,energy,loudness,0.792028
1,loudness,energy,0.792028


## Strongly negatively correlated features

In [24]:
# Setting a threshold for "strongly negative" correlation (e.g., < -0.7)
threshold = -0.7
strong_negative_correlations = correlation_matrix[(correlation_matrix < threshold) & (correlation_matrix > -1)]

# Display strongly negatively correlated feature pairs
print("Strongly negatively correlated features:")
strong_negative_correlations.stack().reset_index()

Strongly negatively correlated features:


Unnamed: 0,level_0,level_1,0


## Not correlated features

In [25]:
# Setting a threshold for "not correlated" (e.g., between -0.1 and 0.1)
lower_threshold = -0.1
upper_threshold = 0.1
not_correlated = correlation_matrix[
    (correlation_matrix > lower_threshold) & (correlation_matrix < upper_threshold) &
    (correlation_matrix < 1)  # Exclude self-correlation
]

# Display pairs of features that are not correlated
print("Features that are not correlated:")
not_correlated.stack().reset_index()

Features that are not correlated:


Unnamed: 0,level_0,level_1,0
0,energy,key,0.062428
1,energy,speechiness,0.091117
2,energy,liveness,0.09975
3,energy,tempo,0.075191
4,energy,duration_ms,0.050667
5,danceability,instrumentalness,0.077316
6,danceability,liveness,-0.093917
7,danceability,duration_ms,-0.048013
8,key,energy,0.062428
9,key,loudness,-0.008325


## Danceability score comparision

In [26]:
# Define the genres of interest
genres_of_interest = ['Pop', 'Hip-Hop/Rap', 'Dance/Electronic', 'Alternative/Indie']

# Filter the dataset to include only the specified genres
filtered_df = spotify_df[spotify_df['genre'].isin(genres_of_interest)]

# Group by genre and calculate the average danceability score for each genre
danceability_comparison = filtered_df.groupby('genre')['danceability'].median().reset_index()
print("Average danceability scores by genre:")
danceability_comparison

Average danceability scores by genre:


Unnamed: 0,genre,danceability
0,Alternative/Indie,0.663
1,Dance/Electronic,0.785
2,Hip-Hop/Rap,0.774
3,Pop,0.69


## Loudness score comparision

In [27]:
# Group by genre and calculate the average loudness score for each genre
loudness_comparison = filtered_df.groupby('genre')['loudness'].median().reset_index()
print("Average loudness scores by genre:")
loudness_comparison

Average loudness scores by genre:


Unnamed: 0,genre,loudness
0,Alternative/Indie,-5.2685
1,Dance/Electronic,-5.457
2,Hip-Hop/Rap,-7.648
3,Pop,-6.6445


## Acousticness score comparision

In [28]:
# Group by genre and calculate the average acousticness score for each genre
acousticness_comparison = filtered_df.groupby('genre')['acousticness'].median().reset_index()
print("Average acousticness scores by genre:")
acousticness_comparison

Average acousticness scores by genre:


Unnamed: 0,genre,acousticness
0,Alternative/Indie,0.635837
1,Dance/Electronic,0.0686
2,Hip-Hop/Rap,0.145
3,Pop,0.259


## Conclusion

- **Track and Artist Popularity**:
  - Identified multiple tracks by the same artist.
  - Determined the most popular artist.
  - Calculated the total number of unique artists and albums with multiple tracks in the top 50.

- **Feature Analysis**:
  - Analyzed danceability and loudness scores to identify tracks with specific characteristics.
  - Assessed genre popularity to determine the most represented genres in the top 50.

- **Correlation Insights**:
  - Explored strong positive and negative correlations between features.
  - Provided insights into how different track attributes relate to each other.


##  Further Improvements

- **Time Series Analysis**:
  - Analyze trends over time to understand how track popularity evolves.

- **Deeper Genre Analysis**:
  - Explore sub-genres and their impact on track success to gain more nuanced insights.

- **User Engagement Metrics**:
  - Include data on user engagement (e.g., play counts, skips) for a more comprehensive understanding of hit factors.