<a href="https://www.kaggle.com/code/shruthiiiee/spotify-data-analysis-and-visualization?scriptVersionId=144196802" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input directory tree and print the full path of every file
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Load the songs CSV from the Kaggle input directory into a DataFrame
DATA_PATH = '/kaggle/input/top-hits-spotify-from-20002019/songs_normalize.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows as a quick sanity check of the load
print(df.head(5))

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the data.
# Left as a bare trailing expression so the notebook renders the table.
df.describe()

In [None]:
# Summarise the columns, their dtypes and non-null counts.
# DataFrame.info() writes the report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Tally the null entries per column and show the totals
missing_values = df.isna().sum()
print(missing_values)

In [None]:
# Flag rows that repeat an earlier row, then count flagged vs. unflagged rows
duplicate_flags = df.duplicated()
duplicate_flags.value_counts()

In [None]:
# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Build the correlation matrix from numeric and boolean columns only.
# Columns such as artist, song and genre appear to hold text; since
# pandas 2.0 DataFrame.corr() no longer drops non-numeric columns silently
# and raises instead, so select the usable columns explicitly.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()

# Render the matrix as an annotated heatmap
plt.figure(figsize=(11, 9))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Song attributes to plot, one histogram per panel
attributes = ['popularity', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'liveness', 'valence', 'tempo']

# 3x3 grid of panels with some breathing room between them
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(10, 7))
fig.subplots_adjust(wspace=0.3, hspace=0.5)

# axes.flat walks the grid row by row, matching the order of `attributes`
for panel, attr in zip(axes.flat, attributes):
    label = attr.capitalize()
    panel.hist(df[attr], bins=20, color='skyblue', edgecolor='skyblue')
    panel.set_title(label)
    panel.set_xlabel(label)
    panel.set_ylabel('Frequency')

fig.tight_layout()
plt.show()


In [None]:
# Bar chart of how many songs carry each genre label
genre_counts = df['genre'].value_counts()

fig, ax = plt.subplots(figsize=(11, 7))
sns.barplot(x=genre_counts.index, y=genre_counts.values, palette='viridis', ax=ax)
ax.set_xlabel('Genre')
ax.set_ylabel('Number of Songs')
ax.set_title('Genre Distribution')
# Tilt the genre labels so long names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Number of songs per year, ordered chronologically
year_counts = df['year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(11, 7))

# Line with markers, plus a translucent fill underneath it
ax.plot(year_counts.index, year_counts.values, marker='o', color='b', label='Number of Songs')
ax.fill_between(year_counts.index, year_counts.values, color='blue', alpha=0.3)

ax.set_xlabel('Year')
ax.set_ylabel('Number of Songs')
ax.set_title('Number of Songs by Year')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()



In [None]:
# Ten artists with the largest number of songs in the dataset
top_artists = df['artist'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(8, 4))
top_artists.plot.bar(color='skyblue', ax=ax)
ax.set_xlabel('Artist')
ax.set_ylabel('Number of Hit Songs')
ax.set_title('Top Artists with Most Hit Songs')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


In [None]:
# Mean popularity of the songs under each genre label
genre_popularity = df.groupby('genre')['popularity'].mean()

# Genre with the highest mean popularity, and where its bar sits on the axis
most_popular_genre = genre_popularity.idxmax()
peak_position = genre_popularity.index.get_loc(most_popular_genre)
peak_value = genre_popularity.loc[most_popular_genre]

fig, ax = plt.subplots(figsize=(11, 7))
genre_popularity.plot.bar(color='skyblue', ax=ax)
ax.set_xlabel('Genre')
ax.set_ylabel('Mean Popularity')
ax.set_title('Mean Popularity by Genre')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()

# Point an arrow at the top of the winning genre's bar
ax.annotate(f'Most Popular: {most_popular_genre}',
            xy=(peak_position, peak_value),
            xytext=(10, -20),
            textcoords='offset points',
            arrowprops=dict(arrowstyle='->', color='red'))

plt.show()


In [None]:
# Pick the most popular song of each year.
# idxmax() returns, per year, the index label of the first row holding the
# maximum popularity (same tie-break as nlargest(1)); df.loc then pulls those
# full rows in ascending year order. This replaces groupby(...).apply(...),
# whose handling of the grouping column is deprecated since pandas 2.2: once
# 'year' is no longer passed to the applied function, reset_index(drop=True)
# would discard it and the plot below would fail.
# Relies on df having unique index labels (true for a default read_csv index,
# also after drop_duplicates).
top_rows = df.groupby('year')['popularity'].idxmax()
top_song_by_year = df.loc[top_rows].reset_index(drop=True)

# Bar chart of the popularity score of each year's top song
plt.figure(figsize=(12, 4))
plt.bar(top_song_by_year['year'], top_song_by_year['popularity'], color='skyblue', label='Top Song')
plt.xlabel('Year')
plt.ylabel('Popularity')
plt.title('Top Songs of Each Year')
plt.legend()
plt.xticks(rotation=45, ha='right')

# Label each bar with the title of that year's top song
for row in top_song_by_year.itertuples(index=False):
    plt.annotate(row.song, xy=(row.year, row.popularity), xytext=(5, 5),
                 textcoords='offset points', fontsize=8, rotation=40)

plt.tight_layout()

plt.show()


In [None]:
# Ten rows with the highest popularity score; the first one is the overall top
top_10_songs = df.nlargest(10, 'popularity')
top_song = top_10_songs.iloc[0]

# Horizontal bars: one per song title, length = popularity
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_10_songs['song'], top_10_songs['popularity'], color='skyblue')
ax.set_xlabel('Popularity')
ax.set_ylabel('Song')
ax.set_title('Top 10 Songs of All Time')

# Text label placed just past the end of the top song's bar
ax.annotate(f"Top song of all time: {top_song['song']}",
            xy=(top_song['popularity'], top_song['song']),
            xytext=(5, 0),
            textcoords='offset points')
fig.tight_layout()

plt.show()


In [None]:

# Share of songs flagged explicit, and the complementary share
explicit_percentage = (df['explicit'].sum() / len(df)) * 100
non_explicit_percentage = 100 - explicit_percentage

# Pie inputs: the explicit wedge is pulled out slightly via `explode`
labels = ['Explicit', 'Non-Explicit']
sizes = [explicit_percentage, non_explicit_percentage]
colors = ['skyblue', 'blue']
explode = (0.1, 0)

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140, pctdistance=0.85,
       wedgeprops={'edgecolor': 'gray'}, explode=explode)
ax.set_title('Percentage of Explicit in Songs')
fig.tight_layout()

# A white disc over the centre turns the pie into a donut
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax.add_artist(centre_circle)

plt.show()



In [None]:
# Per-year mean of the explicit flag, scaled to a percentage
explicit_percentage_by_year = df.groupby('year')['explicit'].mean() * 100

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(explicit_percentage_by_year.index, explicit_percentage_by_year.values, marker='o', color='b')
ax.set_xlabel('Year')
ax.set_ylabel('Percentage of Explicit Songs')
ax.set_title('Rate of Explicit Songs Increasing with Years')
# Fix the y-axis to the full 0-100 percentage range
ax.set_ylim(0, 100)
fig.tight_layout()

plt.show()


In [None]:
# Ten most popular songs among those whose explicit flag equals 1
explicit_only = df.loc[df['explicit'] == 1]
top_explicit_songs = explicit_only.nlargest(10, 'popularity')

# Scatter of popularity per song; marker size and colour both follow popularity
fig, ax = plt.subplots(figsize=(10, 6))
popularity = top_explicit_songs['popularity']
points = ax.scatter(top_explicit_songs['song'], popularity, s=popularity*0.2,
                    c=popularity, cmap='Oranges', alpha=0.7)
ax.set_xlabel('Song')
ax.set_ylabel('Popularity')
ax.set_title('Top 10 Explicit Songs by Popularity')
fig.colorbar(points, ax=ax, label='Popularity')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()

plt.show()


In [None]:
# Number of explicit songs per year, plus the year holding the maximum
explicit_counts_by_year = df.groupby('year')['explicit'].sum()
year_with_most_explicit = explicit_counts_by_year.idxmax()
most_explicit_count = explicit_counts_by_year.max()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(explicit_counts_by_year.index, explicit_counts_by_year.values, color='skyblue')
# Redraw the peak year's bar in orange on top so it stands out
ax.bar(year_with_most_explicit, most_explicit_count, color='orange')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Explicit Songs')
ax.set_title('Year with Most Explicit Songs')
ax.annotate(f'Most Explicit Songs ({most_explicit_count} songs)',
            xy=(year_with_most_explicit, most_explicit_count), xytext=(-20, 15),
            textcoords='offset points', color='orange')
fig.tight_layout()

plt.show()


**Conclusion**

*Top 3 songs of all time:*

* Sweater Weather
* Another love 
* Without me 

*Top artists of all time:*

* Rihanna 
* Drake 

*Most popular genre:*

Pop, rock, folk/acoustic 
