In [None]:
# Opening the .csv files as a DataFrame
# Importing pandas and numpy for usage throughout as pd and np respectively
import pandas as pd
import numpy as np
from collections import Counter

# Importing matplotlib and seaborn for visualisations & plotting
import matplotlib.pyplot as plt
import seaborn as sns

# For lineplot functionality, seaborn needs to be on this version:
# conda install -c anaconda seaborn=0.9.0

# Importing to show plots in the notebook
%matplotlib inline

# Directory that holds the dataset's CSV files; change this one constant to
# point the notebook at a different location.
DATA_DIR = "../input/spotify-dataset-19212020-160k-tracks"

# Paths of the individual CSV files inside DATA_DIR
dataFile = DATA_DIR + "/data.csv"
dataFileByArtist = DATA_DIR + "/data_by_artist.csv"
dataFileByGenres = DATA_DIR + "/data_by_genres.csv"
dataFileByYear = DATA_DIR + "/data_by_year.csv"
dataFileWithGenre = DATA_DIR + "/data_w_genres.csv"

# Load each CSV file into its own DataFrame.
# (Optional fallback kept from the original notebook: pass encoding='latin-1'
# to pd.read_csv -- presumably for files that fail to decode with the default
# encoding; TODO confirm whether it is ever needed.)
df = pd.read_csv(dataFile)
df_byArtist = pd.read_csv(dataFileByArtist)
df_byGenres = pd.read_csv(dataFileByGenres)
df_byYear = pd.read_csv(dataFileByYear)
df_withGenre = pd.read_csv(dataFileWithGenre)

In [None]:
# Find the number of entries in the DataFrame (used as the total number of songs).
# Series.count() counts only the non-null 'loudness' values, so this equals the
# row count only when that column has no missing values.
totalSongs = df['loudness'].count()
totalSongs

In [None]:
# Find the mean loudness of the songs (sum of all observations / number of observations).
# totalSongs is the count of non-null 'loudness' values computed in the previous cell.
sumOfLoudness = df['loudness'].sum()
meanLoudness = sumOfLoudness / totalSongs
meanLoudness

In [None]:
# Find the mode, i.e. the value that appears most often
# Below it is applied to the 'year' column, i.e. the year with the most songs in the dataframe
def mode(sample):
    """Return the most frequent value in ``sample`` together with its count.

    Parameters
    ----------
    sample : iterable of hashable values
        The observations to tally (e.g. a list or a pandas Series).

    Returns
    -------
    tuple or None
        ``(value, count)`` for the most frequent value. When several values
        share the highest count, the one encountered first in ``sample`` is
        returned. ``None`` is returned for an empty ``sample``.
    """
    counts = Counter(sample)
    if not counts:
        return None
    # Determine the highest count once instead of on every loop iteration.
    highest = max(counts.values())
    # Counter preserves first-seen order, so ties resolve to the earliest value.
    for value, count in counts.items():
        if count == highest:
            return value, count

mode(df['year'])

In [None]:
# Display the full track DataFrame (pandas shows a truncated preview under its default display options)
df

In [None]:
# Getting the number of rows in the dataframe
len(df)

In [None]:
# Getting the number of columns in the dataframe
len(df.columns)

In [None]:
# Displaying the first 5 rows (head() defaults to n=5)
df.head()

In [None]:
# Displaying the last 5 rows (tail() defaults to n=5)
df.tail()

In [None]:
# Displaying the data type of each column
df.dtypes

In [None]:
# Getting summary statistics about the numerical data,
# transposed so that each column of df becomes one row of the summary
df.describe().T

In [None]:
# Summarise the object-dtype columns (count, unique, top, freq),
# transposed so that each column of df becomes one row of the summary
category_columns = df.select_dtypes(include=['object']).columns
df.loc[:, category_columns].describe().transpose()

In [None]:
# Another method for getting the number of rows and columns: shape is (rows, columns)
df.shape

In [None]:
# Print a concise summary of the dataframe: column names, non-null counts and dtypes
df.info()

In [None]:
# Count the missing values in every column
df.isna().sum()

In [None]:
# Some basic charts: one histogram per numeric column of the track-level data.
# DataFrame.hist creates its own figure, so no plt.figure() call is needed
# beforehand (it would only leave an empty extra figure in the output).
df.hist(figsize=(20, 20))
plt.show()

In [None]:
# Histograms of the numeric columns of the per-artist data
df_byArtist.hist(figsize=(20, 20))
plt.show()

In [None]:
# Histograms of the numeric columns of the per-genre data
df_byGenres.hist(figsize=(20, 20))
plt.show()

In [None]:
# Histograms of the numeric columns of the per-year data
df_byYear.hist(figsize=(20, 20))
plt.show()

In [None]:
# Histograms of the numeric columns of the data loaded from data_w_genres.csv
df_withGenre.hist(figsize=(20, 20))
plt.show()

In [None]:
# Distribution of liveness across the track-level data.
# NOTE(review): distplot is deprecated in newer seaborn releases; it is kept
# here because the notebook states it targets seaborn 0.9.0.
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df["liveness"], ax=ax)

In [None]:
# Distribution of liveness across the per-genre data
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_byGenres["liveness"], ax=ax)

In [None]:
# Distribution of liveness across the per-year data
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_byYear["liveness"], ax=ax)

In [None]:
# Top 20 track names by mean popularity (rows sharing a name are averaged together)
plt.figure(figsize=(16, 4))
sns.set(style="whitegrid")
x = df.groupby("name")["popularity"].mean().sort_values(ascending=False).head(20)
# Pass the data vectors by keyword: newer seaborn releases reject positional
# x/y arguments, while the keyword form works on old and new versions alike.
axis = sns.barplot(x=x.index, y=x)
axis.set_title('Top Tracks with Popularity')
axis.set_ylabel('Popularity')
axis.set_xlabel('Tracks')
# Rotate the tick labels so the long track names stay readable
plt.xticks(rotation = 90)

In [None]:
# Top 20 'artists' entries by summed popularity over all of their tracks
plt.figure(figsize=(16, 4))
sns.set(style="whitegrid")
x = df.groupby("artists")["popularity"].sum().sort_values(ascending=False).head(20)
# Pass the data vectors by keyword: newer seaborn releases reject positional
# x/y arguments, while the keyword form works on old and new versions alike.
ax = sns.barplot(x=x.index, y=x)
ax.set_title('Top Artists with Popularity')
ax.set_ylabel('Popularity')
ax.set_xlabel('Artists')
# Rotate the tick labels so the artist names stay readable
plt.xticks(rotation = 90)

In [None]:
# Distribution of track popularity, using 20 histogram bins
fig, ax = plt.subplots(figsize=(16, 8))
sns.distplot(df["popularity"], bins=20, ax=ax)

In [None]:
# Summary statistics of the 'duration_ms' column
df["duration_ms"].describe()

In [None]:
# Summary statistics of the 'popularity' column
df["popularity"].describe()

In [None]:
# Names of the tracks whose popularity is above 95
df.loc[df["popularity"] > 95, "name"]

In [None]:
# Correlation matrix of the numeric features, shown as a clustered heatmap.
# The numeric/bool columns are selected explicitly: from pandas 2.0 on,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# on them instead.
corr = df.select_dtypes(include=["number", "bool"]).corr()
sns.clustermap(corr, cmap="coolwarm")

In [None]:
# Top 10 track names by mean popularity (rows sharing a name are averaged together)
plt.figure(figsize=(16, 8))
sns.set(style="whitegrid")
x = df.groupby("name")["popularity"].mean().sort_values(ascending=False).head(10)
# Pass the data vectors by keyword: newer seaborn releases reject positional
# x/y arguments, while the keyword form works on old and new versions alike.
ax = sns.barplot(x=x.index, y=x)
ax.set_title('Top Tracks with Popularity')
ax.set_ylabel('Popularity')
ax.set_xlabel('Tracks')
# Rotate the tick labels so the long track names stay readable
plt.xticks(rotation = 90)

In [None]:
# Top 10 'artists' entries by summed popularity over all of their tracks
plt.figure(figsize=(16, 8))
sns.set(style="whitegrid")
x = df.groupby("artists")["popularity"].sum().sort_values(ascending=False).head(10)
# Pass the data vectors by keyword: newer seaborn releases reject positional
# x/y arguments, while the keyword form works on old and new versions alike.
ax = sns.barplot(x=x.index, y=x)
ax.set_title('Top Artists with Popularity')
ax.set_ylabel('Popularity')
ax.set_xlabel('Artists')
# Rotate the tick labels so the artist names stay readable
plt.xticks(rotation = 90)

In [None]:
# Popularity of genres with respect to the various features:
# pairwise scatter plots (and per-column distributions) of the selected columns.
# sns.pairplot builds its own figure, so no plt.figure() call is made first
# (it would only add an empty figure to the output and its figsize would be ignored).
sns.set(style="whitegrid")
cols = ["valence","popularity","acousticness","instrumentalness","speechiness","danceability" ]
sns.pairplot(df_byGenres[cols])
plt.show()