In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import plotly as py
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line in os.walk order.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Map the raw CSV headers to the camelCase names used in the rest of the notebook.
column_renames = {
    "Track.Name": "trackName",
    "Artist.Name": "artistName",
    "Beats.Per.Minute": "beatsPerMinute",
    "Genre": "genre",
    "Loudness..dB..": "loudness",
    "Valence.": "valence",
    "Length.": "length",
    "Acousticness..": "acousticness",
    "Speechiness.": "speechiness",
    "Popularity": "popularity",
    "Energy": "energy",
    "Liveness": "liveliness",
    "Danceability": "danceability",
}

# Read the chart (decoded as windows-1252), discard the "Unnamed: 0" column
# (presumably a saved row index -- confirm against the CSV) and apply the renames.
df = (
    pd.read_csv('/kaggle/input/top50spotify2019/top50.csv', encoding="windows-1252")
    .drop(columns=["Unnamed: 0"])
    .rename(columns=column_renames)
)

Let's view the first 5 rows of this dataset using the `head()` function.

In [None]:
# Preview the first five rows (five is also head()'s default).
df.head(n=5)

Let's get a general summary of the dataset by using info() and describe().

In [None]:
# Print pandas' concise summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics, transposed so each column of df becomes one row.
df.describe().transpose()

We see that there are 13 columns. 3 of these are objects and 10 are of the integer type. Let's get the dataset's shape.

In [None]:
# (number of rows, number of columns) of the frame.
df.shape

50 rows and 13 columns.

Let's count how many songs in the top 50 belong to each genre.

In [None]:
# Number of songs per genre, most frequent genre first.
df["genre"].value_counts()

Now let's see if there are any columns with missing info.

In [None]:
# Flag every missing cell, then total the flags down each column.
df.isna().sum(axis=0)

As we can see, we don't have any null values.

Let's get a Correlation Table.

In [None]:
# Pairwise correlation of the numeric columns only. df also holds text columns
# (track name, artist name, genre); pandas >= 2.0 raises a ValueError on those
# instead of silently dropping them, so they are filtered out explicitly.
df.select_dtypes(include="number").corr()

For better readability let's create a heatmap.

In [None]:
# Heatmap of the correlation table. Only numeric columns are correlated: df also
# holds text columns, which pandas >= 2.0 rejects with a ValueError in corr().
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.heatmap(df.select_dtypes(include="number").corr());

____________________________________________

Now let's create bar plots for different categories.

In [None]:
def createPlot(attribute):
    """Draw a bar chart of how often each distinct value of a column occurs.

    Parameters
    ----------
    attribute : str
        Name of a column in the notebook-level DataFrame ``df``; it is also
        used as the x-axis label.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    count = df[attribute].value_counts()

    plt.figure(figsize=(10,8))
    plt.bar(count.index, count)
    plt.xlabel(attribute)
    plt.ylabel("Frequency")
    plt.xticks(count.index, count.index.values, rotation = 60)
    # tight_layout() only fits what already exists when it is called, so it has
    # to run after the axis labels and the rotated tick labels are in place.
    plt.tight_layout()
    plt.show()

In [None]:
# Text columns whose value frequencies are shown as bar charts.
getPlotted = ["artistName", "genre"]
for column_name in getPlotted:
    createPlot(column_name)

Now let's plot histograms for the integer-valued columns.

In [None]:
def createHistogram(attribute, bins = 50):
    """Draw a histogram of one column of the notebook-level DataFrame ``df``.

    Parameters
    ----------
    attribute : str
        Name of the column to plot; it is also used as the x-axis label.
    bins : int or sequence, optional
        Passed straight to ``plt.hist``. Defaults to 50, the value that was
        previously hard-coded.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(6,4.8))
    plt.hist(df[attribute], bins = bins)
    plt.xlabel(attribute)
    plt.ylabel("Counts")
    plt.xticks(rotation = 90)
    # tight_layout() only fits what already exists when it is called, so it has
    # to run after the axis labels and the rotated tick labels are in place.
    plt.tight_layout()
    plt.show()

In [None]:
# Columns that get one histogram each, drawn in this order.
intData = [
    "beatsPerMinute",
    "energy",
    "danceability",
    "loudness",
    "liveliness",
    "valence",
    "length",
    "acousticness",
    "speechiness",
    "popularity",
]

for column_name in intData:
    createHistogram(column_name)

For better readability let's create some visualizations for our columns.

In [None]:
# Number of songs per genre as a seaborn count plot.
plt.figure(figsize=(12, 5))
ax = sns.countplot(data=df, x='genre')
# Rotate, right-align and shrink the genre tick labels in a single call.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right", fontsize=8)
plt.tight_layout()
plt.show()

Let's get the unique artist names.

In [None]:
# Distinct artist names, in order of first appearance.
df["artistName"].unique()

Now let's compare the artists by their popularity.

In [None]:
# Popularity per artist. For an artist with several songs in the chart,
# seaborn's barplot shows the mean popularity (its default estimator).
plt.figure(figsize=(12,5))
ax = sns.barplot(x='artistName', y='popularity', data=df)
# Rotate, right-align and shrink the artist tick labels in a single call.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right", fontsize=8)
plt.tight_layout()
plt.show()

Let's visualize the correlation between liveliness and popularity.

In [None]:
# Joint KDE of liveliness against popularity, with marginal densities.
# jointplot is a figure-level function that creates and lays out its own figure,
# so no tight_layout() call is made here; calling it after plt.show() would only
# act on a new, empty figure.
sns.jointplot(x = "liveliness", y = "popularity",
              kind = "kde", data = df)
plt.show()

As we can see, the less "lively" a song is, the more "popular" it tends to be. A similar observation can be made with a regression plot (a scatter plot with a fitted line).

In [None]:
# Scatter plot of popularity against liveliness with seaborn's fitted regression line.
regression_grid = sns.lmplot(data=df, x="liveliness", y="popularity")
plt.show()

----

Let's get the mean, standard deviation and max values of some columns.

In [None]:
# Average song length across the chart (units as stored in the CSV -- presumably seconds; confirm).
df["length"].mean()

In [None]:
# Highest tempo (beats per minute) of any song in the chart.
df["beatsPerMinute"].max()

In [None]:
# Spread of the popularity scores (pandas' default sample standard deviation, ddof=1).
df["popularity"].std()

Finally let's create a Pie Chart of the Top 50 based on genres.

In [None]:
# Share of the Top 50 held by each genre.
pie = df.genre.value_counts()
labels = pie.index
sizes = pie.values

# One distinct colour per genre. A short fixed colour list is cycled by
# matplotlib, so unrelated (and even neighbouring) wedges end up sharing a colour.
colors = sns.color_palette("husl", len(pie))

# No wedge is pulled out of the pie. The list is sized from the data because
# plt.pie raises a ValueError when explode and the values differ in length.
explode = [0] * len(pie)

plt.figure(figsize = (10,10))
plt.pie(sizes, explode = explode, labels = labels, colors = colors, autopct = '%1.1f%%')
plt.show()