In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Analysis of Spotify Top-2000 songs

### Content:
* 1.Introduction
* 2.Description of project
* 3.Research questions
* 4.Data preparation:cleaning and shaping  
* 5.Data visualization

## 1. Introduction
Over the years, music listening has grown tremendously, and today one of the most popular music services is the streaming service Spotify, with nearly 300 million users worldwide. The service contains a database of 60 million tracks of all genres. Songs that achieve great success with audiences stay in memory for a long time. Their influence on people can be so great that they leave their mark on history. Songs can sometimes show us the development of entire generations. So, the main goal of this project is to research and demonstrate the power of music.

## 2. Description of project
The popularity factor is central to this research.
The analysis is based on a ready-made dataset taken from kaggle.com:
(https://www.kaggle.com/iamsumat/spotify-top-2000s-mega-dataset).
The dataset contains audio statistics of the top 2000 tracks on Spotify. It has about 15 columns, each describing a track and its qualities. The chosen tracks were released between 1956 and 2019 and include songs from notable and famous artists such as Queen, The Beatles, Guns N' Roses, etc.
This data was extracted from the Spotify playlist "Top 2000s" on PlaylistMachinery (@plamere) using Selenium with Python. More specifically, it was scraped from http://sortyourmusic.playlistmachinery.com/. The data contains audio features such as Danceability, BPM, Liveness, Valence (Positivity) and many more.
Each feature is described in detail below.
* Index is ID.
* Title- name of the track.
* Artist- name of the artist.
* Top Genre is the genre of track.
* Year is the year that track was released.
* Beats per Minute(BPM) - The tempo of the song.
* Energy - The energy of a song; the higher the value, the more energetic the song is.
* Danceability - The higher the value, the easier it is to dance to the song.
* Loudness - The higher the value, the louder the song is.
* Liveness - The higher the value, the more likely the song is a live recording.
* Valence - The higher the value, the more positive the mood of the song is.
* Length - duration of the song.
* Acoustic - The higher the value the more acoustic the song is.
* Speechiness- The higher the value the more spoken words the song contains.
* Popularity - The higher the value the more popular the song is.


##  3.Formulation of research questions
#### For a detailed and deep analysis, we need to answer the following questions:
1. Who are the best-known artists of all time?
2. Which genres are the most listened to of all time?
3. In which years did people listen to the most danceable and energetic songs?
4. Were acoustic songs more popular in the 1960s than they are today?
5. Which words contained in songs are more popular?

## 4.Data preparation: cleaning and shaping
To work with the data effectively, we first need to bring it into the right format:
* find all the missing values
* rename some columns
* change the values in the column 'Genre'
* convert the column Length (Duration) from object to int type
* convert all the negative values in the column 'Loudness' to positive ones
* group the data by columns

In [None]:
# importing useful packages
import numpy as np
import pandas as pd
import csv
import random
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Spotify Top-2000 CSV into a DataFrame.
# No index_col is passed, so the file's "Index" column is loaded as a regular column
# and the frame keeps the default 0..n-1 row index.
# The encoding uses its canonical name "utf-8"; the previous "utf - 8" spelling only
# worked because Python normalises codec names.
top = pd.read_csv('/kaggle/input/spotify-top-2000s-mega-dataset/Spotify-2000.csv', delimiter=",",
                  encoding="utf-8", doublequote=True, engine="python")
top.head(5)

In [None]:
#show size of table- in rows and columns
top.shape

In [None]:
#showing all info about columns
top.info()

In [None]:
#counting types of columns
top.dtypes.value_counts()

In [None]:
# get all the statistical derscription of numerical columns
top.describe()

In [None]:
#finding missing values on each column
top.isnull().sum()

In [None]:
#finding duplicated records in dataset
top.duplicated().sum()

In [None]:
# Create top1 as a renamed copy of the raw frame: 'Length (Duration)' becomes
# 'Duration (sec)'. rename() returns a new DataFrame, so `top` itself is left unchanged.
top1 = top.rename(columns={"Length (Duration)": "Duration (sec)"})
top1.head()

In [None]:
#so , because we find no missing values, next we are going to modificate our data.
#in this step, we will change type and values in column "Length(Duration)"
top1['Duration (sec)'].dtype

In [None]:
# Convert 'Duration (sec)' from text such as "1,412" to a numeric column.
# The thousands separator is removed from this column only: a frame-wide
# replace(',', '') would also strip commas out of titles and artist names.
# astype(str) keeps the cell safe to re-run once the column is already numeric.
duration_text = top1['Duration (sec)'].astype(str).str.replace(',', '', regex=False)
# errors='coerce' turns anything that still is not a number into NaN instead of raising.
top1['Duration (sec)'] = pd.to_numeric(duration_text, errors='coerce')
top1['Duration (sec)'].dtype

In [None]:
#show the results
top1['Duration (sec)']

In [None]:
#describing the statistical part of column
top1['Duration (sec)'].describe()

In [None]:
# Shorten the column name 'Top Genre' to 'Genre'.
# inplace=True modifies top1 directly instead of returning a new frame.
top1.rename(columns = {'Top Genre' : 'Genre'}, inplace = True)
top1.head()

In [None]:
# First step of merging similar genres into more general ones:
# strip surrounding whitespace and convert every genre label to lower case.
# The normalised labels are stored in the separate Series `genre`;
# top1 is not modified in this cell, so the preview below still shows the original labels.
genre = (top1["Genre"].str.strip()).str.lower()
top1.head(3)

In [None]:
# function to split the genre column
def genre_splitter(genre):
    """Drop the first space-separated word from every multi-word genre label.

    Labels without a space are returned unchanged, e.g. "dutch pop" -> "pop",
    "classic uk pop" -> "uk pop", "rock" -> "rock". Calling the function
    repeatedly therefore reduces each label to its last word.

    Parameters
    ----------
    genre : pandas.Series of str
        Genre labels. The input Series is not modified.

    Returns
    -------
    pandas.Series
        New Series with the same index and name as ``genre``.
    """
    # n is passed by keyword: pandas 2.x rejects it as a positional argument.
    # .str[-1] picks the remainder after the first space (or the whole label when
    # there is no space) without relying on the Series having a 0..n-1 index.
    return genre.str.split(" ", n=1).str[-1]
#loop until the genre cannot be split any further
# Work on a copy so the normalised labels in `genre` stay available.
genre_m1 = genre.copy()
# Keep stripping the leading word while any label still contains a space.
# A plain substring test replaces str.split(" ", 1), whose positional `n`
# argument is rejected by pandas 2.x.
while genre_m1.str.contains(" ", regex=False, na=False).any():
    genre_m1 = genre_splitter(genre_m1)
# Number of distinct genre labels left after the reduction.
len(genre_m1.unique())

In [None]:
genre_m1.value_counts()

In [None]:
# Overwrite the 'Genre' column with the reduced labels held in genre_m1
# (values are matched to rows through the shared row index).
top1['Genre'] = genre_m1
top1.head(5)

In [None]:
# Replace every value in 'Loudness (dB)' with its absolute value using abs().
# NOTE(review): presumably the raw values are negative dB readings; if so, after this
# step a LARGER number means a QUIETER track, which inverts the ordering and flips the
# sign of any correlation involving loudness — confirm this is intended.
top1['Loudness (dB)'] = top1['Loudness (dB)'].abs()
top1.head(5)

In [None]:
#next, we'll define how many artists participate in our research and how many songs of these artists are in this dataset
top1['Artist'].value_counts()

In [None]:
#sorting songs from most to less popular
top1.sort_values(['Popularity'], ascending = False)

In [None]:
#here we define highest value of popularity by each genre
top1.groupby(['Genre']).aggregate({'Popularity' : 'max'}).sort_values(['Popularity'], ascending = False)

## 5.Data visualisation

In [None]:
# Before the individual research questions, draw the correlation matrix of all
# numeric features as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
# Restrict to numeric columns explicitly: on pandas >= 2.0 DataFrame.corr() raises
# when the frame still contains string columns such as Title or Artist.
corrMatrix = top1.select_dtypes(include='number').corr()
sns.heatmap(corrMatrix, annot=True, ax=ax)

### 5.1 Q1: Analyze the most known Artists in the world of all times
* count amount of songs of each artist
* define artists with high marks of popularity in general term
* draw plots

In [None]:
#find most occuring artists
print(top1[['Artist', 'Popularity']].groupby('Artist')['Popularity'].count().nlargest(5).reset_index(name='top5 most occurring'))

In [None]:
#draw barplot for result
# Number of songs per artist, keeping the five artists with the most songs.
# The Series stays indexed by artist name so the bars are labelled with names
# rather than the positions 0-4.
top5_by_count = top1.groupby('Artist')['Popularity'].count().nlargest(5)
# Create the axes explicitly and pass them to pandas; otherwise .plot() opens its
# own figure and the one created here is rendered empty.
fig, ax = plt.subplots(figsize=(15, 7))
top5_by_count.plot(kind='bar', ax=ax)
ax.set_xlabel('Artist', fontsize=20)
ax.set_ylabel('Count of songs', fontsize=20)
ax.set_title('Amount of songs of Artists', fontsize=30)

In [None]:
#find top5 popular songs
print(top1[['Artist', 'Popularity']].groupby('Artist')['Popularity'].max().nlargest(5).reset_index(name='top5 most popular'))

In [None]:
#draw barplot for the result
# Highest popularity score per artist, keeping the five best-scoring artists.
# The Series stays indexed by artist name so the bars are labelled with names
# rather than the positions 0-4.
top5_by_popularity = top1.groupby('Artist')['Popularity'].max().nlargest(5)
# Explicit axes so labels and title are guaranteed to land on this plot.
fig, ax = plt.subplots(figsize=(15, 7))
top5_by_popularity.plot(kind='bar', ax=ax)
ax.set_xlabel('Artist', fontsize=20)
ax.set_ylabel('Popularity of song', fontsize=20)
ax.set_title('Popular song of Artists', fontsize=30)

So, we find that the artists with the most popular songs are Tones and I, Billie Eilish, Imagine Dragons, Maroon 5 and Mariah Carey.
The most frequently occurring artists are the groups Queen, The Beatles, Coldplay, U2 and The Rolling Stones.

### 5.2 Q2 : Analyze the most listened genres of all times
* count amount of songs in each genre
* get genres with high marks of popularity each year  
* plot the given result

In [None]:
#Calculating the number of songs of each genre
print(type(top1['Genre']))
popular_genre=top1.groupby('Genre').size()
popular_genre.sort_values(ascending = False)

In [None]:
#draw a pieplot for result
# Count the songs per genre once and reuse the result for labels and wedge sizes.
genre_counts = top1.Genre.value_counts()
labels = genre_counts.index
sizes = genre_counts.values
colors = ['red', 'yellowgreen', 'coral', 'blue','cyan', 'green', 'black','yellow']
# Format string for the percentage printed on each wedge.
autopct = '%1.1f%%'
plt.figure(figsize = (10,10))
# autopct must be passed to plt.pie; as a stand-alone assignment it had no effect
# and no percentages were drawn.
plt.pie(sizes, labels=labels, colors=colors, autopct=autopct)
# Equal aspect ratio keeps the pie circular.
plt.axis('equal')
plt.show()

In [None]:
#find genres with highest value of popularity
top1.groupby(['Genre']).aggregate({'Popularity' : 'max'}).sort_values('Popularity',ascending = False).head(5)

In [None]:
#draw a lineplot for result
# Highest popularity score reached within each genre, plotted genre by genre.
fig, ax = plt.subplots(figsize=(15,7))
genre_peak_popularity = top1.groupby(['Genre'])['Popularity'].max()
genre_peak_popularity.plot(ax = ax)
ax.set_xlabel('genre', fontsize = 20)
ax.set_ylabel('Popularity', fontsize = 20)
ax.set_title('Popularity of genre', fontsize = 30)

We found that the most popular genres are pop, rock, electropop, standards, metal, room and mellow.

### 5.3 Q3: Define in which time period people listened to more energetic songs
*  to find this, we examine the data in the columns 'Beats Per Minute (BPM)', 'Energy' and 'Danceability'
*  draw the matching plots


In [None]:
#get data with highest bpm value
top1.loc[: ,'Title':'Beats Per Minute (BPM)'].sort_values('Beats Per Minute (BPM)', ascending = False).head(5)

In [None]:
#draw a distribution plot
# Histogram of tempo in 15 bins with outlined bars.
plt.figure(figsize=(8,4))
# sns.histplot replaces sns.distplot, which is deprecated and removed in recent
# seaborn releases. alpha=0.4 reproduces distplot's default bar transparency.
sns.histplot(top1['Beats Per Minute (BPM)'], bins=15, color='red', alpha=0.4,
             edgecolor="k", linewidth=1)
plt.show()

In [None]:
#get data with highest energy value
top1.loc[: ,'Title':'Energy'].sort_values('Energy', ascending = False).head(5)

In [None]:
#draw a distribution plot
# Histogram of the energy score in 15 bins with outlined bars.
plt.figure(figsize=(8,4))
# sns.histplot replaces sns.distplot, which is deprecated and removed in recent
# seaborn releases. alpha=0.4 reproduces distplot's default bar transparency.
sns.histplot(top1['Energy'], bins=15, color='aqua', alpha=0.4,
             edgecolor="black", linewidth=1)
plt.show()

In [None]:
top1.loc[: ,'Title':'Danceability'].sort_values('Danceability', ascending = False).head(5)

In [None]:
#draw a distribution plot
# Histogram of the danceability score in 15 bins with outlined bars.
plt.figure(figsize=(8,4))
# sns.histplot replaces sns.distplot, which is deprecated and removed in recent
# seaborn releases. alpha=0.4 reproduces distplot's default bar transparency.
sns.histplot(top1['Danceability'], bins=15, color='darkorchid', alpha=0.4,
             edgecolor="black", linewidth=1)
plt.show()

In [None]:
# Analysing the relationship between energy and year
# One line per audio feature, all drawn on the same axes against release year;
# each line is labelled with its column name.
plt.figure(figsize=(14,7))
for feature in ('Energy', 'Danceability', 'Beats Per Minute (BPM)'):
    regplot = sns.lineplot(x="Year", y=feature, data=top1, label=feature, marker='o')
regplot.set_title("Relationship between Years and Energy, Danceability and BPM")


We found that:
* more energetic songs were listened to in the period 2000-2010
* more danceable songs were listened to in the period 1970-1980
* songs with a higher tempo (BPM) were listened to in the period 1960-1970

### 5.4 Q4: Find whether acoustic songs were more popular in the 1960s than they are today
*  to find this, we examine the data in the column 'Acousticness'
*  draw the matching plots


In [None]:
top1[['Title','Acousticness','Year']].sort_values('Acousticness').head(5)

In [None]:
#draw a distribution plot
# Density-normalised histogram of acousticness with a KDE curve on top.
# sns.histplot replaces sns.distplot, which is deprecated and removed in recent
# seaborn releases; the extra arguments follow seaborn's migration recipe so the
# figure looks like distplot's default output.
sns.histplot(top1['Acousticness'], kde=True, stat="density", kde_kws=dict(cut=3),
             color="skyblue", alpha=0.4, edgecolor=(1, 1, 1, 0.4))

In [None]:
#jointplot for analyzing acousticness
# Joint KDE of release year versus acousticness.
# jointplot builds its own (square) figure, so a preceding plt.figure() only produces
# an extra empty figure; the size is controlled with `height` instead.
sns.jointplot(x='Year', y='Acousticness', data=top1, kind="kde", height=9)

Here we can see that songs were less acoustic in the 2010s than in the 1960s.

## Conclusion
In this data analysis, we studied the top 2000 songs of the Spotify music service.
We conclude that:
* first, we found the most popular artists and their most popular songs
* second, we found the most popular genres of music
* third, we found trends over time for several track characteristics: energy, acousticness and danceability

Our analysis produced various results that can support more advanced research into music trends and their indicators, in order to improve and optimize services such as Spotify.