In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Library for mathematical computation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Library for modelling
from sklearn.preprocessing import LabelEncoder #Encodes Categorical Data into Numerical
from sklearn.model_selection import train_test_split #For spliting data into training and testing sets
from sklearn.linear_model import LinearRegression #Linear Regression model
from sklearn.linear_model import Lasso #Lasso regression model
from sklearn.linear_model import Ridge #Ridge Regression Model
from sklearn.ensemble import RandomForestRegressor#Random Forest Regression Model


#Library for Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw Spotify Top 200 charts CSV and preview its first ten rows
dataset = pd.read_csv(
    "/kaggle/input/spotify-top-200-charts-20202021/spotify_dataset.csv"
)
dataset.head(n=10)

In [None]:
# Summarise the raw dataset: column names, dtypes and non-null counts.
# Useful for spotting columns that need a dtype change or are mostly empty.
dataset.info()

# Cleaning Data

As the dataset is small, Microsoft Excel was used to clean the data.
The following steps were taken to clean the dataset:
1. Changing the datatypes of data columns
2. Removing columns with more than 80% null values
3. Removing data columns that are irrelevant to the analysis


In [None]:
# Load the Excel-cleaned version of the dataset and preview the first rows
cleaned_data = pd.read_csv(
    "../input/spotify-cleaned-data/Spotify_Dataset_Cleaned.csv"
)
cleaned_data.head(n=5)

In [None]:
# Summarise the cleaned dataset: column names, dtypes and non-null counts,
# so the result of the cleaning step can be checked.
cleaned_data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the cleaned data's numeric columns
cleaned_data.describe()

# Data Visualisation

The visualizations include:
* Top 10 Streamed Genres
* Top 10 Most Followed Artists
* Top 10 Highest Charting Songs
* Popularity based on Loudness

In [None]:
# Tracks ranked by stream count (name kept so any other cell reading sort_streams still works)
sort_streams = cleaned_data.sort_values(by=['Streams'], ascending=False)

# Aggregate to one row per genre so each bar shows that genre's TOTAL streams.
# Plotting sort_streams.head(10) directly only shows the genres of the ten
# most-streamed tracks (repeated genres get averaged by seaborn), which is not
# the "top 10 genres" the title promises.
# NOTE(review): assumes 'Streams' is numeric in the cleaned CSV — confirm.
top_genres = (
    cleaned_data
    .groupby('Genre', as_index=False)['Streams']
    .sum()
    .nlargest(10, 'Streams')
)

# Bar chart of the ten genres with the highest total streams
plt.figure(figsize=(20, 8))
plt.title("Top 10 Streamed Genres")
sns.barplot(x='Genre', y='Streams', data=top_genres)

In [None]:
# Rank rows by the artist's follower count, highest first
sort_followers = cleaned_data.sort_values(by=['Artist Followers'], ascending=False)

# An artist appears once per charting track, so keep only the first (highest)
# row per artist before taking the top ten; otherwise one artist could fill
# several of the ten bars.
top_artists = sort_followers.drop_duplicates(subset='Artist').head(10)

# Bar chart of the ten most-followed artists.
# The data must come from the follower ranking computed in this cell, not from
# the stream ranking built by the previous cell.
plt.figure(figsize=(20, 8))
plt.title("Top 10 Most Followed Artists")
sns.barplot(x='Artist', y='Artist Followers', data=top_artists)

In [None]:
# Rank songs by their best (lowest-numbered) chart position. Many songs share
# the same peak position, so ties are broken by how many times the song
# charted (most first); this makes the top ten meaningful rather than arbitrary.
sort_chart = cleaned_data.sort_values(
    by=['Highest Charting Position', 'Number of Times Charted'],
    ascending=[True, False],
)

# Horizontal bar chart: the ten highest-charting songs and how often each charted
plt.figure(figsize=(20, 8))
plt.title("Top 10 Highest Charted Songs")
sns.barplot(x='Number of Times Charted', y='Song Name', data=sort_chart.head(10))

In [None]:
# Scatter each track's Popularity (x-axis) against its Loudness (y-axis)
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Popularity based on Loudness")
sns.scatterplot(data=cleaned_data, x='Popularity', y='Loudness', ax=ax)

# Correlations

In [None]:
# Pairwise Pearson correlation between the numeric features.
# The frame still contains text columns at this point (Artist, Song Name,
# Genre, Release Date). Since pandas 2.0, DataFrame.corr() no longer drops
# non-numeric columns silently and raises instead, so select the numeric
# (and boolean) columns explicitly. This works on old and new pandas alike.
numeric_features = cleaned_data.select_dtypes(include=['number', 'bool'])
corr = numeric_features.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(12, 12))
sns.heatmap(corr, annot=True, cmap="GnBu")

# Prediction on Streams

**Preparation for the Modelling**

First, we drop the Artist, Song Name and Release Date columns, as they are not relevant for prediction.

In [None]:
# Drop the identifier-like columns that are not used as model features.
# errors="ignore" makes the cell safe to re-run: cleaned_data is reassigned to
# its own result, so on a second execution the columns are already gone and a
# plain drop would raise KeyError.
cleaned_data = cleaned_data.drop(
    columns=["Artist", "Song Name", "Release Date"],
    errors="ignore",
)
cleaned_data.head()

Now we must encode Genre as numerical values, because the models cannot be trained on string values.

In [None]:
# Replace each genre label with an integer code.
# The fitted encoder is kept in genre_encoder.
genre_encoder = LabelEncoder()
genre_codes = genre_encoder.fit_transform(cleaned_data['Genre'])
cleaned_data['Genre'] = genre_codes
cleaned_data.head(n=5)

**Splitting the data into Training and Testing Sets**

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(
    cleaned_data,
    test_size=0.2,
    random_state=25,
)
training_data, testing_data = split_parts

# Report how many rows ended up in each partition
print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

# Show the held-out rows
testing_data

In [None]:
# Set the true stream counts of the test rows aside, then strip that column so
# testing_data holds features only
testing_data_stream = testing_data.loc[:, 'Streams']
testing_data = testing_data.drop(columns=['Streams'])
testing_data.head(n=5)

In [None]:
# Target vector: the stream counts of the training rows
y_train = training_data['Streams']
# Feature matrix: every remaining column
x_train = training_data.drop(columns='Streams')

# Sanity check: shapes of the training features, training target and test features
(x_train.shape, y_train.shape, testing_data.shape)

# Training and Testing Models

**Linear Regression**

In [None]:
# Fit an ordinary least-squares model on the training features
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

# R^2 on the training rows only says how well the model fits data it has
# already seen; the held-out R^2 is the number to compare models by.
print(f"Linear Regression training R^2: {linear_model.score(x_train, y_train)}")
print(f"Linear Regression test R^2: {linear_model.score(testing_data, testing_data_stream)}")

# Predict streams for the held-out rows and show them next to the true values
linear_predict = linear_model.predict(testing_data)
l_output = pd.DataFrame({
    'Id': testing_data.index,
    'Predicted Streams': linear_predict,
    'Actual Stream': testing_data_stream,
})
print(l_output)

In [None]:
# Random Forest Regression
# A random forest is stochastic (bootstrap samples, random feature subsets),
# so random_state is fixed to make the scores and predictions reproducible
# across runs. 25 matches the seed used for the train/test split.
r_model = RandomForestRegressor(n_estimators=50, random_state=25)
r_model.fit(x_train, y_train)

# Training R^2 is typically very high for a forest because it can memorise the
# training rows; the held-out R^2 is the number to compare models by.
print(f"Random Forest training R^2: {r_model.score(x_train, y_train)}")
print(f"Random Forest test R^2: {r_model.score(testing_data, testing_data_stream)}")

# Predict streams for the held-out rows and show them next to the true values
r_predict = r_model.predict(testing_data)
r_output = pd.DataFrame({
    'Id': testing_data.index,
    'Predicted Streams': r_predict,
    'Actual Stream': testing_data_stream,
})
print(r_output)

In [None]:
# Fit a Ridge (L2-regularised linear) regression with regularisation strength alpha=1.0
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(x_train, y_train)

# R^2 on the training rows only says how well the model fits data it has
# already seen; the held-out R^2 is the number to compare models by.
print(f"Ridge training R^2: {ridge_model.score(x_train, y_train)}")
print(f"Ridge test R^2: {ridge_model.score(testing_data, testing_data_stream)}")

# Predict streams for the held-out rows
ridge_predict = ridge_model.predict(testing_data)

# Table of row id, predicted streams and actual streams for the test set
ridge_output = pd.DataFrame({
    'Id': testing_data.index,
    'Predicted Streams': ridge_predict,
    'Actual Stream': testing_data_stream,
})
print(ridge_output)

So, this is the stream prediction based on features such as artist followers, genre and others, using three different regression models. Among these models, Random Forest Regression appears to give predictions closest to the actual streams.

Thank you for visiting my notebook. Any suggestions will be appreciated.