In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Spotify song-performance CSV from the Kaggle input directory into `df`.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/spotify-song-performance-dataset/spotify_data.csv"
)

# A LOOK AT THE DATA 

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

## Check for null values

In [None]:
# Count the missing values in each column.
df.isna().sum()

#### The `Daily` column has 2 null values, so we can drop those records.

In [None]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

### Final record count after null removal

In [None]:
# (rows, columns) of the frame after the rows containing nulls were dropped.
df.shape

### DATA PREPROCESSING

Let us first split the combined `Songs & Artist` column into separate artist and song columns.

In [None]:
def split_artist_song(label):
    """Split a combined "Artist - Song" label into an (artist, song) pair.

    The label is split once, on the first " - " (hyphen padded with spaces), so
    hyphens inside an artist name or a song title are preserved. If no padded
    hyphen is present, the first bare "-" is used instead. Both parts are
    stripped of surrounding whitespace. A label without any hyphen yields the
    whole label as the artist and an empty song instead of raising IndexError.

    NOTE(review): assumes the column is formatted as "Artist - Song" — confirm
    against the dataset.
    """
    artist, separator, song = label.partition(" - ")
    if not separator:
        # No padded separator found: fall back to the first plain hyphen.
        artist, separator, song = label.partition("-")
    return artist.strip(), song.strip()


# Build both new columns from one pass over the combined column, then drop it.
artist_song_pairs = [split_artist_song(label) for label in df['Songs & Artist']]
df['Artist'] = [artist for artist, _ in artist_song_pairs]
df['Song'] = [song for _, song in artist_song_pairs]
df.drop(columns=['Songs & Artist'], inplace=True)

In [None]:
# Keep the four analysis columns, ordered song first.
column_order = ['Song', 'Artist', 'Streams', 'Daily']
df = df[column_order]

In [None]:
# Preview the first five rows after the artist/song split and column reorder.
df.head(n=5)

## Exploratory Data Analysis

### Let us find the top Artists and Songs

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mean value of Streams per artist, sorted from highest to lowest.
df2 = (
    df.groupby("Artist")
    .agg({"Streams": "mean"})
    .reset_index()
    .sort_values(by="Streams", ascending=False)
)

In [None]:
# Average the Streams column per artist, then rank artists from highest to lowest mean.
mean_streams_by_artist = df.groupby("Artist").agg({"Streams": "mean"}).reset_index()
df2 = mean_streams_by_artist.sort_values(by="Streams", ascending=False)

# Bar chart of the ten artists with the highest mean Streams value.
plt.figure(figsize=(15, 5))
plt.title("TOP 10 ARTISTS BASED ON STREAMING COUNTS")
sns.barplot(data=df2.head(10), x='Artist', y='Streams')

### TOP ARTISTS BASED ON THEIR DAILY STREAMING ACTIVITY

In [None]:
# Average the Daily column per artist, then rank artists from highest to lowest mean.
mean_daily_by_artist = df.groupby("Artist").agg({"Daily": "mean"}).reset_index()
df2 = mean_daily_by_artist.sort_values(by="Daily", ascending=False)

# Bar chart of the ten artists with the highest mean Daily value.
plt.figure(figsize=(15, 5))
plt.title("TOP 10 ARTISTS BASED ON DAILY COUNTS")
sns.barplot(data=df2.head(10), x='Artist', y='Daily')

## Let's check the relationship between overall streaming and daily streaming

In [None]:
# Correlation matrix of the Daily and Streams columns (Pearson, the pandas default).
df[['Daily', 'Streams']].corr(method='pearson')

In [None]:
# One point per artist: mean Streams on the x-axis against mean Daily on the y-axis.
artist_means = (
    df.groupby("Artist")
    .agg({'Streams': 'mean', 'Daily': 'mean'})
    .reset_index()
)
sns.scatterplot(data=artist_means, x='Streams', y='Daily', markers=True)

## INFERENCE:

Daily streaming activity is linearly correlated with overall streaming activity, though there are a few outliers.

## DATA ENCODING AND TEST, TRAIN SPLITS

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
import category_encoders as ce



Train, Test Splits

In [None]:
# We use a binary encoder for the Artist column because it has many unique values (more than 900).

In [None]:
# Binary-encode the Artist column and append the encoded columns to the original frame.
# NOTE(review): the encoder is fitted on the full dataset, before the train/test split.
artist_frame = df[['Artist']]
binary_t = ce.BinaryEncoder(cols=['Artist'])
artists_transformed = binary_t.fit_transform(artist_frame)

# Column-wise concat aligns on the shared row index.
df2 = pd.concat([df, artists_transformed], axis='columns')


## Data with the binary-encoded Artist columns

In [None]:
# Display the frame that now holds the original columns plus the binary-encoded Artist columns.
df2

In [None]:
# Remove the raw text columns in place; Artist is now represented by its encoded columns.
df2.drop(['Artist', 'Song'], axis='columns', inplace=True)

In [None]:
# Display the frame after the Artist and Song text columns were dropped.
df2

In [None]:
from sklearn.model_selection import train_test_split
# Target is the Daily column; every other remaining column is a feature.
y = df2['Daily']
X = df2.drop('Daily', axis=1)

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Standardise only the "Streams" feature; all other feature columns (the
# binary-encoded Artist columns) are passed through unchanged.
# The transformer is fitted on the training rows only and reused for the test rows.
# It is bound to `feature_scaler` as well as `scaler`, because a later cell rebinds
# `scaler` to the target's StandardScaler, which would otherwise discard the only
# reference to the fitted feature transformer.
feature_scaler = ColumnTransformer(
    [("scaler", StandardScaler(), ["Streams"])],
    remainder='passthrough',
)
scaler = feature_scaler  # original name kept so existing references still work
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

In [None]:
# Standardise the target using statistics learned from the training rows only.
# NOTE: this rebinds `scaler`, which the previous cell used for the feature ColumnTransformer.
scaler = StandardScaler()

# The Series values are reshaped into a single column before being passed to the scaler.
train_target_column = y_train.values.reshape(-1, 1)
test_target_column = y_test.values.reshape(-1, 1)

Y_train_transformed = scaler.fit_transform(train_target_column)
Y_test_transformed = scaler.transform(test_target_column)