# In[ ]:
#!/usr/bin/env python
# coding: utf-8

# # TIME SERIES ANALYSIS

# 
# 

# In[ ]:


# Import the libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
#import plotly.express as px
from sklearn import linear_model  # will be using for plotting trend line
from sklearn.preprocessing import MinMaxScaler # for normalizing data
from sklearn.cluster import KMeans
from tqdm.autonotebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
get_ipython().run_line_magic('matplotlib', 'inline')


# In[ ]:


# Import the data
import os

# Directory that holds the project's CSV files. The default is the original
# hard-coded location; set the TRACKS_DATA_DIR environment variable to run the
# script on another machine without editing the paths below.
DATA_DIR = os.environ.get(
    "TRACKS_DATA_DIR",
    "C:/Users/ANUSHKA/Desktop/Technocolabs Mini Project",
)

df = pd.read_csv(os.path.join(DATA_DIR, "data.csv"))
df_genre = pd.read_csv(os.path.join(DATA_DIR, "data_by_genres.csv"))

df_year = pd.read_csv(os.path.join(DATA_DIR, "data_by_year.csv"))
df_genre2 = pd.read_csv(os.path.join(DATA_DIR, "data_w_genres.csv"))
# View the shape and column names of the main table
print(df.shape)
df.columns


# In[ ]:


# Count the missing entries in every column
df.isna().sum()


# In[ ]:


# Drop the columns considered unnecessary (modifies df in place)
unneeded_columns = ["id", "key", "mode", "explicit", "release_date"]
df.drop(columns=unneeded_columns, inplace=True)
df.head()


# In[ ]:


# Pairwise correlation between the selected feature columns, drawn as an
# annotated heatmap
heatmap_columns = [
    "acousticness", "danceability", "energy", "instrumentalness",
    "liveness", "tempo", "valence", "loudness", "speechiness",
]
corr = df[heatmap_columns].corr()

plt.figure(figsize=(10, 10))
sns.heatmap(corr, annot=True)


# Song Trends

# In[ ]:


# Mean of each feature per year, one row per year in ascending year order
trend_columns = [
    "acousticness", "danceability", "energy", "instrumentalness",
    "liveness", "tempo", "valence", "loudness", "speechiness", "year",
]
year_avg = (
    df[trend_columns]
    .groupby("year")
    .mean()
    .sort_values(by="year")
    .reset_index()
)

year_avg.head()


# In[ ]:


# Plot the yearly mean of each feature as one line per feature
plt.figure(figsize=(14,8))
plt.title("Song Trends Over Time", fontdict={"fontsize": 15})

lines = ["acousticness","danceability","energy", 
         "instrumentalness", "liveness", "valence", "speechiness"]

for line in lines:
    # Attach the label to the line itself so the legend cannot drift out of
    # sync with the plotted artists (order-based `plt.legend(labels)` is
    # discouraged by matplotlib for that reason).
    ax = sns.lineplot(x='year', y=line, data=year_avg, label=line)


plt.ylabel("value")
plt.legend()


# # DIFFERENTIATE GENRES

# In[ ]:


# Build the clustering input: every float64/int64 column of df except the
# last one, each rescaled on its own with MinMaxScaler.
song_features = pd.DataFrame()
# normalizer instance
scaler = MinMaxScaler()
for col in df.columns[:-1]:      # last column skipped (original note: the int64 year column)
    if df[col].dtypes in ['float64', 'int64']:
        # Fit and transform on the same single-column frame. Fitting on a
        # DataFrame and then transforming a bare ndarray (as before) makes
        # scikit-learn warn about missing feature names.
        song_features[col] = scaler.fit_transform(df[[col]]).ravel()


# In[ ]:


# Elbow method: fit KMeans for a range of K values and record the SSE
# (sum of squared errors, `inertia_`) of each fit, to judge how many
# clusters ("genres") the songs can reasonably be split into.

# Limited computing power rules out fitting on the whole data, so a random
# subset of at most 1000 rows is used. It is drawn ONCE: scoring every K on
# the same rows keeps the SSE values comparable. Re-sampling inside the loop
# would mix sampling noise into the curve.
elbow_sample = song_features.sample(min(1000, len(song_features)))

# K cannot exceed the number of sampled rows
k_rng = range(1, min(200, len(elbow_sample) + 1))  # k value
sse = [] # sse value for each k
for i in k_rng:
    km = KMeans(n_clusters = i)
    km.fit(elbow_sample)
    # calculating sse
    sse.append(km.inertia_)


# In[ ]:


# Elbow curve: SSE as a function of the number of clusters
plt.title('Best K value')
plt.xlabel('K value')
plt.ylabel('SSE Error')
plt.plot(k_rng, sse)
# Optional zoom, left disabled:
# plt.xlim(0,100)
# plt.ylim(0,400)
plt.show()


# In[ ]:


# 25 was judged a good K from the elbow curve: cluster all songs into 25 groups
km = KMeans(n_clusters=25)
km.fit(song_features)
predicted_genres = km.labels_


# In[ ]:


# Store each song's cluster as a text label of the form 'Genre<cluster id>'
song_features['predicted_genres'] = [
    'Genre' + str(cluster_id) for cluster_id in predicted_genres
]


# In[ ]:


# Look at ten random rows
song_features.sample(10)


# In[ ]:


# Number of songs per predicted genre, smallest first, as a horizontal bar chart
genres_grp = song_features.groupby(['predicted_genres']).size()
songs_per_genre = genres_grp.sort_values()
plt.figure(figsize=(10,6))
songs_per_genre.plot.barh(color='red')
plt.title('Genre Ranking')
plt.xlabel('Total Songs')
plt.show()


# # RECOMMEND ARTISTS

# In[ ]:


# Load the per-artist table; its "count" column is renamed to "playCount"
artists_df = pd.read_csv(
    "C:/Users/ANUSHKA/Desktop/Technocolabs Mini Project/data_by_artist.csv"
).rename(columns={"count": "playCount"})


# In[ ]:


# Replace each artist's feature columns with a single cluster label
# ("genre") for convenience and easier tracking.
# Rescale every column except the first and the last, in place, reusing the
# `scaler` object created earlier in the file.
# NOTE(review): the slice is positional, so which columns are scaled depends
# on the CSV column order — confirm against the file.
artists_df.iloc[:,1:-1] = scaler.fit_transform(artists_df.iloc[:,1:-1])
km = KMeans(n_clusters=25)
# The right-hand side is evaluated before the 'genres' column is added, so the
# clustering sees the same column slice that was scaled above.
artists_df['genres'] = km.fit_predict(artists_df.iloc[:,1:-1])
# Keep, by position, the first column and the last three (the last one being
# the new 'genres' column).
# NOTE(review): presumably this is meant to keep artists / playCount / genres
# plus one more column — verify which column sits at position -3.
artists_df = artists_df.iloc[:,[0,-3,-2,-1]]
artists_df.head()


# In[ ]:


# Simulate listening data: give every artist row a random user id in
# [1000, 1400) and a random integer rating in [1, 6)
row_count = len(artists_df)
artists_df['user_id'] = np.random.randint(1000, 1400, row_count)
artists_df['rating'] = np.random.randint(1, 6, row_count)
artists_df.head()


# In[ ]:


# lets create our recommender system
def recommend_me(user, data=None, top_n=5):
    """Recommend artists the user has not listened to, from their favourite genres.

    The user's favourite genres are the genres of their ``top_n`` best rows
    (highest ``rating``, ties broken by ``playCount``). Every artist that
    appears in one of the user's own rows is then excluded, and the remaining
    artists of those genres are ranked by ``rating`` and ``playCount``.

    Args:
        user: Value to look up in the ``user_id`` column.
        data: DataFrame with ``artists``, ``genres``, ``rating``,
            ``playCount`` and ``user_id`` columns. Defaults to the
            module-level ``artists_df``.
        top_n: How many of the user's rows define the favourite genres, and
            the maximum number of recommendations returned.

    Returns:
        DataFrame with the columns ``artists``, ``genres``, ``rating`` and
        ``playCount`` and at most ``top_n`` rows, best first. It is empty when
        the user has no rows or no other artist shares their genres.
    """
    if data is None:
        data = artists_df

    # The user's own rows, best rated / most played first
    user_rows = data[data['user_id'] == user].sort_values(
        by=['rating', 'playCount'], ascending=False
    )
    # dict.fromkeys removes duplicate genres while keeping the ranking order
    fav_genre = list(dict.fromkeys(user_rows['genres'].head(top_n)))

    # Exclude what this user already listened to. The previous version removed
    # two hard-coded artist names here regardless of the user, so a user's own
    # artists could be recommended back to them.
    listened_artist = set(user_rows['artists'])
    remaining_artist = data[~data['artists'].isin(listened_artist)]
    CanBeRecommened = remaining_artist[remaining_artist['genres'].isin(fav_genre)]

    # Most popular artists of the favourite genres first
    CanBeRecommened = CanBeRecommened.sort_values(
        by=['rating', 'playCount'], ascending=False
    )
    return CanBeRecommened[['artists', 'genres', 'rating', 'playCount']].head(top_n)


# In[ ]:


# Ask for recommendations for one of the simulated users
recommend_me(user=1012)


# In[ ]:


# Genres of that user's own rows, ordered by ascending rating, to compare
# against the genres that were recommended
rows_of_user = artists_df.loc[artists_df['user_id'] == 1012]
rows_of_user.sort_values(by='rating')['genres'].unique()


# # POPULARITY PREDICTION

# In[ ]:


# Work on a copy of the tracks table and pick the columns used as model inputs
tracks_data = df.copy()
features = ['year', 'danceability', 'energy', 'loudness', 'tempo']
features_tracks_data = df.loc[:, features]


# In[ ]:


class Artist:
    """An artist name paired with a popularity value."""

    def __init__(self, name, popularity):
        # Plain attribute storage; no validation or conversion is done
        self.name, self.popularity = name, popularity
        
        
class Track:
    """A track name together with its artists value and its popularity."""

    def __init__(self, name, artists, popularity):
        # Plain attribute storage; the arguments are kept exactly as given
        self.name, self.artists, self.popularity = name, artists, popularity
        
        

# One Track object per row of tracks_data
names = tracks_data.name.values
artists_names = tracks_data.artists.values
popularity = tracks_data.popularity.values

tracks = [
    Track(track_name, track_artists, track_popularity)
    for track_name, track_artists, track_popularity
    in zip(names, artists_names, popularity)
]


# For every track, compute the mean popularity of its artists, where an
# artist's popularity is the mean popularity of all tracks whose artists
# string contains that artist's name.
# NOTE(review): that mean includes the track's own popularity, which is also
# the prediction target further down — confirm this leakage is acceptable.
artists = []             # Artist objects, in first-seen order
artists_names_done = []  # artist names, in first-seen order
artists_popularities = []
# name -> popularity cache. A dict gives O(1) lookups; the two lists above
# used to be scanned linearly for every artist occurrence.
artist_pop_by_name = {}

for artists_str in tqdm(artists_names):
    # Drop the first and last character and split on ', '
    # (presumably the value looks like "['A', 'B']" — confirm against the CSV)
    artists_sub_list = artists_str[1:-1].split(', ')

    track_pop = 0
    for artist in artists_sub_list:
        if artist not in artist_pop_by_name:
            # Substring match against each track's artists string, as before
            songs_pop = [x.popularity for x in tracks if artist in x.artists]
            artist_pop = sum(songs_pop) / len(songs_pop)
            artist_pop_by_name[artist] = artist_pop
            artists_names_done.append(artist)
            artists.append(Artist(artist, artist_pop))

        track_pop += artist_pop_by_name[artist]

    track_pop /= len(artists_sub_list)
    artists_popularities.append(track_pop)

artists_popularities = np.asarray(artists_popularities)

print(artists_popularities.max())


# In[ ]:


# Rescale the selected feature columns with a fresh MinMaxScaler
scaler = MinMaxScaler()
features_tracks_data = scaler.fit_transform(features_tracks_data)
print(features_tracks_data.shape) 

# Put the artist popularity (divided by 100) in front as the first input column
scaled_artist_popularity = artists_popularities / 100
features_tracks_data = np.column_stack(
    (scaled_artist_popularity, features_tracks_data)
)
print(features_tracks_data.shape)

# Target: track popularity divided by 100
y_tracks_data = tracks_data.popularity.values / 100

# Hold out 20% of the rows for testing; the seed is fixed at 27
X_train, X_test, y_train, y_test = train_test_split(
    features_tracks_data,
    y_tracks_data,
    test_size=0.2,
    random_state=27,
)


# In[ ]:


# Print the minimum and maximum of every training input column
for column_values in X_train.T:
    print(column_values.min(), column_values.max())


# In[ ]:


# Train a random forest regressor, with default settings, on the training split
clf = RandomForestRegressor()
clf.fit(X=X_train, y=y_train)


# In[ ]:


# Evaluate the trained regressor on the held-out split
preds = clf.predict(X_test)

# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, so it is reported as R^2 and not as a
# percentage. The variable keeps its original name for later cells.
accuracy = clf.score(X_test, y_test)
print("Test R^2 score: {:.4f}".format(accuracy))

# Mean absolute error, in the same units as y_test (popularity / 100)
average_error = np.abs(y_test - preds).mean()
print("{:.4f} average error".format(average_error))


# In[ ]:


# Show the first 100 test targets next to the model's predictions
for actual, pred in zip(y_test[:100], preds[:100]):
    print("Actual / Predicted: {:.4f} / {:.4f}".format(actual, pred))


# # Deployment Using Streamlit

# In[ ]:


import streamlit as st



# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]: