In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from wordcloud import WordCloud,STOPWORDS
import spacy as sp
import string
import nltk
import re
import plotly.express as ex
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
pyo.init_notebook_mode()
nltk.download('vader_lexicon')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
nlps = sp.load('en')
from nltk.util import ngrams
from nltk import word_tokenize
def RMSE(y,yh):
    """Return the root mean squared error between `y` and `yh`.

    Both arguments are forwarded unchanged to sklearn's
    `mean_squared_error`; the square root of that value is returned.
    """
    squared_error = mean_squared_error(y, yh)
    return np.sqrt(squared_error)


# Default size (in inches) for every matplotlib figure drawn below.
plt.rcParams['figure.figsize'] = (18, 11)

In [None]:
# Read only the four columns the analysis uses from the lyrics CSV.
csv_path = '/kaggle/input/drake-lyrics/drake_data.csv'
wanted_columns = ['album', 'lyrics_title', 'lyrics', 'track_views']
d_data = pd.read_csv(csv_path, usecols=wanted_columns)
d_data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
d_data.shape

In [None]:
# Summary statistics of the loaded frame (pandas' default `describe` output).
d_data.describe()

In [None]:
# Work on a copy so the frame loaded from disk stays untouched.
preprocessed = d_data.copy()

# Lower-case the free-text columns, selected by NAME. `read_csv(usecols=...)`
# ignores the order of the list it is given, so a positional slice such as
# `columns[:-1]` only works when `track_views` happens to be the last column
# of the CSV file.
text_columns = ['album', 'lyrics_title', 'lyrics']
for col in text_columns:
    preprocessed[col] = preprocessed[col].str.lower()

# Titles carry the word "lyrics"; remove it (literal match, not a regex).
titles = preprocessed['lyrics_title'].str.replace('lyrics', '', regex=False)

# Flag demo recordings (1/0) while the "(demo)" marker is still in the title,
# then remove the marker. `na=False` keeps the flag an integer column even if
# a title is missing.
preprocessed['lyrics_title'] = titles
preprocessed['is_demo'] = titles.str.contains('(demo)', regex=False, na=False).astype(int)
preprocessed['lyrics_title'] = titles.str.replace('(demo)', '', regex=False)


def extract_feat(s):
    """Return the featured artist(s) named in a song title, or 'solo'.

    The title is searched for the first parenthesised group that contains
    'ft.' (for example ``'take care (ft. rihanna)'``). The text inside that
    group, with the ``'ft. '`` prefix removed, is returned.

    Unlike slicing between ``find('(')`` and ``find(')')``, this also works
    when the 'ft.' group is not the first parenthesised group of the title,
    when the closing parenthesis is missing, and when the title has no
    parentheses at all (``find`` returning -1 used to produce a bogus slice).

    Parameters
    ----------
    s : str
        Song title.

    Returns
    -------
    str
        The featured artist text, or ``'solo'`` when no parenthesised
        'ft.' group is present.
    """
    # A group opens with '(' and ends at the next ')' or, if the title is
    # truncated, at the end of the string.
    match = re.search(r'\(([^)]*ft\.[^)]*)(?:\)|$)', s)
    if match is None:
        return 'solo'
    return match.group(1).replace('ft. ', '')
    
def remove_artist(s):
    """Cut the featured-artist suffix off a song title.

    If the title contains a parenthesised group with 'ft.' in it, everything
    from that group's opening '(' onwards is dropped; otherwise the title is
    returned unchanged.

    Matching the group with a regular expression avoids the ``find() == -1``
    slicing problem: a title that mentions 'ft.' without any parentheses used
    to lose its last character, and a 'ft.' group that followed another
    parenthesised group was never removed.

    Parameters
    ----------
    s : str
        Song title.

    Returns
    -------
    str
        The title up to (not including) the 'ft.' group, or `s` itself.
    """
    # The group ends at the next ')' or, if unterminated, at end of string.
    match = re.search(r'\([^)]*ft\.[^)]*(?:\)|$)', s)
    if match is None:
        return s
    return s[:match.start()]
    
# Record the featured artist(s) first, while the "(ft. ...)" group is still
# part of the title.
preprocessed['featuring'] = preprocessed.lyrics_title.apply(extract_feat)

# One pass is enough: remove_artist cuts the title at the '(' that opens the
# 'ft.' group, so a second application has nothing left to remove.
preprocessed.lyrics_title = preprocessed.lyrics_title.apply(remove_artist)

# Finally drop every character that is neither a word character nor whitespace.
preprocessed.lyrics_title = preprocessed.lyrics_title.str.replace(r'[^\w\s]', '', regex=True)

def view_preprocess(s):
    """Convert an abbreviated view count such as '1.2K' or '3M' to a float.

    Supported inputs:
      * strings with a K / M / B suffix (case-insensitive), scaled by
        1e3 / 1e6 / 1e9;
      * plain numeric strings such as '950' (previously these fell through
        both suffix checks and were silently turned into None);
      * thousands separators (',') and surrounding whitespace are ignored;
      * real numbers are passed through as floats.

    Parameters
    ----------
    s : str, int, float or None
        Raw view count.

    Returns
    -------
    float or None
        The numeric view count, or None when the value is missing (None/NaN),
        empty, or cannot be parsed as a number.
    """
    if s is None:
        return None
    if isinstance(s, (int, float)):
        # NaN is the only value that is not equal to itself.
        return None if s != s else float(s)

    text = str(s).strip().replace(',', '')
    if not text:
        # Guard against indexing text[-1] on an empty string.
        return None

    multiplier = {'K': 10**3, 'M': 10**6, 'B': 10**9}.get(text[-1].upper())
    try:
        if multiplier is None:
            return float(text)
        return float(text[:-1]) * multiplier
    except ValueError:
        return None

# Turn the textual view counts into numbers via view_preprocess.
parsed_views = preprocessed.track_views.apply(view_preprocess)
preprocessed.track_views = parsed_views

# Keep only the rows whose lyrics are present.
rows_with_lyrics = preprocessed.lyrics.dropna().index
preprocessed = preprocessed.loc[rows_with_lyrics, :]

# Rows without lyrics were dropped just above, so the string accessors below
# see a value on every remaining row (the former `.notna().index` selectors
# returned the full index and therefore filtered nothing).
lyrics_text = preprocessed['lyrics']

# Count occurrences of the substrings 'verse' / 'chorus' while the bracketed
# section tags are still part of the text.
preprocessed['number_of_verses'] = lyrics_text.str.count('verse')
preprocessed['number_of_chorus'] = lyrics_text.str.count('chorus')

# Remove the bracketed tags and flatten line breaks into spaces.
lyrics_flat = (
    lyrics_text
    .str.replace(r'\[([^]]*)]', '', regex=True)
    .str.replace('\n', ' ', regex=False)
)


def _mean_length(pieces):
    """Mean character length of the non-blank pieces; NaN when there are none."""
    lengths = [len(piece.strip()) for piece in pieces if piece.strip()]
    return float(np.mean(lengths)) if lengths else np.nan


# Sentence lengths have to be measured BEFORE punctuation is stripped: once the
# '.' characters are gone, split('.') can never find a sentence boundary and
# the "average" collapses to the length of the whole lyric.
average_sentence_length = lyrics_flat.apply(lambda text: _mean_length(text.split('.')))

# Drop everything that is neither a word character nor whitespace.
preprocessed['lyrics'] = lyrics_flat.str.replace(r'[^\w\s]', '', regex=True)

# VADER polarity scores, computed once per lyric and unpacked into columns.
sid = SIA()
polarity = preprocessed['lyrics'].apply(sid.polarity_scores)
preprocessed['Positive Sentiment'] = polarity.apply(lambda scores: scores['pos'])
preprocessed['Neutral Sentiment'] = polarity.apply(lambda scores: scores['neu'])
preprocessed['Negative Sentiment'] = polarity.apply(lambda scores: scores['neg'])

# split() without an argument collapses runs of whitespace, so the empty tokens
# left behind by removed tags and line breaks are no longer counted as words.
lyric_words = preprocessed['lyrics'].str.split()

# STOPWORDS is used directly for membership tests instead of being copied into
# a list for every single word.
preprocessed['# Of Words'] = lyric_words.apply(len)
preprocessed['# Of StopWords'] = lyric_words.apply(
    lambda words: sum(word in STOPWORDS for word in words)
)
preprocessed['Average Word Length'] = lyric_words.apply(
    lambda words: _mean_length([word for word in words if word not in STOPWORDS])
)
preprocessed['Average Sentence Length'] = average_sentence_length

# Release year of each album, keyed by the lower-cased album name.
release_year_by_album = {
    'care package': 2019,
    'certified lover boy': 2021,
    'comeback season': 2007,
    'dark lane demo tapes': 2020,
    'drake demo disk': 2006,
    'if you’re reading this it’s too late': 2015,
    'more life': 2017,
    'nothing was the same': 2013,
    'room for improvement': 2006,
    'scary hours': 2018,
    'scorpion': 2018,
    'so far gone': 2009,
    'so far gone (ep)': 2009,
    'take care': 2011,
    'thank me later': 2010,
    'the best in the world pack': 2019,
    'views': 2016,
}

# Per-album mean of every numeric feature. `numeric_only=True` is required on
# recent pandas, where averaging the string columns raises instead of being
# silently skipped.
album_d = preprocessed.groupby(by='album').mean(numeric_only=True)

# 'unreleased songs' is excluded from the per-album table; tolerate the label
# being absent.
album_d = album_d.drop(index='unreleased songs', errors='ignore')

# Look the year up per existing album. Assigning through `.loc[label, col]`
# for a label that is not in the index would append a new, otherwise empty
# row; mapping cannot create rows. Albums without an entry keep the
# placeholder year 0.
album_d['Release_Year'] = (
    album_d.index.to_series()
    .map(release_year_by_album)
    .fillna(0)
    .astype(int)
)


preprocessed.head(3)

    
    


In [None]:
# Show the first row of the processed frame, including the engineered columns.
preprocessed.head(1)

In [None]:
# One-column frame holding the number of missing values of every feature.
missing_per_feature = preprocessed.isna().sum().to_frame()

ax = sns.heatmap(missing_per_feature, annot=True, cmap='nipy_spectral')
ax.set_title("Amount of Missing Values Per Feature", fontsize=19, fontweight='bold')
plt.show()

In [None]:
# Number of songs per album, computed once and reused for both axes.
songs_per_album = preprocessed.album.value_counts()

plt.title('Number Of Songs Associated With Each Album', fontsize=19, fontweight='bold')
ax = sns.barplot(
    y=songs_per_album.index,
    x=songs_per_album.values,
    palette='nipy_spectral',
)
# Re-apply the album names as tick labels with a larger, bold font.
ax.set_yticklabels(ax.get_yticklabels(), fontsize=15, fontweight='bold')
plt.show()

In [None]:
plt.title('Number Of Songs Labeled as Demo Songs',fontsize=19,fontweight='bold')
# Pass the column through the `x` keyword: newer seaborn releases take `data`
# as the first positional argument and make `x` keyword-only, so a positional
# Series is no longer interpreted as the x variable.
ax = sns.countplot(x=preprocessed.is_demo,palette='nipy_spectral')
plt.show()

In [None]:
# Frequency of every featured artist. 'solo' marks songs without a feature and
# is removed by LABEL; slicing it away by position ([1:11]) only works when
# 'solo' happens to be the most frequent value.
top_featured = (
    preprocessed.featuring.value_counts()
    .drop('solo', errors='ignore')
    .head(10)
)

plt.title('Top 10 Most Featured Artist In Drakes Songs',fontsize=19,fontweight='bold')
ax = sns.barplot(y=top_featured.index,x=top_featured.values,palette='nipy_spectral')
# Re-apply the artist names as tick labels with a larger, bold font.
ax.set_yticklabels(ax.get_yticklabels(),fontsize=15,fontweight='bold')
plt.show()

In [None]:
# Three stacked panels: raw views, log views, and the CDF of the log views.
views_fig, (ax_raw, ax_log, ax_cdf) = plt.subplots(3, 1)

ax_raw.set_title('Distribution Of Song Views Before Log Transformation', fontsize=19, fontweight='bold', color='b')
sns.kdeplot(preprocessed.track_views, ax=ax_raw)

# NOTE: this overwrites `track_views` with its natural log in place, so every
# later cell sees log-views and re-running this cell applies the log again.
preprocessed.track_views = np.log(preprocessed.track_views)

ax_log.set_title('Distribution Of Song Views After Log Transformation', fontsize=19, fontweight='bold', color='r')
sns.kdeplot(preprocessed.track_views, color='r', ax=ax_log)

ax_cdf.set_title('CDF Of Song Views After Log Transformation', fontsize=19, fontweight='bold', color='tab:red')
sns.kdeplot(preprocessed.track_views, color='r', cumulative=True, ax=ax_cdf)
plt.show()

In [None]:
# Two stacked panels: density of the verse counts and of the chorus counts.
plt.subplot(2,1,1)
plt.title('Distribution Of Different Amounts of Verses In Drakes Songs',fontsize=19,fontweight='bold',color='g')
sns.kdeplot(preprocessed.number_of_verses,color='g')
plt.subplot(2,1,2)
plt.title('Distribution Of Different Amounts of Choruses In Drakes Songs',fontsize=19,fontweight='bold',color='b')
sns.kdeplot(preprocessed.number_of_chorus)
# Render explicitly, like the other plotting cells, so the cell does not end
# on the Axes object returned by kdeplot (which would be echoed as text).
plt.show()

In [None]:
fig = make_subplots(
    rows=2, cols=1, shared_xaxes=True,
    subplot_titles=('Pearson Correlation', 'Spearman Correlation'),
)

# Correlate the numeric columns only: recent pandas raises when DataFrame.corr
# meets string columns (album, title, lyrics, ...) instead of skipping them.
numeric_features = preprocessed.select_dtypes(include='number')

# Per-method Heatmap options. The Pearson panel hides its colour bar so that
# only one scale is drawn for the two panels.
heatmap_options = {
    'pearson': dict(name='pearson', showscale=False),
    'spearman': dict(),
}

for row, (method, options) in enumerate(heatmap_options.items(), start=1):
    correlation = numeric_features.corr(method)
    fig.add_trace(
        go.Heatmap(
            x=correlation.columns,
            y=correlation.index,
            z=correlation.values,
            xgap=1,
            ygap=1,
            **options,
        ),
        row=row, col=1
    )

fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    )
)
fig.update_layout(height=700, width=900, title_text="Correlations Between Our Different Numeric Features")
fig.show()


In [None]:
fig = go.Figure()

# Average the per-album features by release year. After reset_index(),
# 'Release_Year' is the FIRST column of album_dm.
album_d = album_d.sort_values(by='Release_Year')
album_dm = album_d.groupby(by='Release_Year').mean().reset_index()

# Every column except the year itself is a feature to plot. Slicing with
# columns[:-1] would instead plot Release_Year against itself and leave out
# the last feature.
feature_columns = [column for column in album_dm.columns if column != 'Release_Year']

# One line per feature.
for column in feature_columns:
    fig.add_trace(
        go.Scatter(
            x = album_dm.Release_Year,
            y = album_dm[column],
            name = column,
        )
    )


# One dropdown entry per feature; selecting it shows only that feature's trace.
# The visibility mask is sized from the number of traces instead of a
# hard-coded 12.
btns = []
for x,col in enumerate(feature_columns):
    bol = [False]*len(feature_columns)
    bol[x]=True
    d = dict(label = col,
                  method = 'update',
                  args = [{'visible':bol},
                          {'title': 'Distribution of [' +col+'] Over The Years',
                           'showlegend':True}])
    btns.append(d)


fig.update_layout(title='Features Distribution Over The Years',
    updatemenus=[go.layout.Updatemenu(
        active=0,
        showactive=True,
        buttons=btns
        )
    ])
fig.show()