In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file beneath the Kaggle input directory
# and print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt  
%matplotlib inline

In [None]:
# Load the lyrics CSV into a DataFrame and preview its first five rows
df = pd.read_csv(
    '/kaggle/input/bts-lyrics/lyrics.csv',
    encoding='UTF-8',
)
df.head()

In [None]:
# Count the songs on each album, ordered from the largest album to the smallest
album_titles = df['eng_album_title']
albums = album_titles.value_counts().sort_values(ascending=False)
albums

In [None]:
# Bar chart of the per-album song counts held in `albums`.
# Bars below the mean count are drawn red, all others green.
fig, ax1 = plt.subplots(figsize=(19, 9))

x_1 = albums.index
y_1 = albums.values

# The mean does not change between bars, so compute it once up front
# instead of once per loop iteration.
mean_count = y_1.mean() if len(y_1) else 0.0
bar_colors = ["red" if count < mean_count else "green" for count in y_1]

# A single call draws every bar; `color` takes one entry per bar.
ax1.bar(x_1, y_1, color=bar_colors, alpha=0.3)

# Print each count at half the bar's height, centred horizontally on the bar.
for x_, y_ in zip(x_1, y_1):
    ax1.text(x_, y_ / 2, str(y_), horizontalalignment='center')

ax1.tick_params(axis='x', labelrotation=80)
ax1.set_xlabel('Albums')
ax1.set_ylabel('Songs')
ax1.set_title("Songs in each albums")

plt.show()

In [None]:
# Count the songs released in each year.
# The year is whatever precedes the first "-" in each `album_rd` value.
release_years = [release_date.split("-", 1)[0] for release_date in df['album_rd']]
year_songs = pd.Series(release_years)
songs_every_year = year_songs.value_counts().sort_values()
songs_every_year

In [None]:
# Line chart of the number of songs released in each year.
fig, ax2 = plt.subplots(figsize=(10, 8))

# `songs_every_year` is ordered by count, which would scramble the year axis
# and make the line rise monotonically whatever the data says. Re-order by
# year for plotting; the year labels are strings, and lexicographic order
# matches chronological order as long as they are all four-digit years.
songs_by_year = songs_every_year.sort_index()
x = songs_by_year.index
y = songs_by_year.values

ax2.plot(x, y, marker='o', linestyle='-', color='b', label='Songs', alpha=0.5)
ax2.set_xlabel('Year')
ax2.set_ylabel('Songs Released')
ax2.set_title('Songs Released In Each Year')
ax2.legend(loc="upper left")

# Annotate every data point with its song count.
for a, b in zip(x, y):
    ax2.text(a, b, str(b))

plt.show()

In [None]:
# Count how many songs each featured artist appears on, smallest count first
featured_counts = df['featured'].value_counts()
featured = featured_counts.sort_values().dropna()

# Draw the counts as a bar chart on a fresh figure
plt.figure(figsize=(13, 8))
featured_ax = featured.plot.bar(color='b', alpha=0.5)
plt.xticks(rotation=50)
featured_ax.set(xlabel="Artists", ylabel="Amount of Songs", title="Featured Artists")

plt.show()

In [None]:
# Count the songs credited to each artist / group, smallest count first
performer_counts = df['performed_by'].value_counts()
performed_by = performer_counts.sort_values()
performed_by

In [None]:
# Bar chart of the per-artist/group song counts held in `performed_by`.
# Bars below the mean count are drawn red, all others green.
fig, ax1 = plt.subplots(figsize=(19, 9))

x_1 = performed_by.index
y_1 = performed_by.values

# The mean does not change between bars, so compute it once up front
# instead of once per loop iteration.
mean_count = y_1.mean() if len(y_1) else 0.0
bar_colors = ["red" if count < mean_count else "green" for count in y_1]

# A single call draws every bar; `color` takes one entry per bar.
ax1.bar(x_1, y_1, color=bar_colors, alpha=0.3)

# Print each count at half the bar's height, centred horizontally on the bar.
for x_, y_ in zip(x_1, y_1):
    ax1.text(x_, y_ / 2, str(y_), horizontalalignment='center')

ax1.tick_params(axis='x', labelrotation=90)
ax1.set_xlabel('Artists/Group')
ax1.set_ylabel('Songs')
ax1.set_title("Count of Songs by Artists/Group")

plt.show()

In [None]:
# Count how many songs are recorded in each language, smallest count first
language_counts = df['lang'].value_counts()
lang = language_counts.sort_values()
lang

In [None]:
# Pie chart of the language frequencies held in `lang`.
fig, ax = plt.subplots(figsize=(8, 6))

# Pull out only the last wedge, which is the most frequent language when
# `lang` is sorted ascending. The offsets are sized from the data because
# ax.pie requires one explode entry per wedge and raises otherwise; a fixed
# two-element tuple only works when there are exactly two languages.
wedge_offsets = [0.0] * len(lang)
if wedge_offsets:
    wedge_offsets[-1] = 0.1
explode = tuple(wedge_offsets)

ax.pie(
    lang.values,
    explode=explode,
    labels=lang.index,
    shadow=True,
    autopct='%1.1f%%',
    textprops={'fontsize': 15, 'color': "black"},
)
ax.set_title("Song Languages")
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Words that clean() filters out. A frozenset gives constant-time membership
# tests and makes duplicate entries harmless.
REDUNDANT_WORDS = frozenset([
    'oh', 's', 'la', 'yeah', 'll', 're', 'na', 'don', 'know', 'want',
    'pt', 'remix', 'ft', 'outro', 'into', 'interlude', 'edition', 'full',
    'length', 'skit',
])


def clean(val_list):
    """Normalise tokens and drop the redundant ones.

    Each token is lower-cased and has leading/trailing ':' and '.' characters
    stripped. Tokens whose normalised form is in REDUNDANT_WORDS are removed;
    every other token is kept in its normalised form and original order.

    Args:
        val_list: Iterable of strings (e.g. the result of ``str.split()``).

    Returns:
        A new list of the normalised tokens that were not filtered out.
    """
    normalised = (val.lower().strip(":.") for val in val_list)
    return [word for word in normalised if word not in REDUNDANT_WORDS]

In [None]:
# Word cloud of album titles
stopwords = set(STOPWORDS)

# Missing titles are skipped so that str(NaN) does not add a spurious "nan"
# word to the cloud. Each title is lower-cased, whitespace-normalised and
# followed by a single space; the pieces are joined in one pass rather than
# by repeated string concatenation.
comment_words = "".join(
    " ".join(str(title).lower().split()) + " "
    for title in df.eng_album_title.dropna()
)

wordcloud = WordCloud(
    width=700,
    height=500,
    background_color='black',
    stopwords=stopwords,
    min_font_size=10,
).generate(comment_words)

# Show the rendered cloud without axes or padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# Word cloud of song lyrics
stopwords = set(STOPWORDS)

# Missing lyrics are skipped so that str(NaN) does not add a spurious "nan"
# word to the cloud. clean() already lower-cases every token it keeps, so no
# second lower-casing pass is needed. The per-song pieces are joined in one
# pass rather than by repeated string concatenation.
comment_words = "".join(
    " ".join(clean(str(lyric).split())) + " "
    for lyric in df.lyrics.dropna()
)

wordcloud = WordCloud(
    width=500,
    height=700,
    background_color='black',
    stopwords=stopwords,
    min_font_size=10,
).generate(comment_words)

# Show the rendered cloud without axes or padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# Word cloud of song titles
stopwords = set(STOPWORDS)

# Missing titles are skipped so that str(NaN) does not add a spurious "nan"
# word to the cloud. clean() already lower-cases every token it keeps, so no
# second lower-casing pass is needed. The per-title pieces are joined in one
# pass rather than by repeated string concatenation.
comment_words = "".join(
    " ".join(clean(str(title).split())) + " "
    for title in df.eng_track_title.dropna()
)

wordcloud = WordCloud(
    width=400,
    height=700,
    background_color='black',
    stopwords=stopwords,
    min_font_size=10,
).generate(comment_words)

# Show the rendered cloud without axes or padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()