In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [221]:
#. モジュールの読み込み
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [222]:
#. データの読み込み
df = pd.read_csv("/content/drive/MyDrive/datasienceブログ/221106_lyrics_data.csv", encoding="utf-8")

In [223]:
#. データの確認
df.head(3)

Unnamed: 0,artist,song,lyrics,flg
0,I Don't Like Monday,ダイナマイト,ループしてる同じ　毎日を繰り返し　まるで機械みたい　息を止め動いてる　車の中レディオ　音楽が...,0
1,I Don't Like Monday,愛言葉,出逢った頃とおんなじ様な　馬鹿みたく晴れた空だ　並んで歩いた緑道は　微かに夏の匂いがした ...,1
2,I Don't Like Monday,AITAI,週末の予定はなんにもないよ　あのコとの約束は　キャンセルだし 巷で有名な占いによれば　運気...,0


In [224]:
#. targetの確認
df["flg"].value_counts()

0    484
1     92
Name: flg, dtype: int64

In [225]:
#. artistの種類を確認
df["artist"].unique()

array(["I Don't Like Monday", 'SUPER BEAVER', 'YUKI', 'Galileo Galilei',
       '緑黄色社会', 'ELLEGADEN'], dtype=object)

In [226]:
#. artistを数値化
map_name = {"I Don't Like Monday":0, "SUPER BEAVER":1, "YUKI":2, 'Galileo Galilei':3, '緑黄色社会':4, 'ELLEGADEN':5}
map_idname = {0:"I Don't Like Monday", 1:"SUPER BEAVER'", 2:"YUKI", 3:'Galileo Galilei', 4:'緑黄色社会', 5:'ELLEGADEN'}

df["artist_true"] = df["artist"].map(map_name)
print(df["artist_true"].value_counts())
print(df["artist"].value_counts())

2    170
1    128
3     79
5     75
0     70
4     54
Name: artist_true, dtype: int64
YUKI                   170
SUPER BEAVER           128
Galileo Galilei         79
ELLEGADEN               75
I Don't Like Monday     70
緑黄色社会                   54
Name: artist, dtype: int64


In [227]:
#. test用のデータを取得
tmp = df.groupby("artist_true")
test_df = tmp.apply(lambda x: x.sample(n=10))  #. 各グループから10件取得
test_df = test_df.reset_index(level='artist_true', drop=True)  #. multiindexを解除

In [228]:
#データの確認
test_df.head(3)

Unnamed: 0,artist,song,lyrics,flg,artist_true
13,I Don't Like Monday,Crazy,Honey I go crazy I wanna touch your body　頭の中は君...,0,0
61,I Don't Like Monday,MEMORIES,"We are treasure, we are forever　共に未来(あす)を描くのさ　...",0,0
26,I Don't Like Monday,Zero Gravity,This is the song for you　This is the song with...,0,0


In [229]:
#dfからtestデータに含まれていないもののみを取得
test_df["kbn"] = 1  #.  testデータと分かるようにフラグ付け

df = pd.merge(df, test_df[["artist", "song", "kbn"]], on = ["artist", "song"], how = "left")
df_data = df[df["kbn"] != 1]
test_data = df[df["kbn"] == 1]

In [230]:
#不要な列を削除
df_data.drop("kbn", axis=1, inplace=True)
test_data.drop("kbn", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [231]:
#データの中身を確認
df_data["lyrics"]

0      ループしてる同じ　毎日を繰り返し　まるで機械みたい　息を止め動いてる　車の中レディオ　音楽が...
1      出逢った頃とおんなじ様な　馬鹿みたく晴れた空だ　並んで歩いた緑道は　微かに夏の匂いがした　　...
2      週末の予定はなんにもないよ　あのコとの約束は　キャンセルだし　　巷で有名な占いによれば　運気...
4      珍しいね You drink a lot today　彼氏と喧嘩でもしたのかい？　話さなくて...
5      Maybe it's all right, We can dance all night　あ...
                             ...                        
571    I am lying You are lying　We keep our backs tur...
572    Wake me up before you leave　I've got an interv...
573    君の手に　上手く馴染むもの　君の目に綺麗に映るもの　それだけでいい　君の手が今も暖かく　君の...
574    She looks smiley　He looks friendly　You maybe n...
575    None of us can help dreaming　Because a dream i...
Name: lyrics, Length: 516, dtype: object

In [232]:
print(df_data["lyrics"][4])
# 空白を確認
print(df_data["lyrics"].str.replace(" ", "●")[4])

珍しいね You drink a lot today　彼氏と喧嘩でもしたのかい？　話さなくても君のことなら　わかってしまう気がしてるよ　　こんな風にさ　二人きりだと　It's gonna be a little problem, don't you think？　だけど僕には　君の近くに　いられる理由が欲しいのさ　　If you want me to take you home　Baby it's up to you　独りでいたくないなら　Of course I'll be with you　この先はアナタ次第　二人の行方は　So you know darling, I've been falling in love　　大丈夫さ He is a good man　君たちはお似合いだとか　思ってもないことばかりを　並べてしまうオトコだけど　　君の仕草や　君の香りが　いちいち胸を締め付けるよ　上辺ばかりの優しさはいらない　今だけは僕をみてよ　　If you want me to take you home　Baby it's up to you　独りでいたくないなら　Of course I'll be with you　この先はアナタ次第　二人の行方は　So you know darling, I've been falling in love　　明日も明後日も来年もこのまま　二人でいられたならさ　Baby, it's up to you　Baby, it's up to you　もう気づいてるだろう？　You know about my feelings for you　　If you want me to take you home　Baby it's up to you　独りでいたくないなら　Of course I'll be with you　この先はアナタ次第　二人の行方は　So you know darling, I've been falling in love　　If you want me to take you home　Baby it's up to you　独りでいたくないなら　Of course I'll be with you　この先はアナタ次第　二人の行方は　So you know darling, I've been falling in

In [233]:
#文字数のカウント
df_data["len"] = df_data["lyrics"].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [234]:
df_data.groupby("artist_true")["len"].agg(["min","mean","max"]) 

Unnamed: 0_level_0,min,mean,max
artist_true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,342,890.25,2175
1,183,549.779661,839
2,147,497.76875,1262
3,100,421.681159,714
4,262,512.045455,920
5,199,924.553846,1922


YUKI、ガリレオガリレイは短いと100文字代、長くても７14文字と文字数が少ない。
一方でI dont like mondayは長いのは平均が一番高く、最大値も2,000以上となっている。

In [235]:
#. 空白のカウント
df_data["len_space"] = df_data["lyrics"].str.len() - df_data["lyrics"].str.replace(" ","").str.len()
df_data.groupby("artist_true")["len_space"].agg(["min","mean","max"]) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,min,mean,max
artist_true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,96.916667,350
1,0,1.025424,43
2,0,6.8875,105
3,0,3.550725,41
4,0,1.590909,40
5,0,148.738462,331


英語を歌詞に使用することが多い0(i dont like monday)と5(ELLEGADEN)のスペース数が他と比べて圧倒的に多い

分かち書き

In [236]:
!apt-get install -y mecab libmecab-dev mecab-ipadic-utf8
!pip install mecab-python3

import os
os.environ['MECABRC']= "/etc/mecabrc"

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libmecab-dev is already the newest version (0.996-5).
mecab is already the newest version (0.996-5).
mecab-ipadic-utf8 is already the newest version (2.7.0-20070801+main-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 5 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [237]:
import MeCab

In [238]:
wakati = MeCab.Tagger("-Owakati")

In [239]:
# 歌詞を分かち書き
df_data["lyrics_wakati"] = df_data["lyrics"].apply(lambda x: wakati.parse(x).replace("\n",""))
df_data["lyrics_wakati"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0      ループ し てる 同じ 　 毎日 を 繰り返し 　 まるで 機械 みたい 　 息 を 止め ...
1      出逢っ た 頃 と おんなじ 様 な 　 馬鹿 み たく 晴れ た 空 だ 　 並ん で 歩...
2      週末 の 予定 は なんにも ない よ 　 あの コ と の 約束 は 　 キャンセル だ ...
4      珍しい ね You drink a lot today 　 彼氏 と 喧嘩 で も し た ...
5      Maybe it ' s all right , We can dance all nigh...
                             ...                        
571    I am lying You are lying 　 We keep our backs t...
572    Wake me up before you leave 　 I ' ve got an in...
573    君 の 手 に 　 上手く 馴染む もの 　 君 の 目 に 綺麗 に 映る もの 　 それ...
574    She looks smiley 　 He looks friendly 　 You may...
575    None of us can help dreaming 　 Because a dream...
Name: lyrics_wakati, Length: 516, dtype: object

In [240]:
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.decomposition import TruncatedSVD

In [241]:
cv = CV()
lyrics_train = cv.fit_transform(df_data["lyrics_wakati"])
lyrics_train = pd.DataFrame(lyrics_train.toarray(), columns=cv.get_feature_names())

print(lyrics_train.shape)
lyrics_train.head()

(516, 9736)




Unnamed: 0,00,090,10,100,12,13,14,15,153,18,...,黒髪,黙っ,黙ら,鼓動,鼓膜,鼻先,鼻声,鼻歌,齧っ,齧ら
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [242]:
#頻出単語の確認
frequency =  np.array(lyrics_train.sum(axis=0))

for i in frequency.argsort()[:-10:-1]:
  print(frequency[i], cv.get_feature_names()[i])

2285 ない
1152 you
678 から
598 たい
523 てる
473 いる
472 あなた
472 こと
466 よう




上位１０位の頻出単語を見るとyouやあなたなど肌感覚と合う単語が出ている

In [243]:
lyrics_train["artist"] = df_data["artist_true"] 

In [244]:
#I dont like monday頻出単語の確認
frequency =  np.array(lyrics_train[lyrics_train["artist"]==0].sum(axis=0))

for i in frequency.argsort()[:-20:-1]:
  print(frequency[i], cv.get_feature_names()[i])

452.0 you
231.0 ない
181.0 baby
149.0 me
148.0 love
131.0 your
116.0 it
116.0 the
109.0 so
107.0 から
106.0 oh
88.0 be
87.0 we
83.0 don
83.0 just
81.0 wanna
79.0 do
79.0 are
77.0 my


you、baby、loveなどが頻出

In [245]:
#super beaber頻出単語の確認
pd.DataFrame(data = lyrics_train[lyrics_train["artist"]==1].sum(axis=0), columns=["count"]).sort_values(by="count", ascending=False).head(20)

Unnamed: 0,count
ない,710.0
こと,214.0
たい,191.0
あなた,186.0
てる,171.0
いる,150.0
じゃ,145.0
から,144.0
なんて,132.0
いい,123.0


In [246]:
#. ガリレオガリレイの頻出単語
pd.DataFrame(data = lyrics_train[lyrics_train["artist"]==3].sum(axis=0), columns=["count"]).sort_values(by="count", ascending=False).head(20)

Unnamed: 0,count
ない,401.0
artist,207.0
から,125.0
あなた,92.0
だけ,89.0
たい,82.0
いる,77.0
いい,74.0
てる,73.0
こと,68.0


In [247]:
#. エルレガーデンの頻出単語
pd.DataFrame(data = lyrics_train[lyrics_train["artist"]==4].sum(axis=0), columns=["count"]).sort_values(by="count", ascending=False).head(20)

Unnamed: 0,count
you,337.0
it,197.0
the,186.0
to,176.0
artist,176.0
is,143.0
can,113.0
my,103.0
me,103.0
all,84.0


In [248]:
#YUKIの頻出単語の確認
pd.DataFrame(data = lyrics_train[lyrics_train["artist"]==2].sum(axis=0), columns=["count"]).sort_values(by="count", ascending=False).head(20)

Unnamed: 0,count
ない,676.0
artist,320.0
から,208.0
たい,191.0
てる,157.0
よう,141.0
いる,129.0
なら,125.0
そう,119.0
あなた,113.0


上位１０位では「ない、から」など基本的な単語が並ぶ

In [249]:
#. 学習データを作成
x = lyrics_train.drop("artist", axis=1) #. ベクトル化したデータ
y = df_data["artist"] #. 数値化した目的変数

In [250]:
#まずはホールドアウト法を使ってみる
from sklearn.model_selection import train_test_split

#test_size=0.2
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=0)

In [251]:
#lightgbm用にデータセットを作成
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

In [252]:
#欠損値があるデータを確認
#ra_row = x.isna().any(axis=1)
#lyrics_train.loc[ra_row, :]

In [253]:
#学習

# 学習条件を設定
model = lgb.LGBMClassifier(objective='multiclass')

result = model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="multi_logloss")

[1]	valid_0's multi_logloss: 1.5961	valid_0's multi_logloss: 1.5961
[2]	valid_0's multi_logloss: 1.5302	valid_0's multi_logloss: 1.5302
[3]	valid_0's multi_logloss: 1.47713	valid_0's multi_logloss: 1.47713
[4]	valid_0's multi_logloss: 1.42975	valid_0's multi_logloss: 1.42975
[5]	valid_0's multi_logloss: 1.39056	valid_0's multi_logloss: 1.39056
[6]	valid_0's multi_logloss: 1.36662	valid_0's multi_logloss: 1.36662
[7]	valid_0's multi_logloss: 1.3355	valid_0's multi_logloss: 1.3355
[8]	valid_0's multi_logloss: 1.30931	valid_0's multi_logloss: 1.30931
[9]	valid_0's multi_logloss: 1.28138	valid_0's multi_logloss: 1.28138
[10]	valid_0's multi_logloss: 1.26649	valid_0's multi_logloss: 1.26649
[11]	valid_0's multi_logloss: 1.24642	valid_0's multi_logloss: 1.24642
[12]	valid_0's multi_logloss: 1.22188	valid_0's multi_logloss: 1.22188
[13]	valid_0's multi_logloss: 1.20988	valid_0's multi_logloss: 1.20988
[14]	valid_0's multi_logloss: 1.19439	valid_0's multi_logloss: 1.19439
[15]	valid_0's multi_

In [258]:
# 歌詞を分かち書き
test_data["lyrics_wakati"] = test_data["lyrics"].apply(lambda x: wakati.parse(x).replace("\n",""))
test_data["lyrics_wakati"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


3      You have sophisticated eyes 　 You make me swee...
7      合い鍵 は 近々 　 ポスト に 入れ とい て なんて 　 味気 ない 最後 　 　 どん...
13     Honey I go crazy I wanna touch your body 　 頭 の...
20     ヒト は 誰 も が 愛 を 求め て 　 この 世界 に 生 を 受ける なら 　 おそら...
26     This is the song for you 　 This is the song wi...
29     It was a second before 　 I ' m sure we ' re so...
48     They are walking around have the same face lik...
61     We are treasure , we are forever 　 共に 未来 ( あす ...
62     この 街 に は 「 欲望 」 と 言う 名 の 　 得体 の 知れ ない 　 魔物 が 住...
66     Today 　 昨日 と 同じ 日 で は 物足りない よ 　 I need some mo...
73     後悔 の 果て に 今 　 さよなら の 先 に 今 　 消え そう な 声 で 泣い て ...
121    正直 者 は 馬鹿 を 見る ？ 　 嘘つき の 言葉 は 信じ ない 　 失敗 は 成功 ...
126    明日 に 願い を 繋ぐ ため に 　 僕ら は 今 を 戦っ て いる 　 望む 未来 が...
127    今さら 　 昔 流行っ た 本 を 　 片手間 に 読ん で 　 欠伸 する 　 退屈 を ...
140    ドクターペッパー の あの 味 は 　 懐かしい 刺激 的 な 味 　 毒 みたい だって ...
152    誰 か が 丁寧 に 用意 し て くれ た 　 『 個性 』 という 名前 の 分厚い 教...
153    わかり きっ て て 　 はぐらかし てる 　 口 に し たら 困ら せる 　 わかり き...
166    君 の 全て は 知ら なく て 　 僕 の 全

In [259]:
#textデータ
lyrics_test = cv.transform(test_data["lyrics_wakati"])
lyrics_test = pd.DataFrame(lyrics_test.toarray(), columns=cv.get_feature_names())

print(lyrics_test.shape)
lyrics_test.head()

(60, 9736)




Unnamed: 0,00,090,10,100,12,13,14,15,153,18,...,黒髪,黙っ,黙ら,鼓動,鼓膜,鼻先,鼻声,鼻歌,齧っ,齧ら
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [260]:
#テストデータで予測をする
y_pred = model.predict(lyrics_test)

In [261]:
test_data["pred"] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [262]:
test_data

Unnamed: 0,artist,song,lyrics,flg,artist_true,lyrics_wakati,pred
3,I Don't Like Monday,A GIRL IN THE CITY,You have sophisticated eyes　You make me sweete...,0,0,You have sophisticated eyes You make me swee...,I Don't Like Monday
7,I Don't Like Monday,ENTERTAINER,合い鍵は近々　ポストに入れといてなんて　味気ない最後 どんなに情熱的な恋をしたって　結局終...,0,0,合い鍵 は 近々 ポスト に 入れ とい て なんて 味気 ない 最後 どん...,YUKI
13,I Don't Like Monday,Crazy,Honey I go crazy I wanna touch your body　頭の中は君...,0,0,Honey I go crazy I wanna touch your body 頭 の...,I Don't Like Monday
20,I Don't Like Monday,SING,ヒトは誰もが愛を求めて　この世界に生を受けるなら　おそらく僕はアナタのために　アナタのために...,0,0,ヒト は 誰 も が 愛 を 求め て この 世界 に 生 を 受ける なら おそら...,I Don't Like Monday
26,I Don't Like Monday,Zero Gravity,This is the song for you　This is the song with...,0,0,This is the song for you This is the song wi...,I Don't Like Monday
29,I Don't Like Monday,SO BAD,It was a second before　I'm sure we're so in lo...,0,0,It was a second before I ' m sure we ' re so...,I Don't Like Monday
48,I Don't Like Monday,Fashion,They are walking around have the same face lik...,1,0,They are walking around have the same face lik...,I Don't Like Monday
61,I Don't Like Monday,MEMORIES,"We are treasure, we are forever　共に未来(あす)を描くのさ　...",0,0,"We are treasure , we are forever 共に 未来 ( あす ...",ELLEGADEN
62,I Don't Like Monday,モンスター,この街には「欲望」と言う名の　得体の知れない　魔物が住んでる　腹空かせて　手当たり次第に　街...,0,0,この 街 に は 「 欲望 」 と 言う 名 の 得体 の 知れ ない 魔物 が 住...,YUKI
66,I Don't Like Monday,Lemonade,Today　昨日と同じ日では物足りないよ　I need some more　カラダを濡らすよ...,1,0,Today 昨日 と 同じ 日 で は 物足りない よ I need some mo...,I Don't Like Monday


In [263]:
test_data["artist_pred"] = test_data["pred"].map(map_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [265]:
from sklearn.metrics import accuracy_score

In [266]:
accuracy_score(test_data["artist_true"], test_data["artist_pred"])

0.5833333333333334