In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#. モジュールの読み込み
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
#. データの読み込み
df = pd.read_csv("/content/drive/MyDrive/datasienceブログ/221106_lyrics_data.csv", encoding="utf-8")

In [6]:
#. データの確認
df.head(3)

Unnamed: 0,artist,song,lyrics,flg
0,I Don't Like Monday,ダイナマイト,ループしてる同じ　毎日を繰り返し　まるで機械みたい　息を止め動いてる　車の中レディオ　音楽が...,0
1,I Don't Like Monday,愛言葉,出逢った頃とおんなじ様な　馬鹿みたく晴れた空だ　並んで歩いた緑道は　微かに夏の匂いがした ...,1
2,I Don't Like Monday,AITAI,週末の予定はなんにもないよ　あのコとの約束は　キャンセルだし 巷で有名な占いによれば　運気...,0


In [7]:
#. targetの確認
df["flg"].value_counts()

0    484
1     92
Name: flg, dtype: int64

In [8]:
#. artistの種類を確認
df["artist"].unique()

array(["I Don't Like Monday", 'SUPER BEAVER', 'YUKI', 'Galileo Galilei',
       '緑黄色社会', 'ELLEGADEN'], dtype=object)

In [9]:
#. artistを数値化
map_name = {"I Don't Like Monday":0, "SUPER BEAVER":1, "YUKI":2, 'Galileo Galilei':3, '緑黄色社会':4, 'ELLEGADEN':5}
map_idname = {0:"I Don't Like Monday", 1:"SUPER BEAVER'", 2:"YUKI", 3:'Galileo Galilei', 4:'緑黄色社会', 5:'ELLEGADEN'}

df["artist_true"] = df["artist"].map(map_name)
print(df["artist_true"].value_counts())
print(df["artist"].value_counts())

2    170
1    128
3     79
5     75
0     70
4     54
Name: artist_true, dtype: int64
YUKI                   170
SUPER BEAVER           128
Galileo Galilei         79
ELLEGADEN               75
I Don't Like Monday     70
緑黄色社会                   54
Name: artist, dtype: int64


In [10]:
#. test用のデータを取得
tmp = df.groupby("artist_true")
test_df = tmp.apply(lambda x: x.sample(n=10))  #. 各グループから10件取得
test_df = test_df.reset_index(level='artist_true', drop=True)  #. multiindexを解除

In [11]:
#データの確認
test_df.head(3)

Unnamed: 0,artist,song,lyrics,flg,artist_true
6,I Don't Like Monday,美しき世界,泣いていようと　笑ってようと　命という旋律が紡いだ奇妙な世界　嗚呼　皆んな皆んな　踊ってる　...,0,0
22,I Don't Like Monday,スクロール。(Arranged by tofubeats),何もしないでグダグダしてる間に　夏が終わってしまったな　あぁ　今年こそ残すよ　爪痕をって　あ...,0,0
7,I Don't Like Monday,ENTERTAINER,合い鍵は近々　ポストに入れといてなんて　味気ない最後 どんなに情熱的な恋をしたって　結局終...,0,0


In [12]:
#dfからtestデータに含まれていないもののみを取得
test_df["kbn"] = 1  #.  testデータと分かるようにフラグ付け

df = pd.merge(df, test_df[["artist", "song", "kbn"]], on = ["artist", "song"], how = "left")
df_data = df[df["kbn"] != 1]
test_data = df[df["kbn"] == 1]

In [13]:
#不要な列を削除
df_data.drop("kbn", axis=1, inplace=True)
test_data.drop("kbn", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [14]:
#データの中身を確認
df_data["lyrics"]

0      ループしてる同じ　毎日を繰り返し　まるで機械みたい　息を止め動いてる　車の中レディオ　音楽が...
1      出逢った頃とおんなじ様な　馬鹿みたく晴れた空だ　並んで歩いた緑道は　微かに夏の匂いがした　　...
2      週末の予定はなんにもないよ　あのコとの約束は　キャンセルだし　　巷で有名な占いによれば　運気...
3      You have sophisticated eyes　You make me sweete...
4      珍しいね You drink a lot today　彼氏と喧嘩でもしたのかい？　話さなくて...
                             ...                        
571    I am lying You are lying　We keep our backs tur...
572    Wake me up before you leave　I've got an interv...
573    君の手に　上手く馴染むもの　君の目に綺麗に映るもの　それだけでいい　君の手が今も暖かく　君の...
574    She looks smiley　He looks friendly　You maybe n...
575    None of us can help dreaming　Because a dream i...
Name: lyrics, Length: 516, dtype: object

In [15]:
print(df_data["lyrics"][4])
# 空白を確認
print(df_data["lyrics"].str.replace(" ", "●")[4])

珍しいね You drink a lot today　彼氏と喧嘩でもしたのかい？　話さなくても君のことなら　わかってしまう気がしてるよ　　こんな風にさ　二人きりだと　It's gonna be a little problem, don't you think？　だけど僕には　君の近くに　いられる理由が欲しいのさ　　If you want me to take you home　Baby it's up to you　独りでいたくないなら　Of course I'll be with you　この先はアナタ次第　二人の行方は　So you know darling, I've been falling in love　　大丈夫さ He is a good man　君たちはお似合いだとか　思ってもないことばかりを　並べてしまうオトコだけど　　君の仕草や　君の香りが　いちいち胸を締め付けるよ　上辺ばかりの優しさはいらない　今だけは僕をみてよ　　If you want me to take you home　Baby it's up to you　独りでいたくないなら　Of course I'll be with you　この先はアナタ次第　二人の行方は　So you know darling, I've been falling in love　　明日も明後日も来年もこのまま　二人でいられたならさ　Baby, it's up to you　Baby, it's up to you　もう気づいてるだろう？　You know about my feelings for you　　If you want me to take you home　Baby it's up to you　独りでいたくないなら　Of course I'll be with you　この先はアナタ次第　二人の行方は　So you know darling, I've been falling in love　　If you want me to take you home　Baby it's up to you　独りでいたくないなら　Of course I'll be with you　この先はアナタ次第　二人の行方は　So you know darling, I've been falling in

In [46]:
#文字数のカウント
df_data["len"] = df_data["lyrics"].str.len()
test_data["len"] = test_data["lyrics"].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
df_data.groupby("artist_true")["len"].agg(["min","mean","max"]) 

Unnamed: 0_level_0,min,mean,max
artist_true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,342,918.866667,2175
1,183,548.110169,839
2,147,498.5125,1262
3,139,424.072464,714
4,262,527.409091,920
5,199,923.723077,1922


YUKI、ガリレオガリレイは短いと100文字代、長くても７14文字と文字数が少ない。
一方でI dont like mondayは長いのは平均が一番高く、最大値も2,000以上となっている。

In [18]:
#. 空白のカウント
df_data["len_space"] = df_data["lyrics"].str.len() - df_data["lyrics"].str.replace(" ","").str.len()
df_data.groupby("artist_true")["len_space"].agg(["min","mean","max"]) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,min,mean,max
artist_true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,105.416667,350
1,0,1.025424,43
2,0,6.9,105
3,0,4.246377,41
4,0,1.590909,40
5,0,148.369231,331


In [47]:
test_data["len_space"] = test_data["lyrics"].str.len() - test_data["lyrics"].str.replace(" ","").str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


英語を歌詞に使用することが多い0(i dont like monday)と5(ELLEGADEN)のスペース数が他と比べて圧倒的に多い

分かち書き

In [19]:
!apt-get install -y mecab libmecab-dev mecab-ipadic-utf8
!pip install mecab-python3

import os
os.environ['MECABRC']= "/etc/mecabrc"

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libmecab2 mecab-ipadic mecab-jumandic mecab-jumandic-utf8 mecab-utils
The following NEW packages will be installed:
  libmecab-dev libmecab2 mecab mecab-ipadic mecab-ipadic-utf8 mecab-jumandic
  mecab-jumandic-utf8 mecab-utils
0 upgraded, 8 newly installed, 0 to remove and 5 not upgraded.
Need to get 29.0 MB of archives.
After this operation, 277 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libmecab2 amd64 0.996-5 [257 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libmecab-dev amd64 0.996-5 [308 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 mecab-utils amd64 0.996-5 [4,856 B]
Get:4 http://archive.ubuntu.com/ubu

In [20]:
import MeCab

In [21]:
wakati = MeCab.Tagger("-Owakati")

In [22]:
# 歌詞を分かち書き
df_data["lyrics_wakati"] = df_data["lyrics"].apply(lambda x: wakati.parse(x).replace("\n",""))
df_data["lyrics_wakati"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0      ループ し てる 同じ 　 毎日 を 繰り返し 　 まるで 機械 みたい 　 息 を 止め ...
1      出逢っ た 頃 と おんなじ 様 な 　 馬鹿 み たく 晴れ た 空 だ 　 並ん で 歩...
2      週末 の 予定 は なんにも ない よ 　 あの コ と の 約束 は 　 キャンセル だ ...
3      You have sophisticated eyes 　 You make me swee...
4      珍しい ね You drink a lot today 　 彼氏 と 喧嘩 で も し た ...
                             ...                        
571    I am lying You are lying 　 We keep our backs t...
572    Wake me up before you leave 　 I ' ve got an in...
573    君 の 手 に 　 上手く 馴染む もの 　 君 の 目 に 綺麗 に 映る もの 　 それ...
574    She looks smiley 　 He looks friendly 　 You may...
575    None of us can help dreaming 　 Because a dream...
Name: lyrics_wakati, Length: 516, dtype: object

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer as TV
from sklearn.decomposition import TruncatedSVD

In [24]:
tv = TV()
lyrics_train = tv.fit_transform(df_data["lyrics_wakati"])
lyrics_train = pd.DataFrame(lyrics_train.toarray(), columns=tv.get_feature_names())

print(lyrics_train.shape)
lyrics_train.head()

(516, 9665)




Unnamed: 0,00,090,10,100,12,13,14,15,153,17,...,黒髪,黙っ,黙ら,鼓動,鼓膜,鼻先,鼻声,鼻歌,齧っ,齧ら
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210504,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
#頻出単語の確認
frequency =  np.array(lyrics_train.sum(axis=0))

for i in frequency.argsort()[:-10:-1]:
  print(frequency[i], tv.get_feature_names()[i])

37.79588803684216 ない
23.841626512163025 you
15.209032189355405 から
15.20381235709324 あなた
14.658532818954459 たい
13.645994764772967 てる
12.760109502150637 いる
12.682436353999085 こと
12.18459622401753 よう




上位１０位の頻出単語を見るとyouやあなたなど肌感覚と合う単語が出ている

In [26]:
lyrics_train["artist"] = df_data["artist_true"] 

In [27]:
#I dont like monday頻出単語の確認
frequency =  np.array(lyrics_train[lyrics_train["artist"]==0].sum(axis=0))

for i in frequency.argsort()[:-20:-1]:
  print(frequency[i], tv.get_feature_names()[i])

8.412810502706172 you
4.16084430183486 baby
3.1617483389018717 ない
3.156612740937395 love
2.8631588426823567 your
2.592241914322691 me
2.308417629141438 so
2.162831189797345 oh
2.0767645128116445 be
1.974254553532546 for
1.947515387132931 から
1.8849288492190257 it
1.849340706858607 the
1.8010809237935028 アナタ
1.77315904109869 いる
1.7568697304923349 たい
1.7555450415081757 are
1.7531075384841819 この
1.6037396905056553 is


you、baby、loveなどが頻出

In [28]:
#super beaber頻出単語の確認
pd.DataFrame(data = lyrics_train[lyrics_train["artist"]==1].sum(axis=0), columns=["count"]).sort_values(by="count", ascending=False).head(20)

Unnamed: 0,count
artist,118.0
ない,11.307794
こと,5.501407
あなた,4.973046
じゃ,4.473761
てる,4.463725
たい,4.409963
ある,3.990893
いる,3.645399
から,3.63994


In [29]:
#. ガリレオガリレイの頻出単語
pd.DataFrame(data = lyrics_train[lyrics_train["artist"]==3].sum(axis=0), columns=["count"]).sort_values(by="count", ascending=False).head(20)

Unnamed: 0,count
artist,207.0
ない,7.286192
あなた,3.148144
だけ,3.056846
いい,2.743734
から,2.729703
僕ら,2.46404
もう,2.276413
こと,2.212121
たい,2.133289


In [30]:
#. エルレガーデンの頻出単語
pd.DataFrame(data = lyrics_train[lyrics_train["artist"]==4].sum(axis=0), columns=["count"]).sort_values(by="count", ascending=False).head(20)

Unnamed: 0,count
artist,176.0
you,6.462389
the,4.353315
to,4.285791
it,3.977728
we,2.848886
can,2.789417
me,2.774571
my,2.645147
is,2.618641


In [31]:
#YUKIの頻出単語の確認
pd.DataFrame(data = lyrics_train[lyrics_train["artist"]==2].sum(axis=0), columns=["count"]).sort_values(by="count", ascending=False).head(20)

Unnamed: 0,count
artist,320.0
ない,11.611677
たい,4.928593
から,4.785912
てる,4.403975
よう,4.167939
あなた,4.036806
いる,3.563503
なら,3.265202
そう,3.116615


上位１０位では「ない、から」など基本的な単語が並ぶ

In [31]:
lyrics_train["len"] = df_data["len"]
lyrics_train["len_space"] = df_data["len_space"]

In [32]:
#. 学習データを作成
x = lyrics_train.drop("artist", axis=1) #. ベクトル化したデータ
y = df_data["artist"] #. 数値化した目的変数

In [33]:
#まずはホールドアウト法を使ってみる
from sklearn.model_selection import train_test_split

#test_size=0.2
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=0)

In [34]:
#lightgbm用にデータセットを作成
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

In [35]:
#欠損値があるデータを確認
#ra_row = x.isna().any(axis=1)
#lyrics_train.loc[ra_row, :]

In [36]:
#学習

# 学習条件を設定
model = lgb.LGBMClassifier(objective='multiclass')

result = model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="multi_logloss")

[1]	valid_0's multi_logloss: 1.60112	valid_0's multi_logloss: 1.60112
[2]	valid_0's multi_logloss: 1.52932	valid_0's multi_logloss: 1.52932
[3]	valid_0's multi_logloss: 1.47799	valid_0's multi_logloss: 1.47799
[4]	valid_0's multi_logloss: 1.42616	valid_0's multi_logloss: 1.42616
[5]	valid_0's multi_logloss: 1.39101	valid_0's multi_logloss: 1.39101
[6]	valid_0's multi_logloss: 1.35283	valid_0's multi_logloss: 1.35283
[7]	valid_0's multi_logloss: 1.3244	valid_0's multi_logloss: 1.3244
[8]	valid_0's multi_logloss: 1.2999	valid_0's multi_logloss: 1.2999
[9]	valid_0's multi_logloss: 1.27682	valid_0's multi_logloss: 1.27682
[10]	valid_0's multi_logloss: 1.25112	valid_0's multi_logloss: 1.25112
[11]	valid_0's multi_logloss: 1.22999	valid_0's multi_logloss: 1.22999
[12]	valid_0's multi_logloss: 1.21077	valid_0's multi_logloss: 1.21077
[13]	valid_0's multi_logloss: 1.18832	valid_0's multi_logloss: 1.18832
[14]	valid_0's multi_logloss: 1.17215	valid_0's multi_logloss: 1.17215
[15]	valid_0's mult

In [37]:
# 歌詞を分かち書き
test_data["lyrics_wakati"] = test_data["lyrics"].apply(lambda x: wakati.parse(x).replace("\n",""))
test_data["lyrics_wakati"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


6      泣い て いよ う と 　 笑っ てよ う と 　 命 という 旋律 が 紡い だ 奇妙 な...
7      合い鍵 は 近々 　 ポスト に 入れ とい て なんて 　 味気 ない 最後 　 　 どん...
12     外 を 出 たら 突然 に 　 今年 初めて みる 雪 　 懐かしく て 切ない よう な ...
22     何 も し ない で グダグダ し てる 間 に 　 夏 が 終わっ て しまっ た な 　...
31     We found our light in the brightest night 　 貴方...
37     It ' s Friday night 　 On the Internet 　 Everyo...
54     Please , babe 　 Come closer to my side and hol...
56     本当は スパイ 映画 の 主役 みたい な 　 キザ で ロマンティック な 台詞 考え て...
58     冷静 さ 保っ て 　 クレバー に いこ う YO 　 ハイ ブランド まとっ て 　 カ...
61     We are treasure , we are forever 　 共に 未来 ( あす ...
87     ああ 　 楽 で は ない 日々 の 　 隙間 に それ が 一筋 でも 　 嬉しい 涙 が...
130    答え 合わ せ 　 う や むや に 　 味 の し ない 恋 を 噛ん で た 　 我慢 ...
135    朝 見 た テレビ に よれ ば 　 今日 は にわか雨 が 降る ん だって !?　 ど ...
143    はい 皆さん 一 日 　 なんとなく 過ごし てる かい ?　 うん 、 確か に 俺 も ...
152    誰 か が 丁寧 に 用意 し て くれ た 　 『 個性 』 という 名前 の 分厚い 教...
162    好き な こと 　 好き な 人 　 大切 に し てる こだわり 　 胸 を 張っ て 口...
167    「 他 に 何 も 要ら ない 」 　 よく 見りゃ 紙一重 の 強 さ で 　 追い求め ...
173    今 一 度 声 を 嗄らし て 　 響かせる よ

In [39]:
#textデータ
lyrics_test = tv.transform(test_data["lyrics_wakati"])
lyrics_test = pd.DataFrame(lyrics_test.toarray(), columns=tv.get_feature_names())

print(lyrics_test.shape)
lyrics_test.head()

(60, 9665)




Unnamed: 0,00,090,10,100,12,13,14,15,153,17,...,黒髪,黙っ,黙ら,鼓動,鼓膜,鼻先,鼻声,鼻歌,齧っ,齧ら
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
#テストデータで予測をする
y_pred = model.predict(lyrics_test)

In [41]:
test_data["pred"] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [42]:
test_data

Unnamed: 0,artist,song,lyrics,flg,artist_true,lyrics_wakati,pred
6,I Don't Like Monday,美しき世界,泣いていようと　笑ってようと　命という旋律が紡いだ奇妙な世界　嗚呼　皆んな皆んな　踊ってる　...,0,0,泣い て いよ う と 笑っ てよ う と 命 という 旋律 が 紡い だ 奇妙 な...,Galileo Galilei
7,I Don't Like Monday,ENTERTAINER,合い鍵は近々　ポストに入れといてなんて　味気ない最後 どんなに情熱的な恋をしたって　結局終...,0,0,合い鍵 は 近々 ポスト に 入れ とい て なんて 味気 ない 最後 どん...,YUKI
12,I Don't Like Monday,gift,外を出たら突然に　今年初めてみる雪　懐かしくて切ないような匂い 恥ずかしいから街中で　子供...,0,0,外 を 出 たら 突然 に 今年 初めて みる 雪 懐かしく て 切ない よう な ...,SUPER BEAVER
22,I Don't Like Monday,スクロール。(Arranged by tofubeats),何もしないでグダグダしてる間に　夏が終わってしまったな　あぁ　今年こそ残すよ　爪痕をって　あ...,0,0,何 も し ない で グダグダ し てる 間 に 夏 が 終わっ て しまっ た な ...,SUPER BEAVER
31,I Don't Like Monday,DIAMOND,We found our light in the brightest night　貴方と見...,0,0,We found our light in the brightest night 貴方...,I Don't Like Monday
37,I Don't Like Monday,Do Ya?,It's Friday night　On the Internet　Everyone is ...,0,0,It ' s Friday night On the Internet Everyo...,I Don't Like Monday
54,I Don't Like Monday,PLEASE,"Please, babe　Come closer to my side and hold m...",0,0,"Please , babe Come closer to my side and hol...",ELLEGADEN
56,I Don't Like Monday,HONNE,本当はスパイ映画の主役みたいな　キザでロマンティックな台詞考えてた 君を初めて招いたこの狭...,1,0,本当は スパイ 映画 の 主役 みたい な キザ で ロマンティック な 台詞 考え て...,SUPER BEAVER
58,I Don't Like Monday,MR.CLEVER,冷静さ保って　クレバーにいこうYO　ハイブランドまとって　カッコつけたいね　あれもこれもって...,1,0,冷静 さ 保っ て クレバー に いこ う YO ハイ ブランド まとっ て カ...,YUKI
61,I Don't Like Monday,MEMORIES,"We are treasure, we are forever　共に未来(あす)を描くのさ　...",0,0,"We are treasure , we are forever 共に 未来 ( あす ...",I Don't Like Monday


In [43]:
test_data["artist_pred"] = test_data["pred"].map(map_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [44]:
from sklearn.metrics import accuracy_score

In [45]:
accuracy_score(test_data["artist_true"], test_data["artist_pred"])

0.6

In [None]:
#特徴量を追加

In [48]:
lyrics_train["len"] = df_data["len"]
lyrics_train["len_space"] = df_data["len_space"]

In [49]:
#. 学習データを作成
x = lyrics_train.drop("artist", axis=1) #. ベクトル化したデータ
y = df_data["artist"] #. 数値化した目的変数

In [50]:
#まずはホールドアウト法を使ってみる
from sklearn.model_selection import train_test_split

#test_size=0.2
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=0)

In [51]:
#lightgbm用にデータセットを作成
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

In [52]:
#学習

# 学習条件を設定
model = lgb.LGBMClassifier(objective='multiclass')

result = model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="multi_logloss")

[1]	valid_0's multi_logloss: 1.61094	valid_0's multi_logloss: 1.61094
[2]	valid_0's multi_logloss: 1.54067	valid_0's multi_logloss: 1.54067
[3]	valid_0's multi_logloss: 1.48249	valid_0's multi_logloss: 1.48249
[4]	valid_0's multi_logloss: 1.43636	valid_0's multi_logloss: 1.43636
[5]	valid_0's multi_logloss: 1.39605	valid_0's multi_logloss: 1.39605
[6]	valid_0's multi_logloss: 1.35597	valid_0's multi_logloss: 1.35597
[7]	valid_0's multi_logloss: 1.32466	valid_0's multi_logloss: 1.32466
[8]	valid_0's multi_logloss: 1.29953	valid_0's multi_logloss: 1.29953
[9]	valid_0's multi_logloss: 1.2704	valid_0's multi_logloss: 1.2704
[10]	valid_0's multi_logloss: 1.24184	valid_0's multi_logloss: 1.24184
[11]	valid_0's multi_logloss: 1.21541	valid_0's multi_logloss: 1.21541
[12]	valid_0's multi_logloss: 1.19596	valid_0's multi_logloss: 1.19596
[13]	valid_0's multi_logloss: 1.17072	valid_0's multi_logloss: 1.17072
[14]	valid_0's multi_logloss: 1.15005	valid_0's multi_logloss: 1.15005
[15]	valid_0's mu

In [54]:
lyrics_test["len"] = test_data["len"]
lyrics_test["len_space"] = test_data["len_space"]

In [55]:
#テストデータで予測をする
y_pred = model.predict(lyrics_test)

In [56]:
test_data["artist_pred"] = test_data["pred"].map(map_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [57]:
accuracy_score(test_data["artist_true"], test_data["artist_pred"])

0.6

In [58]:
test_data

Unnamed: 0,artist,song,lyrics,flg,artist_true,lyrics_wakati,pred,artist_pred,len,len_space
6,I Don't Like Monday,美しき世界,泣いていようと　笑ってようと　命という旋律が紡いだ奇妙な世界　嗚呼　皆んな皆んな　踊ってる　...,0,0,泣い て いよ う と 笑っ てよ う と 命 という 旋律 が 紡い だ 奇妙 な...,Galileo Galilei,3,501,0
7,I Don't Like Monday,ENTERTAINER,合い鍵は近々　ポストに入れといてなんて　味気ない最後 どんなに情熱的な恋をしたって　結局終...,0,0,合い鍵 は 近々 ポスト に 入れ とい て なんて 味気 ない 最後 どん...,YUKI,2,406,13
12,I Don't Like Monday,gift,外を出たら突然に　今年初めてみる雪　懐かしくて切ないような匂い 恥ずかしいから街中で　子供...,0,0,外 を 出 たら 突然 に 今年 初めて みる 雪 懐かしく て 切ない よう な ...,SUPER BEAVER,1,521,15
22,I Don't Like Monday,スクロール。(Arranged by tofubeats),何もしないでグダグダしてる間に　夏が終わってしまったな　あぁ　今年こそ残すよ　爪痕をって　あ...,0,0,何 も し ない で グダグダ し てる 間 に 夏 が 終わっ て しまっ た な ...,SUPER BEAVER,1,456,0
31,I Don't Like Monday,DIAMOND,We found our light in the brightest night　貴方と見...,0,0,We found our light in the brightest night 貴方...,I Don't Like Monday,0,899,105
37,I Don't Like Monday,Do Ya?,It's Friday night　On the Internet　Everyone is ...,0,0,It ' s Friday night On the Internet Everyo...,I Don't Like Monday,0,1911,302
54,I Don't Like Monday,PLEASE,"Please, babe　Come closer to my side and hold m...",0,0,"Please , babe Come closer to my side and hol...",ELLEGADEN,5,1410,238
56,I Don't Like Monday,HONNE,本当はスパイ映画の主役みたいな　キザでロマンティックな台詞考えてた 君を初めて招いたこの狭...,1,0,本当は スパイ 映画 の 主役 みたい な キザ で ロマンティック な 台詞 考え て...,SUPER BEAVER,1,466,0
58,I Don't Like Monday,MR.CLEVER,冷静さ保って　クレバーにいこうYO　ハイブランドまとって　カッコつけたいね　あれもこれもって...,1,0,冷静 さ 保っ て クレバー に いこ う YO ハイ ブランド まとっ て カ...,YUKI,2,971,70
61,I Don't Like Monday,MEMORIES,"We are treasure, we are forever　共に未来(あす)を描くのさ　...",0,0,"We are treasure , we are forever 共に 未来 ( あす ...",I Don't Like Monday,0,1382,179
