In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#. モジュールの読み込み
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#. データの読み込み
df = pd.read_csv("/content/drive/MyDrive/datasienceブログ/221106_lyrics_data.csv", encoding="utf-8")

In [4]:
#. データの確認
df.head(3)

Unnamed: 0,artist,song,lyrics,flg
0,I Don't Like Monday,ダイナマイト,ループしてる同じ　毎日を繰り返し　まるで機械みたい　息を止め動いてる　車の中レディオ　音楽が...,0
1,I Don't Like Monday,愛言葉,出逢った頃とおんなじ様な　馬鹿みたく晴れた空だ　並んで歩いた緑道は　微かに夏の匂いがした ...,1
2,I Don't Like Monday,AITAI,週末の予定はなんにもないよ　あのコとの約束は　キャンセルだし 巷で有名な占いによれば　運気...,0


In [5]:
#. targetの確認
df["flg"].value_counts()

0    484
1     92
Name: flg, dtype: int64

In [6]:
#. artistの種類を確認
df["artist"].unique()

array(["I Don't Like Monday", 'SUPER BEAVER', 'YUKI', 'Galileo Galilei',
       '緑黄色社会', 'ELLEGADEN'], dtype=object)

In [7]:
#. artistを数値化
map_name = {"I Don't Like Monday":0, "SUPER BEAVER":1, "YUKI":2, 'Galileo Galilei':3, '緑黄色社会':4, 'ELLEGADEN':5}
map_idname = {0:"I Don't Like Monday", 1:"SUPER BEAVER'", 2:"YUKI", 3:'Galileo Galilei', 4:'緑黄色社会', 5:'ELLEGADEN'}

df["artist_true"] = df["artist"].map(map_name)
print(df["artist_true"].value_counts())
print(df["artist"].value_counts())

2    170
1    128
3     79
5     75
0     70
4     54
Name: artist_true, dtype: int64
YUKI                   170
SUPER BEAVER           128
Galileo Galilei         79
ELLEGADEN               75
I Don't Like Monday     70
緑黄色社会                   54
Name: artist, dtype: int64


In [8]:
#. test用のデータを取得
tmp = df.groupby("artist_true")
test_df = tmp.apply(lambda x: x.sample(n=10))  #. 各グループから10件取得
test_df = test_df.reset_index(level='artist_true', drop=True)  #. multiindexを解除

In [9]:
#データの確認
test_df.head(3)

Unnamed: 0,artist,song,lyrics,flg,artist_true
31,I Don't Like Monday,DIAMOND,We found our light in the brightest night　貴方と見...,0,0
2,I Don't Like Monday,AITAI,週末の予定はなんにもないよ　あのコとの約束は　キャンセルだし 巷で有名な占いによれば　運気...,0,0
46,I Don't Like Monday,FIRE,カラダ中溶けてしまいそうな感覚は初めて　Oh girl 君はその気にさせるのが上手いね S...,1,0


In [10]:
#dfからtestデータに含まれていないもののみを取得
test_df["kbn"] = 1  #.  testデータと分かるようにフラグ付け

df = pd.merge(df, test_df[["artist", "song", "kbn"]], on = ["artist", "song"], how = "left")
df_data = df[df["kbn"] != 1]
test_data = df[df["kbn"] == 1]

In [11]:
#不要な列を削除
df_data.drop("kbn", axis=1, inplace=True)
test_data.drop("kbn", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [12]:
#データの中身を確認
df_data["lyrics"]

0      ループしてる同じ　毎日を繰り返し　まるで機械みたい　息を止め動いてる　車の中レディオ　音楽が...
1      出逢った頃とおんなじ様な　馬鹿みたく晴れた空だ　並んで歩いた緑道は　微かに夏の匂いがした　　...
3      You have sophisticated eyes　You make me sweete...
5      Maybe it's all right, We can dance all night　あ...
6      泣いていようと　笑ってようと　命という旋律が紡いだ奇妙な世界　嗚呼　皆んな皆んな　踊ってる　...
                             ...                        
571    I am lying You are lying　We keep our backs tur...
572    Wake me up before you leave　I've got an interv...
573    君の手に　上手く馴染むもの　君の目に綺麗に映るもの　それだけでいい　君の手が今も暖かく　君の...
574    She looks smiley　He looks friendly　You maybe n...
575    None of us can help dreaming　Because a dream i...
Name: lyrics, Length: 516, dtype: object

In [15]:
print(df_data["lyrics"][1])
# 空白を確認
print(df_data["lyrics"].str.replace(" ", "●")[1])

出逢った頃とおんなじ様な　馬鹿みたく晴れた空だ　並んで歩いた緑道は　微かに夏の匂いがした　　現実は無情にも　僕らに押し寄せてくるけど　こうしてさ　いつまでも　二人で歩いていけるなら　　君が好きだとかなんて　柄でもないし言えないから　代わりに鼻歌を唄って　空でも見上げているんだよ　「やけにご機嫌な感じね」　そう君が笑ってくれたのなら　見慣れたこの風景だって　愛しさが溢れ出すのさ　　時にお互いの価値観や　未熟な気立てのせいで　心にもない言葉を言って　傷つけ合う日もあるけど　　「君」と呼ぶ　温もりに　触れてみようとするだけでも　戸惑いや恐れなど　効力を無くしてしまうんだ　　夏が終わるまでにきっとあの頃みたいに海へいこう　砂浜にでも寝転んで　二人未来の話をしよう　子供は何人欲しいとか　年老いたらどこに住みたいとか　まだ気が早いかもなって　君と笑い合いたいのさ　　悲しんだり　喜んだり　抱き合ったり　汚し合ったり　つまづいたり　転んだりもしながら　二人で探すのさ　僕らの合言葉　　君が好きだとかなんて　柄でもないし言えないから　代わりに鼻歌を唄って　空でも見上げているんだよ　僕の隣に君がいて　その君の隣に僕がいる　そんな単純な日常が　愛の言葉の意味なのかな
出逢った頃とおんなじ様な　馬鹿みたく晴れた空だ　並んで歩いた緑道は　微かに夏の匂いがした　　現実は無情にも　僕らに押し寄せてくるけど　こうしてさ　いつまでも　二人で歩いていけるなら　　君が好きだとかなんて　柄でもないし言えないから　代わりに鼻歌を唄って　空でも見上げているんだよ　「やけにご機嫌な感じね」　そう君が笑ってくれたのなら　見慣れたこの風景だって　愛しさが溢れ出すのさ　　時にお互いの価値観や　未熟な気立てのせいで　心にもない言葉を言って　傷つけ合う日もあるけど　　「君」と呼ぶ　温もりに　触れてみようとするだけでも　戸惑いや恐れなど　効力を無くしてしまうんだ　　夏が終わるまでにきっとあの頃みたいに海へいこう　砂浜にでも寝転んで　二人未来の話をしよう　子供は何人欲しいとか　年老いたらどこに住みたいとか　まだ気が早いかもなって　君と笑い合いたいのさ　　悲しんだり　喜んだり　抱き合ったり　汚し合ったり　つまづいたり　転んだりもしながら　二人で探すのさ　僕らの合言葉　　君が好きだとかなんて　柄でもないし言えないから　代わりに鼻歌

In [16]:
#文字数のカウント
df_data["len"] = df_data["lyrics"].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
df_data.groupby("artist_true")["len"].agg(["min","mean","max"]) 

Unnamed: 0_level_0,min,mean,max
artist_true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,342,928.516667,2175
1,183,545.177966,839
2,147,499.1625,1262
3,100,419.449275,714
4,358,528.045455,667
5,199,907.076923,1715


YUKI、ガリレオガリレイは短いと100文字代、長くても７14文字と文字数が少ない。
一方でI dont like mondayは長いのは平均が一番高く、最大値も2,000以上となっている。

英語を歌詞に使用することが多い0(i dont like monday)と5(ELLEGADEN)のスペース数が他と比べて圧倒的に多い

In [18]:
#malti_bert

In [19]:
!pip install transformers
import transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 60.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 91.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [20]:
import torch

In [21]:
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

In [22]:
if torch.cuda.is_available():
    print("GPU")
    !nvidia-smi
else:
    print("not GPU")

GPU
Sat Nov 26 05:26:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    46W / 400W |      3MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Pr

In [23]:
#トークナイザの読み込み
model="bert-base-multilingual-cased"

!pip install fugashi
import fugashi
!pip install ipadic
import ipadic

!pip install sentencepiece
import sentencepiece

tokenizer = AutoTokenizer.from_pretrained(model)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fugashi
  Downloading fugashi-1.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (583 kB)
[K     |████████████████████████████████| 583 kB 4.9 MB/s 
[?25hInstalling collected packages: fugashi
Successfully installed fugashi-1.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipadic
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 5.0 MB/s 
[?25hBuilding wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-any.whl size=13556723 sha256=abc1c2e458c177d06d3315531cbd8e413b15e6096ddc72133c087956fd202f9a
  Stored in directory: /root/.cache/pip/wheels/33/8b/99/cf0d27191876637cd3639a560f93aa982d7855ce826c94348b
Successfully built ipadic
Installing collected pac

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [24]:
#トークナイザを用いて文章を符号化
encoding = tokenizer.batch_encode_plus(
                    df_data["lyrics"].tolist(),
                    max_length = 512,
                    padding = "max_length",
                    truncation=True,
                    return_tensors="pt"
)

encoding

{'input_ids': tensor([[  101,  2058, 71247,  ...,     0,     0,     0],
        [  101,  2527,  7741,  ...,     0,     0,     0],
        [  101, 11065, 10529,  ...,     0,     0,     0],
        ...,
        [  101,  2783,  1946,  ...,     0,     0,     0],
        [  101, 11149, 59148,  ...,     0,     0,     0],
        [  101, 86481, 10108,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [25]:
#Bertmodrlのロード
bert = BertForSequenceClassification.from_pretrained(model, num_labels=6)

#debartをGPUに載せる
bert = bert.cuda()

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [26]:
#モデルの概要確認
print(bert.config)

BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version

num_hidden_layers レイヤー数は24
hidden_size 出力は1024次元
max_position_embeddings 最大で入力できるトークン列の長さ

In [27]:
encoding = {k: v.cuda() for k, v in encoding.items()}
labels = torch.tensor(df_data["artist_true"].values).cuda()

In [28]:
with torch.no_grad():
  output = bert.forward(**encoding)
scores = output.logits #分類スコア
label_predication = scores.argmax(-1) #スコアが一番高いラベル
num_correct = (label_predication==labels).sum().item() #正解数
accuracy = num_correct/labels.size(0) #精度

In [29]:
print("predicted labels:")
print(label_predication)
print("accuracy:")
print(accuracy)

predicted labels:
tensor([5, 0, 1, 4, 2, 0, 0, 0, 2, 0, 1, 0, 2, 2, 1, 0, 2, 2, 2, 5, 0, 2, 4, 2,
        0, 2, 0, 1, 2, 1, 1, 5, 2, 2, 2, 0, 5, 2, 0, 0, 4, 1, 0, 0, 0, 1, 0, 0,
        2, 2, 1, 2, 5, 0, 0, 2, 2, 2, 0, 1, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,
        1, 5, 0, 2, 1, 2, 1, 1, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 0, 0, 2, 2, 5, 2,
        0, 2, 0, 0, 0, 0, 2, 2, 1, 2, 1, 4, 0, 5, 5, 1, 0, 0, 1, 2, 1, 0, 0, 2,
        2, 1, 2, 0, 2, 2, 2, 2, 0, 2, 0, 0, 1, 0, 1, 2, 5, 1, 2, 2, 2, 2, 5, 1,
        2, 2, 2, 2, 2, 5, 2, 0, 2, 5, 2, 0, 2, 0, 2, 2, 1, 2, 0, 5, 2, 0, 2, 0,
        2, 2, 2, 2, 2, 0, 2, 2, 5, 2, 2, 0, 5, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 0,
        5, 0, 0, 2, 2, 2, 5, 0, 0, 2, 2, 0, 0, 5, 0, 0, 2, 5, 0, 2, 0, 0, 0, 2,
        0, 0, 5, 5, 4, 1, 0, 0, 5, 2, 0, 2, 2, 5, 5, 0, 2, 0, 4, 4, 0, 2, 2, 0,
        2, 0, 1, 0, 0, 5, 4, 0, 2, 2, 5, 2, 2, 0, 2, 2, 0, 2, 2, 0, 5, 2, 0, 2,
        0, 0, 5, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 5, 2, 2, 0, 0, 0, 2,
        0, 2, 2, 0, 2,