# 第7章: 機械学習

本章では, [Stanford Sentiment Treebank (SST)](https://nlp.stanford.edu/sentiment/) データセットを用い, 評判分析器 (ポジネガ分類器) を構築する. ここでは処理を簡略化するため, [General Language Understanding Evaluation (GLUE)](https://gluebenchmark.com/) ベンチマークで配布されているSSTデータセットを用いる.

## 60. データの入手・整形

GLUEのウェブサイトから[SST-2](https://dl.fbaipublicfiles.com/glue/data/SST-2.zip)データセットを取得せよ. 学習データ（`train.tsv`）と検証データ（`dev.tsv`）のそれぞれについて、ポジティブ (1) とネガティブ (0) の事例数をカウントせよ.

In [None]:
# !wget https://dl.fbaipublicfiles.com/glue/data/SST-2.zip
# !unzip SST-2.zip

--2025-04-10 17:12:36--  https://dl.fbaipublicfiles.com/glue/data/SST-2.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘dl.fbaipublicfiles.com’
unzip:  cannot find or open https://dl.fbaipublicfiles.com/glue/data/SST-2.zip, https://dl.fbaipublicfiles.com/glue/data/SST-2.zip.zip or https://dl.fbaipublicfiles.com/glue/data/SST-2.zip.ZIP.


In [None]:
import pandas as pd

# Load both SST-2 splits; the files are tab-separated and the `label`
# column used below is taken from the header row.
train_df = pd.read_table("SST-2/train.tsv", delimiter="\t")
dev_df = pd.read_table("SST-2/dev.tsv", delimiter="\t")

# Single quotes are used inside the f-string replacement fields: reusing the
# enclosing double quote there is a SyntaxError on Python versions < 3.12.
print("# train.tsv")
print(f"Positive: {(train_df['label'] == 1).sum()}")
print(f"Negative: {(train_df['label'] == 0).sum()}")

print("# dev.tsv")
print(f"Positive: {(dev_df['label'] == 1).sum()}")
print(f"Negative: {(dev_df['label'] == 0).sum()}")

## 61. 特徴ベクトル

Bag of Words (BoW) に基づき, 学習データ (train.tsv) および検証データ (dev.tsv) のテキストを特徴ベクトルに変換したい. ここで, ある事例のテキストの特徴ベクトルは, テキスト中に含まれる単語 (スペース区切りのトークン) の出現頻度で構成する. 例えば, "too loud , too goofy"というテキストに対応する特徴ベクトルは, 以下のような辞書オブジェクトで表現される.

```python
{'too': 2, 'loud': 1, ',': 1, 'goofy': 1}
```

各事例はテキスト, 特徴ベクトル, ラベルを格納した辞書オブジェクトでまとめておく. 例えば, 先ほどの"too loud , too goofy"に対してラベル"0" (ネガティブ) が付与された事例は, 以下のオブジェクトで表現される.

```python
{'text': 'too loud , too goofy', 'label': '0', 'feature': {'too': 2, 'loud': 1, ',': 1, 'goofy': 1}}
```

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

def create_BoW_dict(df):
    """Convert every row of ``df`` into a bag-of-words instance.

    Tokens are the whitespace-separated pieces of the ``sentence`` column,
    so punctuation tokens such as ``,`` are counted and case is preserved,
    e.g. ``"too loud , too goofy"`` becomes
    ``{'too': 2, 'loud': 1, ',': 1, 'goofy': 1}``.

    Args:
        df: DataFrame with a ``sentence`` column and a ``label`` column.

    Returns:
        A list of dicts with the keys ``"text"``, ``"label"`` and
        ``"feature"`` (token -> count). Rows whose sentence is not a
        non-blank string (e.g. missing values) are skipped.
    """
    # Local import so this definition does not depend on the cell's imports.
    from collections import Counter

    data = []
    # Iterate over the column values directly instead of ``df[col][i]``:
    # label-based lookup with 0..n-1 fails when the index is not the default.
    for text, label in zip(df["sentence"], df["label"]):
        if not isinstance(text, str) or not text.strip():
            continue

        data.append(
            {
                "text": text,
                "label": label,
                "feature": dict(Counter(text.split())),
            }
        )

    return data

def get_feature_vectors(bow_list, vocab_list=None):
    """Turn bag-of-words instances into a dense count matrix and a label list.

    Args:
        bow_list: Sequence of dicts with a ``"feature"`` mapping
            (token -> count) and a ``"label"`` entry.
        vocab_list: Optional vocabulary that fixes the column order. When
            omitted, the sorted set of all tokens in ``bow_list`` is used.
            Pass the training vocabulary here so that other splits share the
            same columns; tokens missing from it are ignored.

    Returns:
        A tuple ``(X, y, vocab_list)`` where ``X`` is a NumPy array of shape
        ``(len(bow_list), len(vocab_list))``, ``y`` is the list of labels and
        ``vocab_list`` is the vocabulary that defines the columns of ``X``.
    """
    if vocab_list is None:
        vocab_set = set()
        for item in bow_list:
            vocab_set.update(item["feature"])
        vocab_list = sorted(vocab_set)

    word2idx = {word: idx for idx, word in enumerate(vocab_list)}

    X = np.zeros((len(bow_list), len(vocab_list)))
    y = []

    for i, item in enumerate(bow_list):
        for word, count in item["feature"].items():
            idx = word2idx.get(word)
            # Out-of-vocabulary tokens have no column and are dropped.
            if idx is not None:
                X[i, idx] = count
        y.append(item["label"])

    return X, y, vocab_list

# Load both SST-2 splits from the tab-separated files.
train_df = pd.read_table("SST-2/train.tsv", delimiter="\t")
dev_df = pd.read_table("SST-2/dev.tsv", delimiter="\t")

# One {"text", "label", "feature"} dict per usable row.
train_dict = create_BoW_dict(train_df)
dev_dict = create_BoW_dict(dev_df)

X_train, y_train, vocab_train = get_feature_vectors(train_dict)
# NOTE: the dev matrix is built from the dev split's own vocabulary here, so
# its columns follow vocab_dev and do not line up with those of X_train.
X_dev, y_dev, vocab_dev = get_feature_vectors(dev_dict)

# Show the first instance dict. `train_df[0]` would look up a *column*
# labelled 0 in the DataFrame and raise KeyError.
print(train_dict[0])
print(X_train[0])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

def create_BoW_dict(df):
    """Build one bag-of-words instance per row of ``df``.

    A token is any whitespace-separated piece of the ``sentence`` column, so
    punctuation tokens (``,``, ``(``, ``)`` ...) are counted like words and
    case is preserved. For example ``"too loud , too goofy"`` yields
    ``{'too': 2, 'loud': 1, ',': 1, 'goofy': 1}``.

    Args:
        df: DataFrame with a ``sentence`` column and a ``label`` column.

    Returns:
        A list of dicts with the keys ``"text"``, ``"label"`` and
        ``"feature"`` (token -> count). Rows whose sentence is not a
        non-blank string (e.g. missing values) are skipped.
    """
    # Local import so this definition does not depend on the cell's imports.
    from collections import Counter

    data = []
    # Walk the two columns in lockstep rather than indexing with 0..n-1,
    # which is a label lookup and breaks on a non-default index.
    for text, label in zip(df["sentence"], df["label"]):
        if not isinstance(text, str) or not text.strip():
            continue

        token_counts = Counter(text.split())
        data.append(
            {
                "text": text,
                "label": label,
                "feature": dict(token_counts),
            }
        )

    return data

def get_feature_vectors(bow_list, vocab_list=None):
    """Build a dense count matrix and a label list from bag-of-words instances.

    Args:
        bow_list: Sequence of dicts with a ``"feature"`` mapping
            (token -> count) and a ``"label"`` entry.
        vocab_list: Optional vocabulary that fixes the column order. When it
            is ``None`` the sorted set of tokens found in ``bow_list`` is used.

    Returns:
        ``(X, y, vocab_list)``: ``X`` is a NumPy array with one row per
        instance and one column per vocabulary entry, ``y`` is the list of
        labels, and ``vocab_list`` is the vocabulary that was used.
    """
    if vocab_list is None:
        vocab_list = sorted(
            {token for entry in bow_list for token in entry["feature"]}
        )

    column_of = {token: col for col, token in enumerate(vocab_list)}
    X = np.zeros((len(bow_list), len(vocab_list)))

    for row, entry in enumerate(bow_list):
        for token, freq in entry["feature"].items():
            col = column_of.get(token)
            # Tokens outside the vocabulary have no column and are ignored.
            if col is not None:
                X[row, col] = freq

    y = [entry["label"] for entry in bow_list]
    return X, y, vocab_list

# Load both SST-2 splits from the tab-separated files.
train_df = pd.read_table("SST-2/train.tsv", delimiter="\t")
dev_df = pd.read_table("SST-2/dev.tsv", delimiter="\t")

# One {"text", "label", "feature"} dict per usable row of each split.
train_dict, dev_dict = create_BoW_dict(train_df), create_BoW_dict(dev_df)

# The training vocabulary defines the columns; the dev split reuses it so
# both matrices share the same feature space.
X_train, y_train, vocab_train = get_feature_vectors(train_dict)
X_dev, y_dev, _ = get_feature_vectors(dev_dict, vocab_list=vocab_train)

# Fit logistic regression on the training counts, then score the dev split.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_dev)

accuracy = accuracy_score(y_true=y_dev, y_pred=y_pred)
print("Validation Accuracy:", accuracy)
