In [5]:
pip install torch torchvision

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix

train_df = pd.read_csv("train.csv")
valid_df = pd.read_csv("valid.csv")
test_df  = pd.read_csv("test.csv")

print(train_df.head())
print(train_df["label"].value_counts())


                                                text  label
0  states slow to shut down weak teacher educatio...      0
1    drone places fresh kill on steps of white house      1
2  report: majority of instances of people gettin...      1
3  sole remaining lung filled with rich, satisfyi...      1
4                       the gop's stockholm syndrome      0
0    11248
1    10216
Name: label, dtype: int64


In [7]:
# 用 CountVectorizer 拿词表（只在 train 上 fit）
max_features = 20000  # 词表上限，可以调大
vectorizer = CountVectorizer(max_features=max_features, analyzer="word")
vectorizer.fit(train_df["text"])

# 建 word2idx 字典
word2idx = {word: idx + 2 for idx, word in enumerate(vectorizer.get_feature_names_out())}
word2idx["<pad>"] = 0
word2idx["<unk>"] = 1

vocab_size = len(word2idx)
print("vocab_size =", vocab_size)


vocab_size = 20002


In [8]:
max_len = 40  # headline 很短，40 足够

def encode(sentence):
    # 按空格切词
    tokens = sentence.lower().split()
    ids = [word2idx.get(w, word2idx["<unk>"]) for w in tokens]
    
    if len(ids) > max_len:
        ids = ids[:max_len]
    else:
        ids += [word2idx["<pad>"]] * (max_len - len(ids))
    
    return torch.tensor(ids, dtype=torch.long)


In [9]:
X_train = torch.stack([encode(t) for t in train_df["text"]])
X_valid = torch.stack([encode(t) for t in valid_df["text"]])
X_test  = torch.stack([encode(t) for t in test_df["text"]])

y_train = torch.tensor(train_df["label"].values, dtype=torch.long)
y_valid = torch.tensor(valid_df["label"].values, dtype=torch.long)
y_test  = torch.tensor(test_df["label"].values, dtype=torch.long)

print(X_train.shape, y_train.shape)


torch.Size([21464, 40]) torch.Size([21464])


In [10]:
batch_size = 64

train_ds = TensorDataset(X_train, y_train)
valid_ds = TensorDataset(X_valid, y_valid)
test_ds  = TensorDataset(X_test,  y_test)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=batch_size)
test_loader  = DataLoader(test_ds,  batch_size=batch_size)


In [11]:
def evaluate(model, loader):
    model.eval()
    preds, labels = [], []
    correct, total = 0, 0
    
    with torch.no_grad():
        for X_batch, y_batch in loader:
            outputs = model(X_batch)
            pred = outputs.argmax(dim=1)
            
            preds.extend(pred.tolist())
            labels.extend(y_batch.tolist())
            
            correct += (pred == y_batch).sum().item()
            total += len(y_batch)
    
    acc = correct / total
    return preds, labels, acc


In [12]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim * 2, 2)  # 二分类输出 2 维 logits

    def forward(self, x):
        # x: (batch, max_len)
        x = self.embedding(x)        # (batch, max_len, embed_dim)
        _, (h, _) = self.lstm(x)     # h: (num_layers*2, batch, hidden_dim)
        # 取双向最后一层的两个方向的 hidden
        h_fwd = h[-2]                # (batch, hidden_dim)
        h_bwd = h[-1]                # (batch, hidden_dim)
        out = torch.cat([h_fwd, h_bwd], dim=1)  # (batch, 2*hidden_dim)
        out = self.dropout(out)
        out = self.fc(out)           # (batch, 2)
        return out


In [13]:
lstm_model = LSTMClassifier(vocab_size=vocab_size, embed_dim=100, hidden_dim=64)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=1e-3)


In [14]:
best_val_acc = 0.0
patience = 2
wait = 0

num_epochs = 10

for epoch in range(num_epochs):
    lstm_model.train()
    epoch_loss = 0.0
    
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        
        outputs = lstm_model(X_batch)
        loss = criterion(outputs, y_batch)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    # 每个 epoch 后在 valid 上评估
    preds, labels, val_acc = evaluate(lstm_model, valid_loader)
    print(f"[LSTM] Epoch {epoch+1}/{num_epochs}, "
          f"train_loss={epoch_loss:.3f}, val_acc={val_acc:.4f}")
    
    # early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_lstm_state = lstm_model.state_dict()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping triggered for LSTM!")
            break

# 加载验证集上表现最好的参数
lstm_model.load_state_dict(best_lstm_state)


[LSTM] Epoch 1/10, train_loss=182.140, val_acc=0.7989
[LSTM] Epoch 2/10, train_loss=134.067, val_acc=0.8128
[LSTM] Epoch 3/10, train_loss=102.290, val_acc=0.8017
[LSTM] Epoch 4/10, train_loss=76.218, val_acc=0.8142
[LSTM] Epoch 5/10, train_loss=54.926, val_acc=0.8059
[LSTM] Epoch 6/10, train_loss=37.380, val_acc=0.8184
[LSTM] Epoch 7/10, train_loss=25.476, val_acc=0.8073
[LSTM] Epoch 8/10, train_loss=17.038, val_acc=0.7975
Early stopping triggered for LSTM!


<All keys matched successfully>

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# VALID
preds_valid_lstm, labels_valid_lstm, val_acc_lstm = evaluate(lstm_model, valid_loader)
print("LSTM on VALID:")
print(classification_report(labels_valid_lstm, preds_valid_lstm))
print(confusion_matrix(labels_valid_lstm, preds_valid_lstm))

# TEST
preds_test_lstm, labels_test_lstm, test_acc_lstm = evaluate(lstm_model, test_loader)
print("LSTM on TEST:")
print(classification_report(labels_test_lstm, preds_test_lstm))
print(confusion_matrix(labels_test_lstm, preds_test_lstm))


LSTM on VALID:
              precision    recall  f1-score   support

           0       0.79      0.82      0.80       360
           1       0.81      0.77      0.79       356

    accuracy                           0.80       716
   macro avg       0.80      0.80      0.80       716
weighted avg       0.80      0.80      0.80       716

[[296  64]
 [ 81 275]]
LSTM on TEST:
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       526
           1       0.80      0.74      0.77       440

    accuracy                           0.80       966
   macro avg       0.80      0.80      0.80       966
weighted avg       0.80      0.80      0.80       966

[[446  80]
 [113 327]]
