In [1]:
import torch
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
from libsvm.svmutil import svm_train, svm_predict, svm_problem, svm_parameter

In [2]:
df = pd.read_csv("./speech_training.data", sep="\s+", header=None)
print("总数据行数: ", len(df))
df.head()

总数据行数:  44610


Unnamed: 0,0,1,2,3,4,5,6
0,0.65307,0.9135,0.99926,0.18748,0.14696,0.16105,-1
1,0.95969,2.0065,3.3661,0.27832,0.25881,0.18512,1
2,0.9135,0.99926,1.362,0.14696,0.16105,0.19826,-1
3,2.0065,3.3661,2.8797,0.25881,0.18512,0.18681,1
4,0.99926,1.362,1.5848,0.16105,0.19826,0.34915,-1


In [4]:
data = []
label = df.iloc[:, -1].tolist()
for i in range(len(df)):
    data.append(df.iloc[i, :-1].to_dict())

In [5]:
idx = [i for i in range(len(df))]
np.random.shuffle(idx)
train_len = int(0.7 * len(df))
test_len = 0.3*len(df)
train_df = df.iloc[idx[:train_len], :]
test_df = df.iloc[idx[train_len:], :]

In [6]:
train_data = []
train_label = train_df.iloc[:, -1].tolist()
for i in range(len(train_df)):
    train_data.append(train_df.iloc[i, :-1].to_dict())
test_data = []
test_label = test_df.iloc[:, -1].tolist()
for i in range(len(test_df)):
    test_data.append(test_df.iloc[i, :-1].to_dict())

In [7]:
best_score = 0
best_model = None
prob = svm_problem(train_label, train_data)
for i in range(5, 10):
    c = str(np.power(2.0, i))
    for j in range(0, 3):
        g = str(np.power(2.0, j))
        param = svm_parameter("-s 0 -t 2 -c %s -g %s" % (c, g))
        model = svm_train(prob, param)
        p_label, p_acc, p_val = svm_predict(test_label, test_data, model)
        if p_acc[0] > best_score:
            best_score = p_acc[0]
            best_model = model

Accuracy = 76.2702% (10208/13384) (classification)
Accuracy = 76.0909% (10184/13384) (classification)
Accuracy = 75.7472% (10138/13384) (classification)
Accuracy = 76.1581% (10193/13384) (classification)
Accuracy = 75.9713% (10168/13384) (classification)
Accuracy = 75.5977% (10118/13384) (classification)
Accuracy = 76.0759% (10182/13384) (classification)
Accuracy = 75.7696% (10141/13384) (classification)
Accuracy = 75.4334% (10096/13384) (classification)


In [48]:
class MyDataset(Dataset):
    def __init__(self, file_name):
        self.df = pd.read_csv(file_name, sep="\s+", header=None)
        self.df.iloc[self.df.iloc[:, -1]==-1, -1] = 0
        self.data = self.df.iloc[:, :-1]
        for i in range(6):
            max_val = self.data.iloc[:, i].max()
            min_val = self.data.iloc[:, i].min()
            self.data.iloc[:, i] = (self.data.iloc[:, i] - min_val) / (max_val - min_val)
        self.data = torch.from_numpy(self.data.values).float()
        self.label = torch.tensor(self.df.iloc[:, -1]).long()
        
    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()
        sample = {"data": self.data[index, :], "label": self.label[index]}
        return sample

    def __len__(self):
        return len(self.df)

In [49]:
dataset = MyDataset(r"C:\Users\weitao\Desktop\Untitled Folder\ml\speech_training.data")
dataset[0]

{'data': tensor([0.0170, 0.0238, 0.0260, 0.1376, 0.0656, 0.1182]),
 'label': tensor(0)}

In [50]:
train_data_len = int(0.7*len(dataset))
valid_data_len = int(0.1*len(dataset))
test_data_len = len(dataset) - train_data_len - valid_data_len
train_data, valid_data, test_data = random_split(dataset, [train_data_len, valid_data_len, test_data_len])
train_data_loader = DataLoader(train_data, batch_size=100, shuffle=False)
valid_data_loader = DataLoader(valid_data, batch_size=100, shuffle=False)
test_data_loader = DataLoader(test_data, batch_size=100, shuffle=False)

In [51]:
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(6, 10), 
            nn.BatchNorm1d(10), 
            nn.ReLU(), 
            nn.Linear(10, 10), 
            nn.BatchNorm1d(10), 
            nn.ReLU(), 
            nn.Linear(10, 2)
        )

    def forward(self, data):
        return self.fc(data)

In [58]:
model = MyModel()
crition = nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.01)
for epoch in range(10):
    running_loss = 0
    for i, data in enumerate(train_data_loader, 0):
        optim.zero_grad()
        inputs, labels = data["data"], data["label"]
        pred = model(inputs)
        loss = crition(pred, labels)
        loss.backward()
        optim.step()
        running_loss += loss.item()
    print("误差: ", running_loss / (i+1))
    with torch.no_grad():
        correct = 0
        total = 0
        for i, data in enumerate(valid_data_loader, 0):
            inputs, labels = data["data"], data["label"]
            out = model(inputs)
            _, pre = torch.max(out, 1)
            total += labels.size(0)
            correct += (pre == labels).sum().item()
        print("accuary: %.3f"%(correct / total))

误差:  0.5991135992752478
accuary: 0.705
误差:  0.5672440457458313
accuary: 0.708
误差:  0.5580739580785123
accuary: 0.711
误差:  0.5523777610768145
accuary: 0.712
误差:  0.5485320388318632
accuary: 0.715
误差:  0.545586817656843
accuary: 0.716
误差:  0.5432827640265322
accuary: 0.716
误差:  0.5413373159333921
accuary: 0.719
误差:  0.5394644389708583
accuary: 0.721
误差:  0.5373141410442206
accuary: 0.727
