In [1]:
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import svm
from sklearn.decomposition import PCA

In [2]:
# ROC
# pred: 1d array of predicted scores
# ground_truth: 1d array of true labels
# classifier_name: string, any descriptive name (shown in the plot title)
def ROC_curve_binary(pred, ground_truth, classifier_name):
    """Plot the ROC curve of a binary classifier and show the figure.

    Args:
        pred: 1-D array of predicted scores, forwarded to
            ``sklearn.metrics.roc_curve`` as the score argument.
        ground_truth: 1-D array of true labels, forwarded to
            ``sklearn.metrics.roc_curve`` as the label argument.
        classifier_name: Text appended to the plot title.
    """
    false_pos_rate, true_pos_rate, _thresholds = sklearn.metrics.roc_curve(
        ground_truth, pred
    )
    area = sklearn.metrics.auc(false_pos_rate, true_pos_rate)
    line_width = 2

    plt.figure()

    # The classifier's curve; its AUC is reported in the legend entry.
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color="orange",
        lw=line_width,
        label="ROC curve (area = %0.2f)" % area,
    )
    # Dashed diagonal from (0, 0) to (1, 1), drawn as a visual reference.
    plt.plot([0, 1], [0, 1], color="navy", lw=line_width, linestyle="--")

    plt.title("ROC Curve of " + classifier_name)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.xlim([0.0, 1.0])
    # A little headroom above 1.0 so the top of the curve is not clipped.
    plt.ylim([0.0, 1.05])
    plt.legend(loc="lower right")
    plt.show()
    
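# PCA on dataset
# train_data: 2d array-like, n_train*100, sentence embedding vectors on training set
# train_label: 1d array-like, length n_train; 1 marks a positive sample, any other value a negative one
# test_data: 2d array-like, n_test*100, sentence embedding vectors on testing set
# test_label: 1d array-like, length n_test; same encoding as train_label
# embedding_name: string, appended to the plot titles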
def visualize_data_2D(train_data, train_label, test_data, test_label, embedding_name):
    """Scatter-plot the training and testing sets after a 2-component PCA.

    The PCA is fitted on each set in turn and that set is projected with its
    own fit, so every panel matches the explained-variance ratio printed for
    it. Two vertically stacked panels are drawn: training set on top, testing
    set below.

    Args:
        train_data: 2-D array-like of training samples (one row per sample).
        train_label: 1-D array-like with one label per training row; samples
            whose label equals 1 are drawn as the positive class, all others
            as the negative class.
        test_data: 2-D array-like of testing samples (one row per sample).
        test_label: 1-D array-like with one label per testing row, encoded
            like ``train_label``.
        embedding_name: Text appended to both panel titles.
    """
    fig, ax = plt.subplots(2, 1, figsize=(5, 10))

    pca_2d = PCA(n_components=2)

    pca_2d.fit(train_data)
    print("Variance ratio after 2D decomposition on training set:")
    print(pca_2d.explained_variance_ratio_)
    # Project the training set now, while the estimator still holds the
    # training fit; the re-fit below replaces its components.
    train_2d = pca_2d.transform(train_data)

    pca_2d.fit(test_data)
    print("Variance ratio after 2D decomposition on testing set:")
    print(pca_2d.explained_variance_ratio_)
    test_2d = pca_2d.transform(test_data)

    # Positional boolean masks: they work for lists, arrays and pandas Series
    # alike, and an empty class yields a (0, 2) array that still supports
    # column slicing instead of raising IndexError.
    train_is_pos = np.asarray(train_label).ravel() == 1
    test_is_pos = np.asarray(test_label).ravel() == 1

    panels = (
        (ax[0], train_2d, train_is_pos, "training"),
        (ax[1], test_2d, test_is_pos, "testing"),
    )
    for axis, points, is_pos, split in panels:
        pos = points[is_pos]
        neg = points[~is_pos]
        axis.scatter(pos[:, 0], pos[:, 1], c='orange', marker='o', label='normal')
        axis.scatter(neg[:, 0], neg[:, 1], c='navy', marker='^', label='hate')
        axis.set_title(
            "Positive and negative 2D samples on " + split + " set, " + embedding_name
        )
        axis.legend()

# Same parameters as visualize_data_2D above, but projects onto 3 principal components and draws 3D scatter plots
def visualize_data_3D(train_data, train_label, test_data, test_label, embedding_name):
    """Scatter-plot the training and testing sets after a 3-component PCA.

    The PCA is fitted on each set in turn and that set is projected with its
    own fit, so every panel matches the explained-variance ratio printed for
    it. Two vertically stacked 3-D panels are drawn: training set on top,
    testing set below.

    Args:
        train_data: 2-D array-like of training samples (one row per sample).
        train_label: 1-D array-like with one label per training row; samples
            whose label equals 1 are drawn as the positive class, all others
            as the negative class.
        test_data: 2-D array-like of testing samples (one row per sample).
        test_label: 1-D array-like with one label per testing row, encoded
            like ``train_label``.
        embedding_name: Text appended to both panel titles.
    """
    fig = plt.figure(figsize=(10, 20))
    ax = fig.add_subplot(2, 1, 1, projection='3d')
    ax2 = fig.add_subplot(2, 1, 2, projection='3d')

    pca_3d = PCA(n_components=3)

    pca_3d.fit(train_data)
    print("Variance ratio after 3D decomposition on training set:")
    print(pca_3d.explained_variance_ratio_)
    # Project the training set now, while the estimator still holds the
    # training fit; the re-fit below replaces its components.
    train_3d = pca_3d.transform(train_data)

    pca_3d.fit(test_data)
    print("Variance ratio after 3D decomposition on testing set:")
    print(pca_3d.explained_variance_ratio_)
    test_3d = pca_3d.transform(test_data)

    # Positional boolean masks: they work for lists, arrays and pandas Series
    # alike, and an empty class yields a (0, 3) array that still supports
    # column slicing instead of raising IndexError.
    train_is_pos = np.asarray(train_label).ravel() == 1
    test_is_pos = np.asarray(test_label).ravel() == 1

    panels = (
        (ax, train_3d, train_is_pos, "training"),
        (ax2, test_3d, test_is_pos, "testing"),
    )
    for axis, points, is_pos, split in panels:
        pos = points[is_pos]
        neg = points[~is_pos]
        axis.scatter(pos[:, 0], pos[:, 1], pos[:, 2], c='orange', marker='o', label='normal')
        axis.scatter(neg[:, 0], neg[:, 1], neg[:, 2], c='navy', marker='^', label='hate')
        # Set the camera elevation to 10 degrees.
        axis.view_init(elev=10)
        axis.set_title(
            "Positive and negative 3D samples on " + split + " set, " + embedding_name
        )
        axis.legend()