## LDA 编程
**编程实现线性判别分析（LDA），并给出西瓜数据集 3.0$\alpha$ 上的结果。**

In [94]:
import numpy as np 


# Watermelon dataset: one sample per row as [feature 1, feature 2, label],
# with the label being '是' (yes) or '否' (no). Mixing floats and strings makes
# NumPy store every cell as a string, so the numeric columns are converted
# back to floats below.
watermelon = np.array([
    [0.697, 0.460, '是'],
    [0.774, 0.376, '是'],
    [0.634, 0.264, '是'],
    [0.608, 0.318, '是'],
    [0.556, 0.215, '是'],
    [0.403, 0.237, '是'],
    [0.481, 0.149, '是'],
    [0.437, 0.211, '是'],
    [0.666, 0.091, '否'],
    [0.243, 0.267, '否'],
    [0.245, 0.057, '否'],
    [0.343, 0.099, '否'],
    [0.639, 0.161, '否'],
    [0.657, 0.198, '否'],
    [0.360, 0.370, '否'],
    [0.593, 0.042, '否'],
    [0.719, 0.103, '否'],
])

# Preprocessing: the first two columns are the features, the third is the class.
features = watermelon[:, :2].astype(float)
labels = watermelon[:, 2]

### 用sklearn 实现LDA

In [19]:
# The legacy ``sklearn.lda`` module no longer exists in current scikit-learn
# releases; the estimator lives in ``sklearn.discriminant_analysis``. The old
# import is kept only as a fallback for very old installations.
try:
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
except ImportError:
    from sklearn.lda import LDA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Hold out half of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.5, random_state=0)

clf = LDA()
clf.fit(X_train, y_train)
pred_test = clf.predict(X_test)
# Score the predictions made just above (the original referenced the
# undefined name `pre_test` here, which raises NameError in a fresh session).
score = accuracy_score(y_test, pred_test)


print(score)
print(confusion_matrix(y_test, pred_test))
print(classification_report(y_test, pred_test))

0.666666666667
[[3 2]
 [1 3]]
             precision    recall  f1-score   support

          否       0.75      0.60      0.67         5
          是       0.60      0.75      0.67         4

avg / total       0.68      0.67      0.67         9



### 手动编程实现LDA

In [236]:
# Per-class mean vectors
def class_mean(features_raw, labels_raw):
    """Return the mean feature vector of every class.

    The result is a list with one entry per distinct value in ``labels_raw``,
    in the order produced by ``np.unique(labels_raw)``. Each entry is the
    column-wise mean of the rows of ``features_raw`` that carry that label.
    """
    return [
        np.mean(features_raw[labels_raw == cls], axis=0)
        for cls in np.unique(labels_raw)
    ]

# Within-class scatter matrix
def within_scatter(features_raw, labels_raw):
    """Return the within-class scatter matrix S_w.

    S_w is the sum, over all classes, of ``C.T . C`` where ``C`` holds the
    samples of one class with that class's mean subtracted from every row.

    Args:
        features_raw: array-like of shape (n_samples, n_features).
        labels_raw: array-like of length n_samples with one class label per
            sample.

    Returns:
        ndarray of shape (n_features, n_features).
    """
    features_raw = np.asarray(features_raw, dtype=float)
    labels_raw = np.asarray(labels_raw)
    m = features_raw.shape[1]
    S_w = np.zeros((m, m))
    # Iterate over the classes of the labels that were passed in. The earlier
    # version looped over the module-level `labels` instead, which only worked
    # when that global happened to exist and contain the same classes.
    for cls in np.unique(labels_raw):
        class_rows = features_raw[labels_raw == cls]
        # Centre the class on its own mean; plain ndarrays are used because
        # `np.mat` is not available in NumPy 2.
        centered = class_rows - class_rows.mean(axis=0)
        S_w += np.dot(centered.T, centered)
    return S_w


def lda_2_class_project_mean(features_raw, labels_raw):
    """Fit a two-class LDA direction and project the class means onto it.

    The projection vector is ``w = pinv(S_w) . (mu_0 - mu_1)`` where ``S_w``
    comes from ``within_scatter`` and ``mu_0``, ``mu_1`` are the first two
    entries returned by ``class_mean``. Any further classes are ignored.

    Args:
        features_raw: training samples, shape (n_samples, n_features).
        labels_raw: class label of every training sample.

    Returns:
        A tuple ``(w, mean_new)``: ``w`` is an ndarray of shape
        (n_features, 1) and ``mean_new`` an ndarray of shape (2, 1) holding
        the projections ``w.T . mu_0`` and ``w.T . mu_1``.
    """
    Sw = within_scatter(features_raw, labels_raw)
    # The pseudo-inverse is the SVD-based inverse the original built by hand;
    # unlike inverting diag(sigma) directly it also copes with a singular S_w.
    Sw_inv = np.linalg.pinv(np.asarray(Sw))
    mean_vector = class_mean(features_raw, labels_raw)
    mean_first = np.asarray(mean_vector[0], dtype=float).ravel()
    mean_second = np.asarray(mean_vector[1], dtype=float).ravel()
    # Use the class means computed above (the original referenced the
    # undefined name `mean` here, which raises NameError).
    w = np.dot(Sw_inv, (mean_first - mean_second).reshape(-1, 1))
    mean_new = np.zeros((2, 1))
    mean_new[0] = np.dot(w.T, mean_first)
    mean_new[1] = np.dot(w.T, mean_second)

    return w, mean_new

def lda_test(features_raw, labels_raw, features_test, labels_test):
    """Classify test samples with the two-class LDA fit and score the result.

    Each test sample is projected onto the direction returned by
    ``lda_2_class_project_mean`` and assigned to the class whose projected
    mean is nearer; ties go to the first class of ``np.unique(labels_raw)``.

    Args:
        features_raw: training samples, shape (n_samples, n_features).
        labels_raw: class label of every training sample.
        features_test: test samples, indexable by row.
        labels_test: true label of every test sample.

    Returns:
        A tuple ``(accuracy, labels_pred)``: the fraction of test samples
        predicted correctly and the list of predicted labels.

    Raises:
        ZeroDivisionError: if ``features_test`` is empty.
    """
    w, mean_new = lda_2_class_project_mean(features_raw, labels_raw)

    # Loop-invariant values are computed once instead of on every iteration.
    classes = np.unique(labels_raw)
    first_class, second_class = classes[0], classes[1]
    projected_means = np.asarray(mean_new).ravel()
    first_mean, second_mean = projected_means[0], projected_means[1]

    correct = 0
    labels_pred = []
    for i, sample in enumerate(features_test):
        # Reduce the 1-element projection to a plain scalar so the distance
        # comparison below is an ordinary float comparison.
        proj = np.asarray(np.dot(w.T, sample)).item()
        if abs(proj - first_mean) <= abs(proj - second_mean):
            pred = first_class
        else:
            pred = second_class
        labels_pred.append(pred)
        if pred == labels_test[i]:
            correct += 1
    accuracy = correct / len(features_test)
    return accuracy, labels_pred

# Evaluate the hand-written LDA on the train/test split produced with sklearn.
acc, y_pred = lda_test(X_train, y_train, X_test, y_test)

print('accuracy is:', acc)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy is: 0.6666666666666666
[[3 2]
 [1 3]]
             precision    recall  f1-score   support

          否       0.75      0.60      0.67         5
          是       0.60      0.75      0.67         4

avg / total       0.68      0.67      0.67         9

