## LDA 编程

### 1. 二分类下的LDA
**编程实现线性判别，并给出西瓜数据集3.0$\alpha$上的结果。**

In [60]:
import numpy as np 


# 输入西瓜数据集
watermelon= np.array([[0.697,0.460,'是'],[0.774,0.376,'是'],[0.634,0.264,'是'],
           [0.608,0.318,'是'],[0.556,0.215,'是'],[0.403,0.237,'是'],
           [0.481,0.149,'是'],[0.437,0.211,'是'],[0.666,0.091,'否'],
           [0.243,0.267,'否'],[0.245,0.057,'否'],[0.343,0.099,'否'],
           [0.639,0.161,'否'],[0.657,0.198,'否'],[0.360,0.370,'否'],
           [0.593,0.042,'否'],[0.719,0.103,'否']])

# 数据处理-提取特征和类别
features = watermelon[:,0:2].astype('float')
labels = watermelon[:,-1]

### 用sklearn 实现LDA

In [61]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(features,labels,test_size=0.5,random_state=0 )

clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)
pred_test = clf.predict(X_test)
score = accuracy_score(y_test, pred_test)


print(score)
print(confusion_matrix(y_test, pred_test))
print(classification_report(y_test, pred_test))

0.666666666667
[[3 2]
 [1 3]]
             precision    recall  f1-score   support

          否       0.75      0.60      0.67         5
          是       0.60      0.75      0.67         4

avg / total       0.68      0.67      0.67         9



### 手动编程实现LDA

In [84]:
# 计算类均值
def class_mean(features_raw, labels_raw):
    mean_vectors = []
    for i in np.unique(labels_raw):
        mean_vectors.append(np.mean(features_raw[labels_raw==i],axis = 0))
    return mean_vectors

# 计算类内散度
def within_scatter(features_raw, labels_raw):
    m = features_raw.shape[1]
    mean_vector = class_mean(features_raw, labels_raw)
    S_w = np.zeros((m,m))
    for i, mean in zip(np.unique(labels),mean_vector):
        mean = np.mat(mean).T
        feature = features_raw[labels_raw==i]
        S_w +=np.dot(feature.T - np.tile(mean,len(feature)),feature - np.tile(mean,len(feature)).T)
    return S_w


def lda_2_class_project_mean(features_raw, labels_raw):
    Sw = within_scatter(features_raw, labels_raw)
    U, sigma, VT = np.linalg.svd(np.mat(Sw))
    Sw_inv = VT.T*np.linalg.inv(np.diag(sigma))*U.T
    mean_vector = class_mean(features_raw, labels_raw)
    w = np.dot(Sw_inv,np.mat(mean_vector[0]-mean_vector[1]).T)
    mean_new = np.zeros((2,1))
    mean_new[0] = np.dot(w.T,np.mat(mean_vector[0]).T)
    mean_new[1] = np.dot(w.T,np.mat(mean_vector[1]).T)

    return w, mean_new

def lda_test(features_raw, labels_raw,features_test, labels_test):
    w, mean_new = lda_2_class_project_mean(features_raw, labels_raw)
    correct = 0
    labels_pred = []
    for i in range(len(features_test)):
        proj = np.dot(w.T,features_test[i])
        if abs(proj - mean_new[0]) <= abs(proj - mean_new[1]):
            pred = np.unique(labels_raw)[0]
            labels_pred.append(np.unique(labels_raw)[0])  
        else:
            pred = np.unique(labels_raw)[1]
            labels_pred.append(np.unique(labels_raw)[1])
        if pred == labels_test[i]:
            correct += 1
    accuracy = correct /len(features_test)
    return accuracy,labels_pred

# 利用上面sklearn拆分的数据集进行计算
acc, y_pred = lda_test(X_train, y_train,X_test, y_test)

print ('accuracy is:',acc )
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

accuracy is: 0.6666666666666666
[[3 2]
 [1 3]]
             precision    recall  f1-score   support

          否       0.75      0.60      0.67         5
          是       0.60      0.75      0.67         4

avg / total       0.68      0.67      0.67         9



### 2.多分类下的LDA  
这里我们使用Iris数据集。程序来源http://sebastianraschka.com/Articles/2014_python_lda.html

In [63]:
feature_dict = {i:label for i,label in zip(range(4),
                                                ('sepal length in cm',
                                                'sepal width in cm',
                                                'petal length in cm',
                                                'petal width in cm',))}

In [64]:
import pandas as pd

# 读入数据
df = pd.io.parsers.read_csv(
    filepath_or_buffer = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header = None,
    sep = ',',)

df.columns = [l for i,l in sorted(feature_dict.items())]+['class label']
df.dropna(how = 'all', inplace = True) # to drop the empty line at file-end



df.tail()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class label
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


把类别转换为1，2，3

In [113]:
from sklearn.preprocessing import LabelEncoder
X = df.values[:,0:4]
y = df['class label'].values  

enc = LabelEncoder()
label_encoder = enc.fit(y)
y = label_encoder.transform(y) + 1  
label_dict = {1:'Setosa', 2:'Versicolor', 3:'Virginica'}

### LDA 步骤：
- Step 1: 计算 d-维均值向量

In [114]:
np.set_printoptions(precision = 4, suppress = True)

mean_vectors = []
for i in range(1,4):
    mean_vectors.append(np.mean(X[y==i],axis=0))
    print('Mean Vector class %s: %s\n'%(i,mean_vectors[i-1]))

Mean Vector class 1: [5.005999999999999 3.4180000000000006 1.464 0.2439999999999999]

Mean Vector class 2: [5.936 2.7700000000000005 4.26 1.3259999999999998]

Mean Vector class 3: [6.587999999999998 2.9739999999999998 5.552 2.026]



In [69]:
np.tile(np.mat(mean_vectors[0]).T,2)

matrix([[5.005999999999999, 5.005999999999999],
        [3.4180000000000006, 3.4180000000000006],
        [1.464, 1.464],
        [0.2439999999999999, 0.2439999999999999]], dtype=object)

- Step 2: 计算散布矩阵
> - 类内散布矩阵Sw

In [78]:
m = X.shape[1]
S_w = np.zeros((m,m))

for i, mean in zip(range(1,4), mean_vectors):
    feature = X[y==i]
    mean = np.mat(mean).T
    S_w  = S_w + np.dot(feature.T - np.tile(mean, len(feature)),feature - np.tile(mean, len(feature)).T)
print('within-class scatter matrix :\n',S_w)

within-class scatter matrix :
 [[38.956199999999995 13.682999999999996 24.614000000000004
  5.6556000000000015]
 [13.682999999999996 17.035000000000004 8.12 4.9132]
 [24.614000000000004 8.12 27.220000000000017 6.2536000000000005]
 [5.6556000000000015 4.9132 6.2536000000000005 6.175599999999998]]


> -    另一种算法

In [124]:
m = X.shape[1]
S_w1 = np.zeros((m,m))

for i, mean in zip(range(1,4),mean_vectors):
    class_sc_mat = np.zeros((m,m))
    for row in X[y==i]:
        row, mean = row.reshape(m,1),mean.reshape(m,1)
        class_sc_mat = class_sc_mat + (row - mean).dot((row-mean).T)
    S_w1 = S_w1 + class_sc_mat
    
print('within-class scatter matrix :\n',S_w1)

within-class scatter matrix :
 [[38.956199999999995 13.682999999999996 24.614000000000004
  5.6556000000000015]
 [13.682999999999996 17.035000000000004 8.12 4.9132]
 [24.614000000000004 8.12 27.220000000000017 6.2536000000000005]
 [5.6556000000000015 4.9132 6.2536000000000005 6.175599999999998]]


- 类间散布矩阵

In [143]:
S_b = np.zeros((m,m))
mean_all = np.mean(X,axis= 0)

for i, mean_i in zip(range(1,4),mean_vectors):
    m_i = len(X[y==i])
    mean_i = mean_i.reshape(m,1)
    mean_all = mean_all.reshape(m,1)
    S_b =  S_b + m_i *(mean_i - mean_all).dot((mean_i - mean_all).T)
    
print('between-class Scatter Matrix:\n', S_b)

between-class Scatter Matrix:
 [[63.21213333333327 -19.534000000000034 165.16466666666656
  71.36306666666663]
 [-19.534000000000034 10.97760000000001 -56.05520000000008
  -22.492400000000032]
 [165.16466666666656 -56.05520000000008 436.6437333333333
  186.90813333333332]
 [71.36306666666663 -22.492400000000032 186.90813333333332
  80.60413333333332]]


- Step 3: 求解矩阵$S_w^{-1}S_b$的广义奇异值问题