In [102]:
# 测试
from sklearn import datasets
from sklearn import model_selection
from sklearn.metrics import f1_score
import numpy as np

# Load the iris dataset and hold out 20% of it as a test split (seeded, so the split is repeatable).
iris = datasets.load_iris()
data, target = iris['data'], iris['target']
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=0,
)
# Sanity check: training shapes and the distinct labels present in the training split.
print(X_train.shape, y_train.shape)
print(np.unique(y_train))

(120, 4) (120,)
[0 1 2]


In [103]:
# Bin (discretize) the data:
#   x < a       -> 0
#   a <= x < b  -> 1
# and so on, so that every continuous feature value is mapped to a discrete bin index.
class DataBinWrapper(object):
    """Discretize continuous features into ordinal bins using per-feature quantile cut points.

    ``fit`` learns ``max_bin - 1`` cut points per column (the k/max_bin quantiles for
    k = 1 .. max_bin - 1); ``transform`` maps every value to the index of the interval
    it falls in via ``np.digitize``, i.e. an integer in ``[0, max_bin - 1]``.
    """
    def __init__(self,max_bin = 10):
        super().__init__()
        self.max_bin = max_bin
        # One ascending list of cut points per feature; populated by fit().
        self.XrangeMap = None

    def fit(self,X):
        """Learn the cut points of every column of the 2-D array ``X``.

        Args:
            X: array of shape (n_samples, n_features).
        """
        _ , n_features = X.shape
        # Exact quantile levels in percent. Computing 100 * k / max_bin directly (and not
        # flooring the result) avoids float artefacts such as 0.57 * 100 // 1 == 56, which
        # would duplicate one cut point and drop another, and keeps fractional levels
        # (e.g. 33.33% for max_bin=3) instead of truncating them.
        percents = [100.0 * k / self.max_bin for k in range(1, self.max_bin)]
        # np.percentile does not need pre-sorted input; sorted() only guarantees the
        # monotonic bin edges np.digitize expects.
        self.XrangeMap = [
            sorted(np.percentile(X[:, index], q) for q in percents)
            for index in range(n_features)
        ]

    def transform(self,X):
        """Map feature values to bin indices using the cut points learned by ``fit``.

        Args:
            X: a single sample (1-D, one entry per feature) or a 2-D array of
               shape (n_samples, n_features).

        Returns:
            Integer array with the same shape as ``X``. np.digitize returns 0 for values
            below the first cut point and ``len(cut_points)`` for values at or above the last.
        """
        if X.ndim == 1:
            # Single sample: bin each of its features against that feature's cut points.
            return np.asarray([np.digitize(X[i],self.XrangeMap[i]) for i in range(X.shape[0])])
        else:
            # Batch: bin column by column, then transpose back to (n_samples, n_features).
            return np.asarray([np.digitize(X[:,i],self.XrangeMap[i]) for i in range(X.shape[1])]).T

In [104]:
# Bin the data: learn the cut points on X_train only, then discretize both splits with them.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell would fit the
# binner on already-binned values; restart from the split cell before re-running.
data_bin_wrapper=DataBinWrapper(max_bin=10)
data_bin_wrapper.fit(X_train)
X_train=data_bin_wrapper.transform(X_train)
X_test=data_bin_wrapper.transform(X_test)
# Preview the first five binned training rows.
X_train[:5,:]

array([[7, 6, 8, 7],
       [3, 5, 5, 6],
       [2, 8, 2, 2],
       [6, 5, 6, 7],
       [7, 2, 8, 8]])

In [105]:
# Preview the first five binned test rows.
X_test[:5,:]

array([[5, 2, 7, 9],
       [5, 0, 4, 3],
       [3, 9, 1, 2],
       [9, 3, 9, 7],
       [1, 8, 2, 2]])

In [106]:
class SimpleFeatureFunction(object):
    """Indicator feature functions of the form (feature index, feature value, label).

    A function "fires" for a pair (x, y) when ``x[feature index] == feature value``
    and ``y == label``. Functions are kept in a set; the position of a function while
    iterating over that set is used as its index.
    """
    def __init__(self):
        self.feature_funcs=set()

    def build_feature_funcs(self,X,Y):
        """Register one indicator per (feature index, value, label) triple observed in (X, Y)."""
        for sample_idx in range(X.shape[0]):
            row_values = X[sample_idx,:].tolist()
            for feature_idx, value in enumerate(row_values):
                self.feature_funcs.add((feature_idx, value, Y[sample_idx]))

    def get_feature_funcs_num(self):
        """Return how many distinct feature functions have been registered."""
        return len(self.feature_funcs)

    def match_feature_function_indices(self,x,y):
        """Return the indices (set-iteration positions) of the feature functions that fire for (x, y)."""
        return [
            position
            for position, (feature_idx, value, label) in enumerate(self.feature_funcs)
            if label == y and x[feature_idx] == value
        ]

In [107]:
import utils
class MaxEntropy(object):
    """Maximum-entropy (log-linear) classifier trained by per-sample gradient ascent.

    The model is P_w(y | x) = exp(sum of w_i over the feature functions that fire
    for (x, y)), normalised over all classes. Which functions fire is decided by
    ``feature_func``, an object providing ``get_feature_funcs_num()`` and
    ``match_feature_function_indices(x, y)``.

    Args:
        feature_func: feature-function provider (see above).
        epochs: number of passes over the training data.
        eta: learning rate; ``fit`` raises it to at least 1 / sqrt(n_samples).
        random_state: optional seed for the per-epoch shuffle. ``None`` (default)
            uses numpy's global random state.
    """
    def __init__(self,feature_func,epochs=5,eta=0.01,random_state=None):
        super().__init__()
        self.feature_func = feature_func
        self.epochs = epochs
        self.eta = eta
        self.random_state = random_state
        self.class_num = None  # number of classes
        # Empirical joint distribution P~(x, y), keyed by tuple(x values + [y])
        self.Pxy = {}
        # Empirical marginal distribution P~(x), keyed by tuple(x values)
        self.Px = {}
        # One weight per feature function
        self.w = None

    def init_params(self,X,Y):
        """Estimate the empirical distributions from (X, Y) and zero the weights.

        Labels are assumed to be the integers 0 .. max(Y), since the class count is
        taken as ``max(Y) + 1``.
        """
        n_sample = X.shape[0]
        self.class_num = np.max(Y) + 1

        # Start from empty tables so that fitting the same instance again does not
        # accumulate new counts on top of previously normalised probabilities.
        self.Px = {}
        self.Pxy = {}

        # Count occurrences: count(X = x) and count(X = x, Y = y).
        for sample_idx in range(n_sample):
            range_indices = X[sample_idx,:].tolist()
            x_key = tuple(range_indices)
            xy_key = tuple(range_indices + [Y[sample_idx]])
            self.Px[x_key] = self.Px.get(x_key, 0) + 1
            self.Pxy[xy_key] = self.Pxy.get(xy_key, 0) + 1

        # Turn the counts into relative frequencies.
        self.Px = {key: 1.0 * occurrences / n_sample for key, occurrences in self.Px.items()}
        self.Pxy = {key: 1.0 * occurrences / n_sample for key, occurrences in self.Pxy.items()}

        # Initialise one weight per feature function.
        self.w = np.zeros(self.feature_func.get_feature_funcs_num())

    def _exp_score(self,func_indices):
        """Return exp(sum of the weights at ``func_indices``); exp(0) when none fire."""
        score = 0
        for func_index in func_indices:
            score += self.w[func_index]
        return np.exp(score)

    def _class_probabilities(self,x,matches_per_class=None):
        """Return [P_w(y | x) for y in range(class_num)], normalising only once.

        ``matches_per_class`` may carry precomputed firing indices per class to avoid
        rescanning the feature functions.
        """
        if matches_per_class is None:
            matches_per_class = [self.feature_func.match_feature_function_indices(x,y)
                                 for y in range(self.class_num)]
        exp_scores = [self._exp_score(func_indices) for func_indices in matches_per_class]
        total = 0
        for exp_score in exp_scores:
            total += exp_score
        # Same small constant in the denominator as _P_w_y_conditioned_x.
        denominator = 1e-7 + total
        return [exp_score / denominator for exp_score in exp_scores]

    def _sum_exp_w_on_all_y(self,x):
        """Return the normaliser: the sum over all classes of the unnormalised score of (x, y)."""
        sum_w = 0
        for y in range(self.class_num):
            sum_w += self._sum_exp_w_on_y(x,y)
        return sum_w

    def _sum_exp_w_on_y(self,x,y):
        """Return the unnormalised score exp(sum of weights of the functions firing for (x, y))."""
        return self._exp_score(self.feature_func.match_feature_function_indices(x,y))

    def _P_w_y_conditioned_x(self,x,y):
        """Return the model probability P_w(y | x)."""
        return self._sum_exp_w_on_y(x,y) / (1e-7 + self._sum_exp_w_on_all_y(x))

    def fit(self,X,y):
        """Train the weights with one gradient-ascent step per sample for ``epochs`` passes.

        Args:
            X: 2-D array of discrete feature values, shape (n_samples, n_features).
            y: 1-D array of integer labels 0 .. K-1.

        Side effects: updates ``self.eta`` (raised to at least 1 / sqrt(n_samples)),
        rebuilds ``Px`` / ``Pxy`` / ``w`` and prints progress roughly four times per epoch.
        """
        n_sample = X.shape[0]
        self.eta = max(self.eta,1.0 / np.sqrt(n_sample))
        self.init_params(X,y)
        x_y = np.c_[X,y]
        # Report about four times per epoch; never let the interval reach 0, which would
        # raise ZeroDivisionError for data sets with fewer than four samples.
        report_every = max(1, n_sample // 4)
        if self.random_state is None:
            shuffle = np.random.shuffle
        else:
            shuffle = np.random.RandomState(self.random_state).shuffle
        for epoch in range(self.epochs):
            count = 0
            shuffle(x_y)
            # Every sample takes part in one weight update per epoch.
            for row in range(x_y.shape[0]):
                count += 1
                x_point = x_y[row,:-1]
                y_point = x_y[row,-1:][0]
                # Empirical probabilities of this sample.
                p_xy = self.Pxy.get(tuple(x_point.tolist() + [y_point]))
                p_x = self.Px.get(tuple(x_point))

                match_feature_func_indices = self.feature_func.match_feature_function_indices(x_point,y_point)
                # Nothing fires for the observed (x, y): no update for this sample.
                if len(match_feature_func_indices) == 0:
                    continue

                dw = np.zeros_like(self.w)
                # Empirical expectation term.
                if p_xy is not None:
                    for func_index in match_feature_func_indices:
                        dw[func_index] += p_xy
                # Model expectation term. w is constant within this step, so the class
                # probabilities are computed once and reused for every class.
                if p_x is not None:
                    matches_per_class = [self.feature_func.match_feature_function_indices(x_point,y_i)
                                         for y_i in range(self.class_num)]
                    class_probs = self._class_probabilities(x_point,matches_per_class)
                    for y_i in range(self.class_num):
                        for func_index in matches_per_class[y_i]:
                            dw[func_index] -= p_x * class_probs[y_i]

                # We maximise the objective, hence gradient ascent.
                self.w = self.w + self.eta * dw

                if count % report_every == 0:
                    print("processing:\tepoch:" + str(epoch + 1) + "/" + str(self.epochs) + ",percent:" + str(
                        count) + "/" + str(X.shape[0]))

    def predict_prob(self,X):
        """Return an array of shape (n_samples, class_num) with P_w(y | x) for every row of X."""
        return np.asarray([self._class_probabilities(x_point) for x_point in X])

    def predict(self,X):
        """Return the most probable class index for every row of X."""
        return np.argmax(self.predict_prob(X),axis=1)

In [108]:
# Baseline: one indicator feature per (feature index, bin value, label) triple seen in training.
feature_function = SimpleFeatureFunction()
feature_function.build_feature_funcs(X_train,y_train)

# Train with the default epochs / eta, then predict labels for the binned test split.
maxentropy = MaxEntropy(feature_func=feature_function)
maxentropy.fit(X_train,y_train)
y = maxentropy.predict(X_test)

# Macro-averaged F1 over the classes.
print('f1:',f1_score(y_test,y,average='macro'))

processing:	epoch:1/5,percent:30/120
processing:	epoch:1/5,percent:60/120
processing:	epoch:1/5,percent:90/120
processing:	epoch:1/5,percent:120/120
processing:	epoch:2/5,percent:30/120
processing:	epoch:2/5,percent:60/120
processing:	epoch:2/5,percent:90/120
processing:	epoch:2/5,percent:120/120
processing:	epoch:3/5,percent:30/120
processing:	epoch:3/5,percent:60/120
processing:	epoch:3/5,percent:90/120
processing:	epoch:3/5,percent:120/120
processing:	epoch:4/5,percent:30/120
processing:	epoch:4/5,percent:60/120
processing:	epoch:4/5,percent:90/120
processing:	epoch:4/5,percent:120/120
processing:	epoch:5/5,percent:30/120
processing:	epoch:5/5,percent:60/120
processing:	epoch:5/5,percent:90/120
processing:	epoch:5/5,percent:120/120
f1: 0.9188034188034188


In [109]:
class UserDefinedFeatureFunction(object):
    """Indicator feature functions over single features and over ordered feature pairs.

    Two tuple layouts are stored in ``feature_funcs``:
      * (feature index, value, label)
      * (feature index 1, value 1, feature index 2, value 2, label)
    Pairs are registered for both orderings of the two features. The position of a
    function while iterating over the set is used as its index.
    """
    def __init__(self):
        self.feature_funcs = set()

    def build_feature_funcs(self,X,y):
        """Register the single-feature and pairwise indicators observed in (X, y)."""
        for sample_idx in range(X.shape[0]):
            row_values = X[sample_idx,:].tolist()
            for first_idx, first_value in enumerate(row_values):
                self.feature_funcs.add((first_idx, first_value, y[sample_idx]))
                # Relation between two distinct features and the label.
                for second_idx, second_value in enumerate(row_values):
                    if second_idx == first_idx:
                        continue
                    self.feature_funcs.add(
                        (first_idx, first_value, second_idx, second_value, y[sample_idx]))

    def get_feature_funcs_num(self):
        """Return how many distinct feature functions have been registered."""
        return len(self.feature_funcs)

    def match_feature_function_indices(self,x,y):
        """Return the indices (set-iteration positions) of the feature functions that fire for (x, y)."""
        hits = []
        for position, func in enumerate(self.feature_funcs):
            if len(func) == 5:
                # Pairwise function: both feature values and the label must agree.
                first_idx, first_value, second_idx, second_value, label = func
                fires = x[first_idx] == first_value and x[second_idx] == second_value and y == label
            else:
                # Single-feature function.
                first_idx, first_value, label = func
                fires = x[first_idx] == first_value and y == label
            if fires:
                hits.append(position)
        return hits

In [110]:
# Evaluation: same model, but with single-feature plus pairwise feature functions.
feature_func=UserDefinedFeatureFunction()
feature_func.build_feature_funcs(X_train,y_train)

# Train with the default epochs / eta, then predict labels for the binned test split.
maxEnt = MaxEntropy(feature_func=feature_func)
maxEnt.fit(X_train, y_train)
y = maxEnt.predict(X_test)

# Macro-averaged F1 over the classes.
print('f1:', f1_score(y_test, y, average='macro'))

processing:	epoch:1/5,percent:30/120
processing:	epoch:1/5,percent:60/120
processing:	epoch:1/5,percent:90/120
processing:	epoch:1/5,percent:120/120
processing:	epoch:2/5,percent:30/120
processing:	epoch:2/5,percent:60/120
processing:	epoch:2/5,percent:90/120
processing:	epoch:2/5,percent:120/120
processing:	epoch:3/5,percent:30/120
processing:	epoch:3/5,percent:60/120
processing:	epoch:3/5,percent:90/120
processing:	epoch:3/5,percent:120/120
processing:	epoch:4/5,percent:30/120
processing:	epoch:4/5,percent:60/120
processing:	epoch:4/5,percent:90/120
processing:	epoch:4/5,percent:120/120
processing:	epoch:5/5,percent:30/120
processing:	epoch:5/5,percent:60/120
processing:	epoch:5/5,percent:90/120
processing:	epoch:5/5,percent:120/120
f1: 0.957351290684624


In [112]:
# Number of distinct feature functions (single-feature and pairwise) used by the second model.
print(maxEnt.feature_func.get_feature_funcs_num())

693
