In [22]:
from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score  
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import umap


# 加载 IMDb 数据集
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

# 将数据转换为文本
word_index = imdb.get_word_index()
index_word = {v: k for k, v in word_index.items()}

def decode_review(encoded_review):
    return ' '.join([index_word.get(i - 3, '?') for i in encoded_review])

X_train = [decode_review(review) for review in X_train]
X_test = [decode_review(review) for review in X_test]



# 创建词袋模型
class decisionTree():
    def __init__(self,X_train,y_train,X_test,y_test):
        self.clf = DecisionTreeClassifier(max_depth=1)
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.vectorizer = CountVectorizer()
        self.X_train_vec = self.vectorizer.fit_transform(X_train)
        self.X_test_vec = self.vectorizer.transform(X_test)
        # self.clf.fit(self.X_train_vec, y_train)

    def fit(self,sample_weight=None):
        self.clf.fit(self.X_train_vec, y_train, sample_weight=sample_weight)
        
        
    def predict(self, index=None):
        if index == None:
            # 对X_train_vec全部进行预测
            return self.clf.predict(self.X_train_vec)
        return self.clf.predict(self.X_train_vec[index])[0], self.y_train[index]
    
    def predict_t(self, index):
        return self.clf.predict(self.X_test_vec[index])[0], self.y_test[index]
    
class gauss():
    def __init__(self,X_train,y_train,X_test,y_test):
        self.nb_clf = GaussianNB()
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.vectorizer = CountVectorizer()

        self.X_train_vec = self.vectorizer.fit_transform(X_train)
        self.X_test_vec = self.vectorizer.transform(X_test)

        # self.nb_clf.fit(self.X_train_vec.toarray(), y_train)
    def fit(self,sample_weight=None):
        self.nb_clf.fit(self.X_train_vec.toarray(), y_train, sample_weight=sample_weight)

    def predict(self, index=None):
        if index == None:
            # 对X_train_vec全部进行预测
            return self.nb_clf.predict(self.X_train_vec.toarray())
        return self.nb_clf.predict(self.X_train_vec[index].toarray())[0], self.y_train[index]
    def predict_t(self, index):
        return self.nb_clf.predict(self.X_test_vec[index].toarray())[0], self.y_test[index]

    
from sklearn.svm import SVC

class SVMClassifier():
    def __init__(self, X_train, y_train, X_test, y_test):
        print(1)

        self.svm_clf = SVC(kernel='linear')
        print(2)

        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.vectorizer = CountVectorizer()
        
        self.X_train_vec = self.vectorizer.fit_transform(X_train)
        # 将X_train_vec 减少到100维
        self.X_train_vec = umap.UMAP().fit_transform(self.X_train_vec)
        self.X_test_vec = self.vectorizer.transform(X_test)
        # 将X_test_vec 减少到100维
        self.X_test_vec = umap.UMAP().fit_transform(self.X_test_vec)

        print(1)
        print(2)
    def fit(self,sample_weight=None):
        # self.svm_clf.fit(self.X_train_vec, y_train)
        self.svm_clf.fit(self.X_train_vec, y_train, sample_weight=sample_weight)

    def predict(self, index=None):
        if index == None:
            # 对X_train_vec全部进行预测
            return self.svm_clf.predict(self.X_train_vec)
        return self.svm_clf.predict([self.X_train_vec[index]])[0], self.y_train[index]
    
    def predict_t(self, index):
        return self.svm_clf.predict([self.X_test_vec[index]])[0], self.y_test[index]




temp3 = SVMClassifier(X_train,y_train,X_test,y_test)
temp1 = decisionTree(X_train,y_train,X_test,y_test)
temp2 = gauss(X_train,y_train,X_test,y_test)




In [23]:
# # 通过测试集合。测试三个模型的准确率
# t1F = 0
# t2F = 0
# t3F = 0
# t1T = 0
# t2T = 0
# t3T = 0
# for i in range(len(X_test)):
#     t = temp1.predict_t(i)
#     if t[0] == t[1]:
#         t1T += 1
#     else:
#         t1F += 1
#     t = temp2.predict_t(i)
#     if t[0] == t[1]:
#         t2T += 1
#     else:
#         t2F += 1
#     t = temp3.predict_t(i)
#     if t[0] == t[1]:
#         t3T += 1
#     else:
#         t3F += 1
# print("DecisionTreeClassifier accuracy: ", t1T / (t1T + t1F))
# print("GaussianNB accuracy: ", t2T / (t2T + t2F))
# print("SVMClassifier accuracy: ", t3T / (t3T + t3F))


In [24]:
# 通过tmp1, tmp2, tmp3 作为子任务，完成AdaBoost，不得调用库
class AdaBoost():
    def __init__(self, base_classifiers,X_train,y_train,X_test,y_test):
        self.base_classifiers = base_classifiers
        self.classifier_weights = []
        self.errors = []
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    def fit(self, T):
        N = len(self.X_train)
        sample_weights = np.ones(N) / N

        for t in range(T):
            # Train a base classifier with the current distribution
            classifier = self.base_classifiers[t]
            classifier.fit(sample_weight=sample_weights)

            # Calculate the error rate
            predictions = classifier.predict()
            error_rate = np.sum(sample_weights[self.y_train != predictions])

            # Calculate the classifier weight
            classifier_weight = 0.5 * np.log((1 - error_rate) / error_rate)
            self.classifier_weights.append(classifier_weight)

            # Update the sample weights
            sample_weights *= np.exp(-classifier_weight * self.y_train * predictions)
            sample_weights /= np.sum(sample_weights)

            self.errors.append(error_rate)

    def predict(self,index):
        # 根据index计算X_train的预测值,通过classifier.predict_t(X)计算 其接受 X_train的下标
        classifier_predictions = np.array([classifier.predict_t(index) for classifier in self.base_classifiers])
        return np.sign(np.dot(self.classifier_weights, classifier_predictions))
    
        # classifier_predictions = np.array([classifier.predict_t(X) for classifier in self.base_classifiers])
        # return np.sign(np.dot(self.classifier_weights, classifier_predictions))
    

# 通过tmp1, tmp2, tmp3 作为子任务，完成AdaBoost，不得调用库
base_classifiers = [temp1, temp2, temp3]
ada = AdaBoost(base_classifiers, X_train, y_train, X_test, y_test)
ada.fit(3)
ada.fit(3)

    


In [25]:
ada.fit(3)


In [26]:
# 测试AdaBoost 的 准确度
t1F = 0
t1T = 0

for i in range(len(X_test)):
    t = ada.predict(i)[0]
    if t == y_test[i]:
        t1T += 1
    else:
        t1F += 1


ValueError: shapes (9,) and (3,2) not aligned: 9 (dim 0) != 3 (dim 0)

In [None]:
print("AdaBoost accuracy: ", t1T / (t1T + t1F))
