In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Toy movie-review dataset (replace it with your own data).
# Expected schema: a 'text' column holding the review and a 'label' column holding the sentiment.
data = pd.DataFrame({
    'text': ["I loved this movie!", "The plot was confusing.", "Amazing acting!", "Not worth watching."],
    'label': ['positive', 'negative', 'positive', 'negative']
})

# Split the dataset in two halves: one keeps its labels, the other is treated as unlabeled
labeled_data, unlabeled_data = train_test_split(data, test_size=0.5, random_state=42)

# Text vectorization: the vocabulary is learned from the labeled half only
vectorizer = TfidfVectorizer()
X_labeled = vectorizer.fit_transform(labeled_data['text'])
X_unlabeled = vectorizer.transform(unlabeled_data['text'])

# Labels of the labeled half
y_labeled = labeled_data['label']

# Base Naive Bayes classifier wrapped in a SelfTrainingClassifier
base_classifier = MultinomialNB()
self_training_clf = SelfTrainingClassifier(base_classifier)

# First fit: labeled half only
self_training_clf.fit(X_labeled, y_labeled)

# Pseudo-label the unlabeled half with the fitted model
predicted_labels = self_training_clf.predict(X_unlabeled)

# Append the pseudo-labeled rows to the labeled data.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
pseudo_labeled_data = pd.DataFrame({'text': unlabeled_data['text'], 'label': predicted_labels})
labeled_data = pd.concat([labeled_data, pseudo_labeled_data])

# Re-vectorize the combined data with the already fitted vocabulary
X_combined = vectorizer.transform(labeled_data['text'])
y_combined = labeled_data['label']

# Second fit: labeled + pseudo-labeled rows
self_training_clf.fit(X_combined, y_combined)

# Try the model on two new, unseen reviews (no ground truth, so this is a demo, not a score)
test_reviews = ["Great movie!", "Terrible plot."]
X_test = vectorizer.transform(test_reviews)
y_test_pred = self_training_clf.predict(X_test)

print("Predictions for test set:")
for review, prediction in zip(test_reviews, y_test_pred):
    print(f"Review: '{review}' - Predicted Sentiment: {prediction}")


In [None]:
import numpy as np
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import accuracy_score

# Build a simulated user-relationship network
np.random.seed(42)
num_users = 100
num_known_relations = 20

# User-by-user relation matrix; -1 marks an unknown relation
relations = np.full((num_users, num_users), -1)

# Randomly pick user pairs (drawn without replacement) and mark their relation as known, symmetrically
known_relations_indices = np.random.choice(num_users, size=(num_known_relations, 2), replace=False)
relations[known_relations_indices[:, 0], known_relations_indices[:, 1]] = 1
relations[known_relations_indices[:, 1], known_relations_indices[:, 0]] = 1

# Labels: users taking part in a known relation get 1, everyone else stays -1 (unlabeled).
# NOTE: only one real class (1) exists, so any fitted model can only ever predict 1.
labels = np.full(num_users, -1)
labels[known_relations_indices[:, 0]] = 1
labels[known_relations_indices[:, 1]] = 1

# Random ~80/20 train/test split over users (rows of the relation matrix)
mask_train = np.random.rand(num_users) < 0.8
X_train = relations[mask_train]
y_train = labels[mask_train]
X_test = relations[~mask_train]
y_test = labels[~mask_train]

# Only test users with a real label can be scored: -1 means "unknown", not a class,
# and the models never predict it, so those rows must be excluded from the accuracy.
labeled_test = y_test != -1
assert labeled_test.any(), "no labeled users ended up in the test split"

# Semi-supervised learning with LabelPropagation
label_propagation = LabelPropagation(kernel='knn', n_neighbors=10)
label_propagation.fit(X_train, y_train)

# Predict and evaluate on the labeled test users
y_pred_propagation = label_propagation.predict(X_test)
accuracy_propagation = accuracy_score(y_test[labeled_test], y_pred_propagation[labeled_test])
print(f'Accuracy (LabelPropagation): {accuracy_propagation}')

# Semi-supervised learning with LabelSpreading
label_spreading = LabelSpreading(kernel='knn', n_neighbors=10)
label_spreading.fit(X_train, y_train)

# Predict and evaluate on the labeled test users
y_pred_spreading = label_spreading.predict(X_test)
accuracy_spreading = accuracy_score(y_test[labeled_test], y_pred_spreading[labeled_test])
print(f'Accuracy (LabelSpreading): {accuracy_spreading}')


In [None]:
import numpy as np
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt

# Load the CIFAR-10 dataset from OpenML (cache=False: downloaded again on every run)
cifar_data = fetch_openml(name='CIFAR_10', version=1, cache=False)

# Flattened images and their integer class labels
images = np.array(cifar_data['data'], dtype='uint8')
labels = np.array(cifar_data['target'], dtype='int')

# Keep the label for ~10% of the samples; mark the rest as unlabeled (-1)
rng = np.random.RandomState(42)
mask_labeled = rng.rand(len(labels)) < 0.1
labels_unlabeled = -1 * np.ones_like(labels)
labels_unlabeled[mask_labeled] = labels[mask_labeled]

# Train/test split (the test part also contains unlabeled rows)
X_train, X_test, y_train, y_test = train_test_split(images, labels_unlabeled, test_size=0.2, random_state=42)

# Base classifier (random forest)
base_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Wrap it in a SelfTrainingClassifier
self_training_clf = SelfTrainingClassifier(base_classifier)

# Semi-supervised training: rows with y == -1 are the unlabeled pool
self_training_clf.fit(X_train, y_train)

# Predict, then score only the test rows that kept their label
y_pred = self_training_clf.predict(X_test)
labeled_test = y_test != -1
accuracy = accuracy_score(y_test[labeled_test], y_pred[labeled_test])
print(f'Accuracy: {accuracy}')

# Pick a few random test images to visualize
num_visualize = 5
visualize_indices = rng.choice(len(X_test), size=num_visualize, replace=False)

# Show each picked image with its predicted class
plt.figure(figsize=(15, 5))
for i, index in enumerate(visualize_indices):
    plt.subplot(1, num_visualize, i + 1)
    # Each row is assumed to be channel-first (1024 red, 1024 green, 1024 blue values),
    # as described for the OpenML CIFAR_10 dataset, so reshape to (3, 32, 32) and move
    # the channel axis last; a direct reshape(32, 32, 3) would scramble the picture.
    plt.imshow(X_test[index].reshape(3, 32, 32).transpose(1, 2, 0))
    plt.title(f'Predicted: {y_pred[index]}')
    plt.axis('off')

plt.show()


In [None]:
## 【例6.1】轮廓系数计算案例代码。

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import (
    adjusted_rand_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_score,
)

# Synthetic data: 300 samples generated around 4 centers
X, y_true = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.60,
    random_state=0,
)

# Cluster with k-means (k=4) and show the fitted model's attributes
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X)
print(kmeans.cluster_centers_)
print(kmeans.get_feature_names_out())
print(kmeans.labels_,len(kmeans.labels_))

# Raw silhouette coefficient
score = silhouette_score(X, kmeans.labels_)
print(score)

# Internal metric: silhouette coefficient (needs no ground truth)
silhouette = silhouette_score(X, kmeans.labels_)
print(f"Silhouette Score: {silhouette}")

# External metric: adjusted Rand index against the generating labels
ari = adjusted_rand_score(y_true, kmeans.labels_)
print(f"Adjusted Rand Index: {ari}")

# Internal metric: Calinski-Harabasz index
calinski_harabasz = calinski_harabasz_score(X, kmeans.labels_)
print(f"Calinski-Harabasz Index: {calinski_harabasz}")

# Internal metric: Davies-Bouldin index
davies_bouldin = davies_bouldin_score(X, kmeans.labels_)
print(f"Davies-Bouldin Index: {davies_bouldin}")


## 【例7.1】SelfTrainingClassifier案例代码。

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Create a two-class, two-feature dataset
X, y = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=42)

# Mark the last 200 samples as unlabeled (-1)
y[800:] = -1

# Train/test split (both parts contain some unlabeled rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scatter plot of the training data; colour encodes -1 (unlabeled), 0 and 1
plt.figure(figsize=(8, 6))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='viridis', s=50)
plt.title("Original Data Distribution")
plt.show()

# Semi-supervised learning with SelfTrainingClassifier.
# The tree is seeded so that repeated runs of the notebook give the same result.
stc = SelfTrainingClassifier(DecisionTreeClassifier(random_state=42), max_iter=10)
stc.fit(X_train, y_train)
y_pred = stc.predict(X_test)

# Score only the test rows whose true label is known
labeled_test = y_test != -1
acc = accuracy_score(y_test[labeled_test], y_pred[labeled_test])
print("Accuracy:", acc)

# Training points (dots) and predicted test points (crosses).
# Both layers share vmin/vmax so a given colour means the same class in both;
# otherwise each layer is normalised on its own and predicted class 0 would be drawn
# in the colour used for unlabeled (-1) training points.
plt.figure(figsize=(8, 6))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='viridis', s=50, vmin=-1, vmax=1)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred, cmap='viridis', s=50, marker='x', vmin=-1, vmax=1)
plt.title("SelfTrainingClassifier Data Distribution")
plt.show()


In [None]:
# Create a two-class, two-feature dataset
X, y = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=42)
y[800:] = -1  # mark the last 200 samples as unlabeled (-1)

# Split into train and test here so the following step-by-step cells do not depend on
# X_train / X_test / y_train / y_test left over from another cell
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Scatter plot of the training data, coloured by label (-1 = unlabeled)
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    cmap='viridis',
    s=50,
)
ax.set_title("Original Data Distribution")
plt.show()


In [None]:
# Semi-supervised learning with SelfTrainingClassifier.
# The base tree is seeded so that repeated runs of the notebook give the same result.
stc = SelfTrainingClassifier(DecisionTreeClassifier(random_state=42), max_iter=10)
stc.fit(X_train, y_train)
y_pred = stc.predict(X_test)


In [None]:
# Evaluate the semi-supervised model: only test rows with a known label
# (-1 marks unlabeled rows) take part in the accuracy.
has_true_label = y_test != -1
acc = accuracy_score(y_test[has_true_label], y_pred[has_true_label])
print("Accuracy:", acc)


In [None]:
# Training points (dots) and predicted test points (crosses).
# Both layers share vmin/vmax so a given colour means the same class in both;
# otherwise each layer is normalised on its own and predicted class 0 would be drawn
# in the colour used for unlabeled (-1) training points.
plt.figure(figsize=(8, 6))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='viridis', s=50, vmin=-1, vmax=1)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred, cmap='viridis', s=50, marker='x', vmin=-1, vmax=1)
plt.title("SelfTrainingClassifier Data Distribution")
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the test set
y_pred = stc.predict(X_test)

# Drop the unlabeled test rows (-1) from both the truth and the predictions
has_true_label = y_test != -1
y_true_labeled = y_test[has_true_label]
y_pred_labeled = y_pred[has_true_label]

# Accuracy, precision, recall and F1, with class 1 as the positive class
binary_options = {'average': 'binary', 'pos_label': 1}
accuracy = accuracy_score(y_true_labeled, y_pred_labeled)
precision = precision_score(y_true_labeled, y_pred_labeled, **binary_options)
recall = recall_score(y_true_labeled, y_pred_labeled, **binary_options)
f1 = f1_score(y_true_labeled, y_pred_labeled, **binary_options)

# Report the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


## 【例7.2】SemiSupervisedClassifier案例代码。

In [None]:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create a two-class dataset
X, y = make_classification(n_samples=1000, n_classes=2, n_informative=5, n_redundant=0, random_state=42)

# Mark the last 100 samples (10% of the data) as unlabeled (-1)
y[900:] = -1

# Train/test split (both parts contain some unlabeled rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base classifier
base_clf = LogisticRegression(random_state=42)

# Semi-supervised learning: pseudo-labels are accepted when the predicted probability is >= 0.8
semi_clf = SelfTrainingClassifier(base_clf, threshold=0.8)
semi_clf.fit(X_train, y_train)

# Predict on the test set and score only the rows whose true label is known
y_pred = semi_clf.predict(X_test)
labeled_test = y_test != -1
acc = accuracy_score(y_test[labeled_test], y_pred[labeled_test])
print("SemiSupervisedClassifier Accuracy:", acc)

# Scatter plot of the first two features, coloured by label (-1 = unlabeled).
# train_test_split shuffles the rows, so the coordinates must be stacked in the same
# train-then-test order as the labels; plotting the original X against the stacked
# labels would pair points with the wrong colours.
fig, ax = plt.subplots()
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
scatter = ax.scatter(X_combined[:, 0], X_combined[:, 1], c=y_combined, s=50, cmap='viridis')
legend = ax.legend(*scatter.legend_elements(), loc='lower right')
ax.set_title('SelfTrainingClassifier Data Distribution')
ax.add_artist(legend)
plt.show()


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score, classification_report

# A dataset mixing labeled and unlabeled e-mails.
# This is a deliberately tiny example; real datasets are much larger.
data = {
    'text': ['Buy our amazing products!', 'Earn money fast!', 'Important business proposal'],
    'label': [1, 1, None]  # 1 = spam, None = unlabeled
}

df = pd.DataFrame(data)

# Separate labeled rows from unlabeled rows
labeled_data = df.dropna(subset=['label'])
unlabeled_data = df[df['label'].isnull()]

# Feature extraction: the vocabulary is learned from the labeled texts only
vectorizer = CountVectorizer()
X_labeled = vectorizer.fit_transform(labeled_data['text'])
X_unlabeled = vectorizer.transform(unlabeled_data['text'])

# Train/test split of the labeled rows
train_data, test_data = train_test_split(labeled_data, test_size=0.2, random_state=42)
X_train, y_train = vectorizer.transform(train_data['text']), train_data['label']
X_test, y_test = vectorizer.transform(test_data['text']), test_data['label']

# Self-training only happens when the unlabeled rows are part of the fit, marked with -1.
# Fitting on labeled rows alone degrades to plain supervised learning.
semi_data = pd.concat([train_data, unlabeled_data.assign(label=-1)])
X_semi = vectorizer.transform(semi_data['text'])
y_semi = semi_data['label']

# Build and fit the classifier
classifier = SelfTrainingClassifier(MultinomialNB(), criterion='k_best', k_best=50)
classifier.fit(X_semi, y_semi)

# Predict labels for the unlabeled rows
y_unlabeled_pred = classifier.predict(X_unlabeled)

# Attach the predictions to a new frame instead of writing into a filtered slice of df
# (which triggers SettingWithCopyWarning)
newly_labeled_data = unlabeled_data.assign(label=y_unlabeled_pred)

# Combine originally labeled and newly labeled rows
final_data = pd.concat([labeled_data, newly_labeled_data])

# Show the final dataset
print(final_data)

# Evaluate on the held-out labeled rows
y_test_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
print(f'Accuracy on test set: {accuracy}')
print(classification_report(y_test, y_test_pred))


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Enlarged dataset: 10 records
spam_texts = [
    'Buy our amazing products!',
    'Earn money fast!',
    'Important business proposal',
    'Exclusive offer just for you!',
    'Meet singles in your area!',
    'Urgent message: Claim your prize!',
    'Limited time discount!',
    'You have won a lottery!',
    'Get a free trial now!',
    'Best investment opportunity!',
]
# None marks an unlabeled record
spam_labels = [1, 1, None, 1, 1, None, 1, 1, None, 1]

data = {'text': spam_texts, 'label': spam_labels}

df = pd.DataFrame(data)
df

In [None]:
# Separate labeled rows from unlabeled rows
labeled_data = df.dropna(subset=['label'])
unlabeled_data = df[df['label'].isnull()]

# Feature extraction: the vocabulary is learned from the labeled texts only
vectorizer = CountVectorizer()
X_labeled = vectorizer.fit_transform(labeled_data['text'])
X_unlabeled = vectorizer.transform(unlabeled_data['text'])

# Train/test split of the labeled rows
train_data, test_data = train_test_split(labeled_data, test_size=0.2, random_state=42)
X_train, y_train = vectorizer.transform(train_data['text']), train_data['label']
X_test, y_test = vectorizer.transform(test_data['text']), test_data['label']

# Self-training only happens when the unlabeled rows are part of the fit, marked with -1.
# Fitting on labeled rows alone degrades to plain supervised learning.
semi_data = pd.concat([train_data, unlabeled_data.assign(label=-1)])
X_semi = vectorizer.transform(semi_data['text'])
y_semi = semi_data['label']

# Build and fit the classifier
classifier = SelfTrainingClassifier(MultinomialNB(), criterion='k_best', k_best=50)
classifier.fit(X_semi, y_semi)

# Predict labels for the unlabeled rows
y_unlabeled_pred = classifier.predict(X_unlabeled)

# Attach the predictions to a new frame instead of writing into a filtered slice of df
# (which triggers SettingWithCopyWarning)
newly_labeled_data = unlabeled_data.assign(label=y_unlabeled_pred)

# Combine originally labeled and newly labeled rows
final_data = pd.concat([labeled_data, newly_labeled_data])

# Show the final dataset
print(final_data)

# Evaluate on the held-out labeled rows
y_test_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
print(f'Accuracy on test set: {accuracy}')
print(classification_report(y_test, y_test_pred))

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Modified dataset (reworded texts, same label layout; None = unlabeled)
data = {
    'text': [
        'This product is amazing!', 'Earn extra income easily', 'Important project proposal',
        'Exclusive offer for valued customers', 'Meet interesting people in your area',
        'You are a lucky winner of our contest!', 'Limited time special discount',
        'You have been selected for a prize', 'Try our premium service for free',
        'Secure investment opportunities for your future'
    ],
    'label': [1, 1, None, 1, 1, None, 1, 1, None, 1]
}

df = pd.DataFrame(data)

# Separate labeled rows from unlabeled rows
labeled_data = df.dropna(subset=['label'])
unlabeled_data = df[df['label'].isnull()]

# Feature extraction: the vocabulary is learned from the labeled texts only
vectorizer = CountVectorizer()
X_labeled = vectorizer.fit_transform(labeled_data['text'])
X_unlabeled = vectorizer.transform(unlabeled_data['text'])

# Train/test split of the labeled rows
train_data, test_data = train_test_split(labeled_data, test_size=0.2, random_state=42)
X_train, y_train = vectorizer.transform(train_data['text']), train_data['label']
X_test, y_test = vectorizer.transform(test_data['text']), test_data['label']

# Self-training only happens when the unlabeled rows are part of the fit, marked with -1.
# Fitting on labeled rows alone degrades to plain supervised learning.
semi_data = pd.concat([train_data, unlabeled_data.assign(label=-1)])
X_semi = vectorizer.transform(semi_data['text'])
y_semi = semi_data['label']

# Build and fit the classifier
classifier = SelfTrainingClassifier(MultinomialNB(), criterion='k_best', k_best=50)
classifier.fit(X_semi, y_semi)

# Predict labels for the unlabeled rows
y_unlabeled_pred = classifier.predict(X_unlabeled)

# Attach the predictions to a new frame instead of writing into a filtered slice of df
# (which triggers SettingWithCopyWarning)
newly_labeled_data = unlabeled_data.assign(label=y_unlabeled_pred)

# Combine originally labeled and newly labeled rows
final_data = pd.concat([labeled_data, newly_labeled_data])

# Show the final dataset
print(final_data)

# Evaluate on the held-out labeled rows
y_test_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
print(f'Accuracy on test set: {accuracy}')
print(classification_report(y_test, y_test_pred))


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Dataset with labels 1 (spam) and None (unlabeled).
# NOTE: no record carries label 0, so the classifier only ever sees a single class.
data = {
    'text': [
        'This product is amazing!', 'Earn extra income easily', 'Important project proposal',
        'Exclusive offer for valued customers', 'Meet interesting people in your area',
        'You are a lucky winner of our contest!', 'Limited time special discount',
        'You have been selected for a prize', 'Try our premium service for free',
        'Secure investment opportunities for your future'
    ],
    'label': [1, 1, None, 1, 1, None, 1, 1, None, 1]
}

# The former `.map({1: 1, None: None})` step left every label unchanged, so it was dropped.
df = pd.DataFrame(data)

# Separate labeled rows from unlabeled rows
labeled_data = df.dropna(subset=['label'])
unlabeled_data = df[df['label'].isnull()]

# Feature extraction: the vocabulary is learned from the labeled texts only
vectorizer = CountVectorizer()
X_labeled = vectorizer.fit_transform(labeled_data['text'])
X_unlabeled = vectorizer.transform(unlabeled_data['text'])

# Train/test split of the labeled rows
train_data, test_data = train_test_split(labeled_data, test_size=0.2, random_state=42)
X_train, y_train = vectorizer.transform(train_data['text']), train_data['label']
X_test, y_test = vectorizer.transform(test_data['text']), test_data['label']

# Self-training only happens when the unlabeled rows are part of the fit, marked with -1.
# Fitting on labeled rows alone degrades to plain supervised learning.
semi_data = pd.concat([train_data, unlabeled_data.assign(label=-1)])
X_semi = vectorizer.transform(semi_data['text'])
y_semi = semi_data['label']

# Build and fit the classifier
classifier = SelfTrainingClassifier(MultinomialNB(), criterion='k_best', k_best=50)
classifier.fit(X_semi, y_semi)

# Predict labels for the unlabeled rows
y_unlabeled_pred = classifier.predict(X_unlabeled)

# Attach the predictions to a new frame instead of writing into a filtered slice of df
# (which triggers SettingWithCopyWarning)
newly_labeled_data = unlabeled_data.assign(label=y_unlabeled_pred)

# Combine originally labeled and newly labeled rows
final_data = pd.concat([labeled_data, newly_labeled_data])

# Show the final dataset
print(final_data)

# Evaluate on the held-out labeled rows
y_test_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
print(f'Accuracy on test set: {accuracy}')
print(classification_report(y_test, y_test_pred))
