# In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import numpy as np
from sklearn.decomposition import PCA

def read_and_process_files(folder_path: str) -> pd.DataFrame:
    """Build a binary file-by-tag indicator table from comma-separated tag files.

    Every ``*.txt`` file directly inside ``folder_path`` is read as UTF-8 and
    split on commas. Each tag is stripped of surrounding whitespace, and empty
    tags (from an empty file, a trailing comma or ``,,``) are ignored.

    Args:
        folder_path: Directory containing the ``.txt`` tag files.

    Returns:
        A DataFrame indexed by file name (sorted) with one integer column per
        distinct tag (in order of first appearance); a cell is 1 when the file
        lists the tag and 0 otherwise. A ``.txt`` file with no tags still gets
        a row of zeros. An empty DataFrame is returned when the folder holds
        no ``.txt`` files.
    """
    labels_by_file = {}
    # A dict is used as an insertion-ordered set of every tag seen so far.
    all_labels = {}

    # Sort so row/column order does not depend on the OS directory order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            contents = file.read()

        # Strip each tag so "a, b" and "a,b" name the same tags, and drop
        # empty fragments so they never become a bogus "" column.
        stripped = (label.strip() for label in contents.split(','))
        file_labels = list(dict.fromkeys(label for label in stripped if label))

        labels_by_file[filename] = file_labels
        all_labels.update(dict.fromkeys(file_labels))

    if not labels_by_file:
        return pd.DataFrame()

    # Allocate the whole table once (all zeros) instead of growing it cell by
    # cell, which is slow and leaves NaN holes / mixed dtypes behind.
    labels_df = pd.DataFrame(0, index=list(labels_by_file), columns=list(all_labels))
    for filename, file_labels in labels_by_file.items():
        if file_labels:
            labels_df.loc[filename, file_labels] = 1
    return labels_df

# Setting the folder path
folder_path = 'I:/synthetic_data_of_queti/tag_sta'
# Load the file-by-tag indicator table
labels_df = read_and_process_files(folder_path)

# Pearson correlation between every pair of tag columns
correlation_matrix = labels_df.corr()

# Keep only the strict upper triangle (k=1 skips the diagonal) so that each
# unordered tag pair appears exactly once; every other cell becomes NaN.
upper_tri = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

# Flatten the surviving cells into a 1-D array. The NaN cells are dropped
# explicitly: the legacy DataFrame.stack() removed them implicitly, but the
# newer stack implementation keeps them, which would poison the normal fit.
correlation_coefficients = upper_tri.stack().dropna().values

# Heatmap of the pairwise tag correlation matrix
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='viridis', cbar=True, ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

# Density histogram of the coefficients with a fitted normal curve on top
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(correlation_coefficients, kde=True, stat="density", linewidth=0.5, ax=hist_ax)
mu, std = stats.norm.fit(correlation_coefficients)
# Evaluate the fitted pdf over the x-range the histogram already occupies
xmin, xmax = hist_ax.get_xlim()
x_grid = np.linspace(xmin, xmax, 100)
fitted_pdf = stats.norm.pdf(x_grid, mu, std)
hist_ax.plot(x_grid, fitted_pdf, 'k', linewidth=2)
hist_ax.set_title("Fit results: mu = %.2f,  std = %.2f" % (mu, std))
hist_ax.set_xlabel('Correlation Coefficient')
hist_ax.set_ylabel('Density')
plt.show()

# Q-Q plot of the coefficients against a normal distribution
qq_fig, qq_ax = plt.subplots(figsize=(10, 6))
stats.probplot(correlation_coefficients, dist="norm", plot=qq_ax)
# probplot sets its own default title; replace it afterwards
qq_ax.set_title('Q-Q Plot of Pearson Correlation Coefficients')
plt.show()

# Flag coefficients lying more than 1.28 standard deviations away from the
# fitted mean, on either side. The lower cut-off must be mu - 1.28 * std;
# negating the upper cut-off is only equivalent when mu happens to be 0.
z_cutoff = 1.28
threshold = mu + z_cutoff * std
lower_threshold = mu - z_cutoff * std
outliers = np.where((correlation_coefficients > threshold) | (correlation_coefficients < lower_threshold))[0]
outlier_values = correlation_coefficients[outliers]

# Visualizing outliers
plt.figure(figsize=(10, 6))
sns.histplot(outlier_values, kde=True, stat="density", linewidth=0)
plt.title('Outliers of Pearson Correlation Coefficients')
plt.xlabel('Outlier Correlation Coefficient')
plt.ylabel('Density')
plt.show()

# Replace NaN correlations with 0 before running PCA
correlation_matrix_filled = correlation_matrix.fillna(value=0)

# Fit a full PCA on the filled correlation matrix.
# NOTE(review): this feeds the rows of the correlation matrix to PCA as if
# they were samples, rather than eigendecomposing the matrix itself -
# confirm that this is the intended analysis.
pca = PCA().fit(correlation_matrix_filled)

# Fitted attributes used by the plots and printouts
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance = pca.explained_variance_
principal_components = pca.components_

# Tag names, in the column order of the correlation matrix
labels = correlation_matrix.columns

def _show_component_bars(values, y_label, heading):
    """Draw one bar per principal component (numbered from 1) and show it."""
    plt.figure(figsize=(10, 6))
    plt.bar(range(1, len(values) + 1), values)
    plt.xlabel('Principal Component')
    plt.ylabel(y_label)
    plt.title(heading)
    plt.show()


# Scree plot of the explained variances
_show_component_bars(explained_variance, 'Eigenvalue', 'Scree Plot')

# Heatmap of the component loadings: one row per component, one column per tag
component_names = ["PC" + str(number) for number, _row in enumerate(principal_components, start=1)]
plt.figure(figsize=(10, 6))
sns.heatmap(principal_components, annot=True, cmap='viridis', xticklabels=labels, yticklabels=component_names)
plt.title('Principal Components')
plt.xlabel('Feature')
plt.ylabel('Principal Component')
plt.show()

# Explained variance per component
_show_component_bars(explained_variance, 'Explained Variance', 'Explained Variance of Principal Components')

# Share of the total variance explained by each component
_show_component_bars(explained_variance_ratio, 'Explained Variance Ratio', 'Explained Variance Ratio of Principal Components')

# Print, for up to the first three principal components, every feature ranked
# by the absolute size of its loading (largest first).
top_n = 3
# Slicing instead of indexing with range(top_n) avoids an IndexError when the
# PCA produced fewer than top_n components.
for i, component in enumerate(principal_components[:top_n]):
    ranked = np.argsort(-np.abs(component))
    pc_labels = [labels[j] for j in ranked]
    print(f"Top features in Principal Component {i+1}: {pc_labels}")

# Find the tag pair behind each outlier coefficient.
# `outliers` holds positions in the flattened, NaN-free upper-triangle vector,
# not flat indices into the square matrix, so the (row tag, column tag) pair is
# looked up in the index of that same stacked vector.
pair_labels = upper_tri.stack().dropna().index
outlier_combinations = []
for index in outliers:
    row_label, col_label = pair_labels[index]
    label_pair = f"{row_label},{col_label}:{correlation_coefficients[index]}"
    outlier_combinations.append(label_pair)

# Print the results: each caption followed by its value, in a fixed order
summary = (
    ("Mean of Pearson Correlation Coefficients:", mu),
    ("Standard Deviation of Pearson Correlation Coefficients:", std),
    ("Outlier Values:", outlier_values),
    ("Outlier Combinations:", outlier_combinations),
    ("Principal Components:\n", principal_components),
    ("Explained Variance:\n", explained_variance),
    ("Explained Variance Ratio:\n", explained_variance_ratio),
)
for caption, value in summary:
    print(caption, value)