In [None]:
def readdata(datapath):
    """Load a csv file and index it by its parsed ``datetime`` column.

    Args:
       datapath(str): csv file containing a ``datetime`` column formatted as
           ``%Y/%m/%d %H:%M``
    Returns:
       rawdata(dataframe): the csv content without the ``datetime`` column,
           indexed by the parsed timestamps (the index is named ``time``)
    """
    rawdata = pd.read_csv(datapath)
    # pd.to_datetime already yields a datetime64 column, so no per-row
    # pd.Timestamp conversion is needed afterwards.
    rawdata['time'] = pd.to_datetime(rawdata['datetime'], format="%Y/%m/%d %H:%M")
    # set_index moves 'time' out of the columns; only 'datetime' is left to drop.
    return rawdata.set_index('time').drop(['datetime'], axis=1)

def data_plot(rawdata, downtimes=None):
    """Plot every column as a time-series scatter and shade the downtimes.

    One subplot per column of ``rawdata`` is stacked vertically, the downtime
    intervals are shaded in green on each subplot, and the figure is saved to
    ``pca_analysis.png`` in the current working directory.

    Args:
       rawdata(dataframe): data to plot; its index is used as the x axis
       downtimes(iterable of (start, end)): intervals to shade on every
           subplot, each bound being anything ``pd.Timestamp`` accepts.
           Defaults to the two 2017 downtime windows.
    Returns:
       None
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    plt.style.use('ggplot')

    if downtimes is None:
        downtimes = (
            ("2017-03-18 13:05:00", "2017-03-18 18:05:00"),
            ("2017-10-27 08:55:00", "2017-10-28 00:10:00"),
        )
    # Convert once up front so the spans are real timestamps on the date axis.
    spans = [(pd.Timestamp(start), pd.Timestamp(end)) for start, end in downtimes]

    column_name = rawdata.columns
    column_num = len(column_name)
    plt.figure(figsize=(50, column_num * 10)).patch.set_facecolor('white')

    for idx, col in enumerate(column_name):
        plt.subplot(column_num, 1, idx + 1)
        # label= gives plt.legend() an artist to list; without it the legend
        # is empty and matplotlib emits a warning.
        plt.scatter(rawdata.index, rawdata[col], label=str(col))
        plt.title(col, fontsize=70, alpha=0.5)
        plt.ylabel(col, fontsize=50)
        plt.yticks(fontsize=40)
        plt.xticks(fontsize=40, rotation=45)
        plt.legend(fontsize=40, loc='upper left')

        # mark the downtime
        for start, end in spans:
            plt.axvspan(start, end, alpha=0.3, color="g")

    plt.tight_layout()
    plt.savefig("pca_analysis.png")

def kmeans_PCA(data, PCA_n=2, Kmeans_n=2):
    """ 1. The silhouette plot for the various clusters on 2D PCA
        2. The visualization of the clustered data on 2D PCA
        3. plot the data with PCA label in time series to mark abnormal conditions and mark the downtimes
        4. plot the different PC in time series and mark the downtimes

    The data is min-max scaled, projected with PCA and clustered with KMeans
    (``random_state=10``). The silhouette/cluster figure is saved to
    ``pca_clusters.png``; the label and component time series are drawn with
    ``data_plot`` and additionally saved to ``Equipment_condition.png`` and
    ``pca_comps.png``.

    Args:
       data(dataframe): numeric features; its index is reused for the plots
       PCA_n(int): PCA dimensions(2). Must be at least 2 because the cluster
           scatter plot reads the first two components.
       Kmeans_n(int): the numbers of cluster. A default of 2 is provided so
           the signature is valid Python (a parameter without a default may
           not follow ``PCA_n=2``).

    Returns:
       cluster_labels(numpy):cluster_labels in time series to mark diffent equipment conditions
    """
    from sklearn import preprocessing
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score
    from sklearn.decomposition import PCA
    # NOTE: the IPython magic `%matplotlib inline` is not valid Python inside
    # a function body; run it once at the top level of the notebook instead.
    plt.style.use('ggplot')

    x_scaled = preprocessing.MinMaxScaler().fit_transform(data)
    X = PCA(n_components=PCA_n).fit_transform(x_scaled)
    n_clusters = Kmeans_n

    # Compute and plot the silhouette analysis results
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Write each cluster number inside its white circle
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
    plt.savefig("pca_clusters.png")

    # Time-series views: the cluster label first, then each principal
    # component. data_plot leaves its figure current, so the savefig that
    # follows each call stores that same figure under a second name.
    label = pd.DataFrame(cluster_labels, columns=['Equipment_Conditon'], index=data.index)
    components = pd.DataFrame(X, index=data.index)
    data_plot(label)
    plt.savefig("Equipment_condition.png")
    data_plot(components)
    plt.savefig("pca_comps.png")
    plt.show()
    return cluster_labels

def PCA_FeatruenImportantancePlot(dataset,n_comps):
    """ plot the feature importantance in PCA analysis

    The importance of a feature is the sum, over the fitted components, of the
    absolute loading of that feature weighted by the component's explained
    variance ratio. The features are drawn as a horizontal bar chart in
    descending order of importance and the figure is saved to
    ``PCA(<n_comps> components)_Feature Importance.png``.

    Args:
       dataset(dataframe): numeric features; column names label the bars
       n_comps(int): PCA dimensions
    Returns:
       None
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from sklearn import preprocessing

    x_scaled = preprocessing.MinMaxScaler().fit_transform(dataset)
    pca = PCA(n_components=n_comps, svd_solver='full')
    pca.fit(x_scaled)
    print("Variance Ratio --  {} principle components".format(n_comps)) 
    print(pca.explained_variance_ratio_)

    # components_ is (n_components, n_features); weighting |loadings| by the
    # variance ratios and summing over components gives one score per feature.
    scores = np.abs(pca.components_).T @ pca.explained_variance_ratio_
    importance = pd.Series(scores, index=dataset.columns).sort_values(ascending=False)

    positions = np.arange(len(importance))
    plt.figure(figsize=(10,10))
    plt.barh(positions, importance.values)
    plt.yticks(positions, importance.index, fontsize=15)
    plt.title("PCA({} components)_Feature Importance".format(n_comps),fontsize=20,alpha=0.5)
    plt.ylabel("Feature",fontsize=10)
    plt.xlabel("Importance",fontsize=10)
    plt.tight_layout()
    plt.savefig("PCA({} components)_Feature Importance.png".format(n_comps))
    plt.show()
    
def PcaDistrutionPlot(dataset,PcaNum,pca_n):
    """ plot the feature Distrutions in diffenent PCs

    Draws a horizontal bar chart of the absolute loadings of one principal
    component, sorted in descending order. Bars are coloured by the sign of
    the loading in that same component: "salmon" for positive, "teal"
    otherwise. The figure is saved to
    ``PCA<PcaNum>-<pca_n+1>_Feature Distribution.png``.

    Args:
       dataset(dataframe): numeric features; column names label the bars
       PcaNum(int): PCA dimensions
       pca_n(int): PC number (0-based index of the component to plot)
    Returns:
       None
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from sklearn import preprocessing

    x_scaled = preprocessing.MinMaxScaler().fit_transform(dataset)
    pca = PCA(n_components=PcaNum, svd_solver='full')
    pca.fit(x_scaled)

    pca_ComDistr = pd.DataFrame(pca.components_, columns = dataset.columns)
    loadings = pca_ComDistr.loc[pca_n, :]
    magnitude = loadings.abs().sort_values(ascending=False)
    # The colour must follow the sign of the component being plotted (pca_n),
    # not the last fitted component.
    bar_colors = ["salmon" if loadings[name] > 0 else "teal"
                  for name in magnitude.index]

    positions = np.arange(len(magnitude))
    plt.figure(figsize=(10,10))
    plt.barh(positions, magnitude.values, color=bar_colors)
    plt.yticks(positions, magnitude.index, fontsize=15)
    plt.title("PCA{}-{}_Feature Distribution".format(PcaNum,pca_n+1),fontsize=20,alpha=0.5)
    plt.ylabel("Feature",fontsize=10)
    plt.xlabel("Distribution",fontsize=10)
    plt.tight_layout()
    plt.savefig("PCA{}-{}_Feature Distribution.png".format(PcaNum,pca_n+1))
    plt.show()
    
def PCA_analysis(data,PcaNum,Kmeans_n):
    """ 1. PCA&Kmeans analysis report to know the equipment conditons
        2. feature importance in PCA analysis
        3. feature distribution in diffenet PCs

    Runs ``kmeans_PCA`` once, ``PCA_FeatruenImportantancePlot`` once, and
    ``PcaDistrutionPlot`` once per principal component (indices
    ``0 .. PcaNum-1``).

    Args:
       data(dataframe)
       PcaNum(int): the numbers of PCA dimensions
       Kmeans_n(int): the numbers of cluster
    Returns:
       cluster_labels(numpy):cluster_labels in time series to mark diffent equipment conditions
    """
    # The argument is Kmeans_n; the earlier spelling "Keams_n" was an
    # undefined name and raised NameError on every call.
    cluster_labels = kmeans_PCA(data, PcaNum, Kmeans_n)
    PCA_FeatruenImportantancePlot(data, PcaNum)
    for i in range(PcaNum):
        PcaDistrutionPlot(data, PcaNum, i)
    return cluster_labels