In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input mount,
# then echo the paths one per line.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA,  FastICA
from sklearn.random_projection import GaussianRandomProjection,SparseRandomProjection
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score

from collections import Counter, defaultdict

from datetime import datetime

import warnings
# Silence warning output for the rest of the notebook.
# NOTE(review): the final call registers a blanket "ignore" for every warning
# category, which appears to make the two targeted filters before it redundant
# and also hides warnings that could point at real problems — confirm this is
# intended before relying on a clean output.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore', 'Solver terminated early.*')
warnings.filterwarnings("ignore")

print('imports completed')

In [None]:
# Seed NumPy's global random generator.
# NOTE(review): presumably this is what makes the unseeded steps later in the
# notebook (calls made without an explicit random_state) repeatable on a
# fresh top-to-bottom run — verify against those calls.
np.random.seed(10)
print('random seed set to 10 completed')

In [None]:
# Compute accuracy of cluster
def my_accuracy_cluster(labelIn,labelOut):
    """Score a clustering against known labels by majority vote.

    Each cluster is mapped to the ground-truth label that occurs most often
    among its members, and the accuracy of that mapping is returned.

    Parameters
    ----------
    labelIn : array-like of shape (n_samples,)
        Ground-truth labels (list, NumPy array or pandas Series).
    labelOut : array-like of shape (n_samples,)
        Cluster id assigned to each sample, in the same order as ``labelIn``.

    Returns
    -------
    float
        Fraction of samples whose cluster's majority label equals their own.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of samples.
    """
    # Work on plain positional arrays so that lists are accepted and pandas
    # index alignment can never pair a label with the wrong cluster id.
    truth = np.asarray(labelIn)
    assignments = np.asarray(labelOut)
    if len(truth) != len(assignments):
        raise ValueError(
            "labelIn and labelOut must have the same length, got "
            + str(len(truth)) + " and " + str(len(assignments))
        )

    # Same shape and dtype as the true labels; every slot is overwritten below
    # because each sample belongs to exactly one cluster.
    myprediction = np.empty_like(truth)

    for cluster_id in np.unique(assignments):
        members = assignments == cluster_id
        # most_common(1) -> [(label, count)]; keep only the label.
        majority_label = Counter(truth[members]).most_common(1)[0][0]
        myprediction[members] = majority_label
    return accuracy_score(truth, myprediction)

# Plot the curve # clusters vs score
def plot_score_curve(datadictionary,mytitle) :
    """Save a line chart of score against the number of clusters.

    ``datadictionary`` maps a cluster count to its score. ``mytitle`` is used
    as the chart title and as the stem of the PNG written to the current
    directory. The figure is closed after saving and the ``pyplot`` module is
    returned.
    """
    # Split the mapping into parallel x / y sequences (insertion order kept).
    cluster_counts = [count for count in datadictionary]
    scores = [datadictionary[count] for count in cluster_counts]

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.set(xlabel='# Clusters', ylabel='Score', title=mytitle)
    axes.plot(cluster_counts, scores, 'o-', color="b", label="Num of Clusters")
    axes.set_xticks(cluster_counts)
    axes.legend(loc="best")

    # Persist the chart, then release the figure.
    figure.savefig(mytitle+".png")
    plt.close(figure)
    return plt

# Plot the curve # clusters vs time
def plot_time_curve(datadictionary,mytitle) :
    """Save a line chart of elapsed time against the number of clusters.

    ``datadictionary`` maps a cluster count to a time value. ``mytitle`` is
    both the chart title and the stem of the PNG written to the current
    directory. The figure is closed after saving and the ``pyplot`` module is
    returned.
    """
    x_values, y_values = [], []
    for cluster_count, elapsed in datadictionary.items():
        x_values.append(cluster_count)
        y_values.append(elapsed)

    canvas = plt.figure()
    panel = canvas.add_subplot(111)
    panel.set_xlabel('# Clusters')
    panel.set_ylabel('Time')
    panel.set_title(mytitle)

    panel.plot(x_values, y_values, 'o-', color="b", label="Num of Clusters")
    panel.set_xticks(x_values)
    panel.legend(loc="best")

    canvas.savefig(mytitle+".png")
    plt.close(canvas)
    return plt

def plot_score_feature_transform(datadictionary,mytitle) :
    """Save a chart of score vs. cluster count with one line per outer key.

    Parameters
    ----------
    datadictionary : mapping
        ``{component_setting: {n_clusters: score}}``. Every outer key becomes
        one line, labelled with ``str(key)``.
    mytitle : str
        Chart title; the chart is written to ``mytitle + ".png"``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module.
    """
    # Colours are reused cyclically when there are more series than entries.
    # The last entry is a visible colour (white would vanish on the canvas).
    colors = ['b','g','r','c','m','y','k','tab:orange']

    fig = plt.figure()
    try:
        ax = fig.add_subplot(111,xlabel='# Clusters',ylabel='Score',title=mytitle)

        tick_positions = set()
        for series_index, (component_setting, curve) in enumerate(datadictionary.items()):
            num_clusters = list(curve.keys())
            score_clusters = list(curve.values())
            ax.plot(num_clusters, score_clusters, 'o-',
                    color=colors[series_index % len(colors)],
                    label=str(component_setting))
            tick_positions.update(num_clusters)

        # With nothing plotted there are no ticks to place and no legend
        # entries, so both calls are skipped for an empty mapping.
        if tick_positions:
            ax.set_xticks(sorted(tick_positions))
            ax.legend(loc="best")
        fig.savefig(mytitle+".png")
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
    return plt
    
def plot_time_feature_transform(datadictionary,mytitle) :
    """Save a chart of elapsed time vs. cluster count with one line per outer key.

    Parameters
    ----------
    datadictionary : mapping
        ``{component_setting: {n_clusters: time}}``. Every outer key becomes
        one line, labelled with ``str(key)``.
    mytitle : str
        Chart title; the chart is written to ``mytitle + ".png"``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module.
    """
    # Colours are reused cyclically when there are more series than entries.
    # The last entry is a visible colour (white would vanish on the canvas).
    colors = ['b','g','r','c','m','y','k','tab:orange']

    fig = plt.figure()
    try:
        ax = fig.add_subplot(111,xlabel='# Clusters',ylabel='Time',title=mytitle)

        tick_positions = set()
        for series_index, (component_setting, curve) in enumerate(datadictionary.items()):
            num_clusters = list(curve.keys())
            time_clusters = list(curve.values())
            ax.plot(num_clusters, time_clusters, 'o-',
                    color=colors[series_index % len(colors)],
                    label=str(component_setting))
            tick_positions.update(num_clusters)

        # With nothing plotted there are no ticks to place and no legend
        # entries, so both calls are skipped for an empty mapping.
        if tick_positions:
            ax.set_xticks(sorted(tick_positions))
            ax.legend(loc="best")
        fig.savefig(mytitle+".png")
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
    return plt

def plot_dataIn2(myTitle,df_orig_labels,df_data,figName,component):
    """Scatter columns 0 and 1 of ``df_data``, one colour/marker per label, and save it.

    Parameters
    ----------
    myTitle : str
        Chart title.
    df_orig_labels : array-like of shape (n_samples,)
        Label of each row of ``df_data``, in row order.
    df_data : pandas.DataFrame
        Frame whose columns ``0`` and ``1`` are plotted on the x and y axes.
    figName : str
        The chart is written to ``figName + ".png"``.
    component : str
        Axis-label prefix; the axes are named ``component + ' 1'`` and
        ``component + ' 2'``.
    """
    colors = ['r', 'g', 'b','c','m','y','k']
    # Once every colour has been used, move on to the next marker so that
    # classes beyond the 7th are still drawn and remain distinguishable.
    markers = ['o', 's', '^', 'D', 'v', 'P', 'X']

    # Match labels to rows by position, and list the classes in sorted order
    # so the legend is the same on every run.
    labels = np.asarray(df_orig_labels)
    targets = np.unique(labels)

    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1)
    ax.set_xlabel(component +' 1', fontsize = 15)
    ax.set_ylabel(component +' 2', fontsize = 15)
    ax.set_title(myTitle, fontsize = 20)

    for position, target in enumerate(targets):
        indicesToKeep = labels == target
        ax.scatter(df_data.loc[indicesToKeep,0]
                   , df_data.loc[indicesToKeep,1]
                   , color = colors[position % len(colors)]
                   , marker = markers[(position // len(colors)) % len(markers)]
                   , s = 50
                   , label = str(target))
    if len(targets):
        ax.legend()
    ax.grid()
    # NOTE(review): unlike the curve helpers in this notebook, the figure is
    # not closed after saving; left unchanged so an inline backend can still
    # render it — confirm whether that is wanted.
    fig.savefig(figName +".png")

print('definitions completed')

In [None]:
# load asteroid data
df_orig = pd.read_csv("../input/asteroid-dataset/dataset.csv")

# Number of rows per class, ordered by class name.
df_count = df_orig['class'].value_counts().sort_index()

print('dataset load completed')

In [None]:
# Plot the data
# Rows per class. size() counts every row of a group, whereas count() on the
# 'diameter' column would skip rows whose diameter is missing and so would
# under-report the classes.
df_count = df_orig.groupby('class').size().reset_index(name='count')

fig = plt.figure(figsize =(15,10))
ax = fig.add_subplot(111,xlabel='class',ylabel='count',title='Asteroid Data')
ax.bar(df_count['class'],df_count['count'])
ax.set_xticks(df_count['class'])
# Enlarge title, axis labels and tick labels together.
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels()):
    item.set_fontsize(20)
fig.savefig('asteroid_grouped_data.png')
plt.close(fig)

print("There are " + str(df_orig.shape[0]) + " rows of data")

In [None]:
# Read the raw file again so the cleaning below works on its own frame.
df_data = pd.read_csv('../input/asteroid-dataset/dataset.csv')

# 'pha' and 'neo' are 'Y'/'N' flags that may be missing: 'Y' becomes 1, while
# 'N' and missing values become 0. The result is assigned back to the column
# explicitly, because an in-place replace on the attribute-accessed column
# is not guaranteed to write through to the frame.
for flag_column in ('pha', 'neo'):
    df_data[flag_column] = df_data[flag_column].eq('Y').astype(int)

# Remove the columns excluded from the analysis, then every row that still
# has a missing value in any remaining column.
inputs = df_data.drop(columns=['id', 'spkid', 'full_name', 'name', 'prefix', 'orbit_id', 'pdes', 'equinox',
                               'diameter', 'albedo', 'diameter_sigma']).dropna()
target = inputs['class']

inputs1 = inputs.drop(columns=['class'])
df_x = inputs1.columns      # feature column names
df_y = 'class'              # target column name
to_encode = inputs.columns  # every remaining column, features and target

# One LabelEncoder per column, created on first access.
# NOTE(review): this integer-codes every column, the numeric features
# included, not only the categorical target. Kept as-is so downstream results
# are unchanged — confirm that coding numeric features is intended.
le = LabelEncoder
encoderDict = defaultdict(le)

# Encode on an independent copy so `inputs` keeps the un-encoded values.
# No per-column null filtering is needed: dropna() above already removed
# every row containing a missing value.
df = inputs.copy()
for column in to_encode:
    df[column] = encoderDict[column].fit_transform(df[column])

# The encoders are fitted on all rows before this cut to the first 5000.
df = df.head(5000)

# Shuffle the retained rows and renumber them from 0.
df = df.sample(frac=1).reset_index(drop=True)
print('Dataset size: ' + str(df.size))
print('Features: ' + str(df_x))
print('Target Decision: ' + df_y)

# Split the data into attributes and labels

# Explicit copy: the attributes are modified by a later cell and must not be
# treated as a view of `df`.
df_orig_attributes = df.loc[:, df_x].copy()
df_orig_labels = df.loc[:, df_y]
print(df_orig_labels)

In [None]:
# Standardize the attributes
# Fit the scaler once and rebuild the frame from the scaled values, keeping
# the original column names, column order and row index.
scaler = StandardScaler()
df_orig_attributes = pd.DataFrame(
    scaler.fit_transform(df_orig_attributes),
    columns=df_orig_attributes.columns,
    index=df_orig_attributes.index,
)

# Clusters
# Numbers of clusters / mixture components evaluated by the experiment cells.
mycluster_df = [2,3,4,5,6,7,10,15,20,25,30,40,50]
print ('standardize and clusters completed')

In [None]:
###############################################################################
# k - means cluster and EM (using gaussian mixture) with no feature selection #
###############################################################################

# Accuracy and elapsed seconds on the untransformed attributes, each keyed by
# the number of clusters / mixture components.
my_accuracy_kmeans, my_time_kmeans = {}, {}
my_accuracy_em, my_time_em = {}, {}

for myk in mycluster_df:

    # k-means: the timed span covers clustering and scoring together.
    kmeans_started = datetime.now()
    kmeans_model = KMeans(n_clusters=myk,random_state=0)
    kmeans_assignments = kmeans_model.fit_predict(df_orig_attributes)
    kmeans_score = my_accuracy_cluster(df_orig_labels,kmeans_assignments)
    kmeans_elapsed = datetime.now() - kmeans_started

    my_accuracy_kmeans[myk] = kmeans_score
    my_time_kmeans[myk] = kmeans_elapsed.total_seconds()

    # EM with a Gaussian mixture: fit, then assign each sample to a component.
    em_started = datetime.now()
    mixture_model = GaussianMixture(n_components=myk,random_state=0)
    mixture_model.fit(df_orig_attributes)
    em_assignments = mixture_model.predict(df_orig_attributes)
    em_score = my_accuracy_cluster(df_orig_labels,em_assignments)
    em_elapsed = datetime.now() - em_started

    my_accuracy_em[myk] = em_score
    my_time_em[myk] = em_elapsed.total_seconds()

# One saved chart per (plotter, results, title) triple, in this order.
for curve_plotter, curve_results, curve_title in (
    (plot_score_curve, my_accuracy_kmeans, "k-means Clusters vs Score"),
    (plot_time_curve, my_time_kmeans, "k-means Clusters vs Time"),
    (plot_score_curve, my_accuracy_em, "EM Clusters vs Score"),
    (plot_time_curve, my_time_em, "EM Clusters vs Time"),
):
    curve_plotter(curve_results, curve_title)
print('k - means Cluster and EM (using gaussian mixture) with no feature selection completed')

In [None]:
###############################################################
# k - means cluster and EM (using gaussian mixture) after PCA #
###############################################################

# Numbers of principal components to evaluate. The dataset has 33 features,
# so 33 is the largest number of components PCA can return.
PCA_component_df = [1,10,20,25,30,33]

# Results are stored as {n_components: {n_clusters: value}}.
my_accuracy_kmeans_PCA = defaultdict(dict)
my_time_kmeans_PCA = defaultdict(dict)
my_accuracy_em_PCA = defaultdict(dict)
my_time_em_PCA = defaultdict(dict)

# PCA with all components, fitted only to obtain the variance explained by
# each component (displayed in the next cell).
df_data_PCA = PCA(random_state=0)
df_data_eigen  = df_data_PCA.fit(df_orig_attributes)
df_data_eigenvalues = df_data_eigen.explained_variance_

for PCA_comp in PCA_component_df :

    # Project the attributes onto the first PCA_comp principal components.
    df_data_PCA = PCA(n_components=PCA_comp,random_state=0)
    df_data_PCA_data = df_data_PCA.fit_transform(df_orig_attributes)
    df_data_PCA_df = pd.DataFrame(data = df_data_PCA_data)

    for cluster in mycluster_df:

        # kMeans clustering (the timed span covers clustering and scoring)
        startTime = datetime.now()
        myk_mean_PCA_prediction = KMeans(n_clusters=cluster,random_state=0).fit_predict(df_data_PCA_df)
        myk_mean_accuracy_res = my_accuracy_cluster(df_orig_labels,myk_mean_PCA_prediction)
        endTime = datetime.now()
        # append accuracy
        my_accuracy_kmeans_PCA[PCA_comp][cluster] = myk_mean_accuracy_res
        # append my_time array
        my_time_kmeans_PCA[PCA_comp][cluster] = (endTime-startTime).total_seconds()

        # EM using GaussianMixture clustering.
        # random_state is fixed, as in the no-feature-selection experiment,
        # so the result does not depend on the global random state or on the
        # order in which cells were run.
        startTime = datetime.now()
        my_em_prediction = GaussianMixture(n_components=cluster,random_state=0).fit(df_data_PCA_df).predict(df_data_PCA_df)
        my_accuracy_em_res = my_accuracy_cluster(df_orig_labels,my_em_prediction)
        endTime = datetime.now()

        # append accuracy
        my_accuracy_em_PCA[PCA_comp][cluster] = my_accuracy_em_res
        # append my_time array
        my_time_em_PCA[PCA_comp][cluster] = (endTime-startTime).total_seconds()

plot_time_feature_transform(my_time_kmeans_PCA,"k-means PCA Clusters vs Time")
plot_score_feature_transform(my_accuracy_kmeans_PCA,"k-means PCA Clusters vs Score")
plot_time_feature_transform(my_time_em_PCA,"EM PCA Clusters vs Time")
plot_score_feature_transform(my_accuracy_em_PCA,"EM PCA Clusters vs Score")

# Data in 2 component PCA
df_data_PCA = PCA(n_components=2,random_state=0)
df_data_PCA_data = df_data_PCA.fit_transform(df_orig_attributes)
df_data_PCA_df = pd.DataFrame(data = df_data_PCA_data)

plot_dataIn2("2 component PCA", df_orig_labels, df_data_PCA_df, "data_in_2_PCA", "Principal Component")

print('k - means cluster and EM (using gaussian mixture) after PCA completed')

In [None]:
# Display the variance explained by each principal component, taken from the
# PCA that was fitted with all components.
df_data_eigenvalues

In [None]:
###############################################################
# k - means cluster and EM (using gaussian mixture) after ICA #
###############################################################

# Numbers of independent components to evaluate, up to the 33 features of
# the dataset.
ICA_component_df = [1,10,20,25,30,33]

# Results are stored as {n_components: {n_clusters: value}}.
my_accuracy_kmeans_ICA = defaultdict(dict)
my_time_kmeans_ICA = defaultdict(dict)
my_accuracy_em_ICA = defaultdict(dict)
my_time_em_ICA = defaultdict(dict)

# ICA without a component limit, run only to obtain the kurtosis of each
# recovered component (displayed in the next cell).
df_data_ICA = FastICA(random_state=0)
df_data_ICA_data = df_data_ICA.fit_transform(df_orig_attributes)
df_data_ICA_df = pd.DataFrame(data = df_data_ICA_data)
df_data_ICA_kurtosis = df_data_ICA_df.kurt()

for ICA_comp in ICA_component_df :

    df_data_ICA = FastICA(n_components=ICA_comp,random_state=0)
    df_data_ICA_data = df_data_ICA.fit_transform(df_orig_attributes)
    df_data_ICA_df = pd.DataFrame(data = df_data_ICA_data)

    for cluster in mycluster_df:

        # kMeans Clustering (the timed span covers clustering and scoring)
        startTime = datetime.now()
        myk_mean_ICA_prediction = KMeans(n_clusters=cluster,random_state=0).fit_predict(df_data_ICA_df)
        myk_mean_accuracy_res = my_accuracy_cluster(df_orig_labels,myk_mean_ICA_prediction)
        endTime = datetime.now()
        # append accuracy
        my_accuracy_kmeans_ICA[ICA_comp][cluster] = myk_mean_accuracy_res
        # append my_time array
        my_time_kmeans_ICA[ICA_comp][cluster] = (endTime-startTime).total_seconds()

        # EM using GaussianMixture Clustering.
        # random_state is fixed, as in the no-feature-selection experiment,
        # so the result does not depend on the global random state or on the
        # order in which cells were run.
        startTime = datetime.now()
        my_em_prediction = GaussianMixture(n_components=cluster,random_state=0).fit(df_data_ICA_df).predict(df_data_ICA_df)
        my_accuracy_em_res = my_accuracy_cluster(df_orig_labels,my_em_prediction)
        endTime = datetime.now()

        # append accuracy
        my_accuracy_em_ICA[ICA_comp][cluster] = my_accuracy_em_res
        # append my_time array
        my_time_em_ICA[ICA_comp][cluster] = (endTime-startTime).total_seconds()

plot_time_feature_transform(my_time_kmeans_ICA,"k-means ICA Clusters vs Time")
plot_score_feature_transform(my_accuracy_kmeans_ICA,"k-means ICA Clusters vs Score")
plot_time_feature_transform(my_time_em_ICA,"EM ICA Clusters vs Time")
plot_score_feature_transform(my_accuracy_em_ICA,"EM ICA Clusters vs Score")

# to illustrate the data in 2 component ICA
# (seeded like the other FastICA runs so the picture is reproducible)
df_data_ICA = FastICA(n_components=2,random_state=0)
df_data_ICA_data = df_data_ICA.fit_transform(df_orig_attributes)
df_data_ICA_df = pd.DataFrame(data = df_data_ICA_data)

plot_dataIn2('2 component ICA',df_orig_labels, df_data_ICA_df,"data_in_2_ICA", "Component")

print('k - means cluster and EM (using gaussian mixture) after ICA completed')

In [None]:
# Display the kurtosis of each component produced by the FastICA run that
# had no component limit.
df_data_ICA_kurtosis

In [None]:
##############################################################
# k - means cluster and EM (using gaussian mixture) after RP #
##############################################################
# Numbers of projected dimensions to evaluate; the same settings as for the
# other transforms, up to the 33 features of the dataset.
RP_component_df = [1,10,20,25,30,33]

# Results are stored as {n_components: {n_clusters: value}}.
my_accuracy_kmeans_RP = defaultdict(dict)
my_time_kmeans_RP = defaultdict(dict)
my_accuracy_em_RP = defaultdict(dict)
my_time_em_RP = defaultdict(dict)

# 33-dimensional projection, run only to record the kurtosis of each
# projected dimension.
df_data_RP = GaussianRandomProjection(random_state=0,n_components=33)
df_data_RP_data = df_data_RP.fit_transform(df_orig_attributes)
df_data_RP_df = pd.DataFrame(data = df_data_RP_data)
df_data_RP_kurtosis = df_data_RP_df.kurt()

for RP_comp in RP_component_df :

    df_data_RP = GaussianRandomProjection(n_components=RP_comp,random_state=0)
    df_data_RP_data = df_data_RP.fit_transform(df_orig_attributes)
    df_data_RP_df = pd.DataFrame(data = df_data_RP_data)

    for cluster in mycluster_df:

        # kMeans clustering (the timed span covers clustering and scoring)
        startTime = datetime.now()
        myk_mean_RP_prediction = KMeans(n_clusters=cluster,random_state=0).fit_predict(df_data_RP_df)
        myk_mean_accuracy_res = my_accuracy_cluster(df_orig_labels,myk_mean_RP_prediction)
        endTime = datetime.now()
        # append accuracy
        my_accuracy_kmeans_RP[RP_comp][cluster] = myk_mean_accuracy_res
        # append my_time array
        my_time_kmeans_RP[RP_comp][cluster] = (endTime-startTime).total_seconds()

        # EM using GaussianMixture clustering.
        # random_state is fixed, as in the no-feature-selection experiment,
        # so the result does not depend on the global random state or on the
        # order in which cells were run.
        startTime = datetime.now()
        my_em_prediction = GaussianMixture(n_components=cluster,random_state=0).fit(df_data_RP_df).predict(df_data_RP_df)
        my_accuracy_em_res = my_accuracy_cluster(df_orig_labels,my_em_prediction)
        endTime = datetime.now()

        # append accuracy
        my_accuracy_em_RP[RP_comp][cluster] = my_accuracy_em_res
        # append my_time array
        my_time_em_RP[RP_comp][cluster] = (endTime-startTime).total_seconds()

plot_time_feature_transform(my_time_kmeans_RP,"k-means RP Clusters vs Time")
plot_score_feature_transform(my_accuracy_kmeans_RP,"k-means RP Clusters vs Score")
plot_time_feature_transform(my_time_em_RP,"EM RP Clusters vs Time")
plot_score_feature_transform(my_accuracy_em_RP,"EM RP Clusters vs Score")

# to illustrate the data in 2 component RP
# (seeded like the other projections so the picture is reproducible)
df_data_RP = GaussianRandomProjection(n_components=2,random_state=0)
df_data_RP_data = df_data_RP.fit_transform(df_orig_attributes)
df_data_RP_df = pd.DataFrame(data = df_data_RP_data)

plot_dataIn2('2 component RP',df_orig_labels, df_data_RP_df,"data_in_2_RP", "Component")

print('k - means cluster and EM (using gaussian mixture) after RP completed')

In [None]:
###############################################################
# k - means cluster and EM (using gaussian mixture) after RFE #
###############################################################
# Numbers of features to keep. RFE selects from the existing features, so
# the 33 features of the dataset are the upper limit.
RFE_component_df = [1,10,20,25,30,33]

# Estimator that RFE refits while eliminating features.
# NOTE(review): df_orig_labels is passed to this regressor as its target;
# those appear to be class codes rather than a continuous quantity — confirm
# that a regressor is the intended ranking model.
estimator = SVR(kernel="linear")

# Results are stored as {n_features: {n_clusters: value}}.
my_accuracy_kmeans_RFE = defaultdict(dict)
my_time_kmeans_RFE = defaultdict(dict)
my_accuracy_em_RFE = defaultdict(dict)
my_time_em_RFE = defaultdict(dict)

for RFE_comp in RFE_component_df :

    df_data_RFE = RFE(estimator,n_features_to_select=RFE_comp)
    df_data_RFE_data = df_data_RFE.fit_transform(df_orig_attributes,df_orig_labels)
    df_data_RFE_df = pd.DataFrame(data = df_data_RFE_data)

    for cluster in mycluster_df:

        # kMeans clustering (the timed span covers clustering and scoring)
        startTime = datetime.now()
        myk_mean_RFE_prediction = KMeans(n_clusters=cluster,random_state=0).fit_predict(df_data_RFE_df)
        myk_mean_accuracy_res = my_accuracy_cluster(df_orig_labels,myk_mean_RFE_prediction)
        endTime = datetime.now()
        # append accuracy
        my_accuracy_kmeans_RFE[RFE_comp][cluster] = myk_mean_accuracy_res
        # append my_time array
        my_time_kmeans_RFE[RFE_comp][cluster] = (endTime-startTime).total_seconds()

        # EM using GaussianMixture clustering.
        # random_state is fixed, as in the no-feature-selection experiment,
        # so the result does not depend on the global random state or on the
        # order in which cells were run.
        startTime = datetime.now()
        my_em_prediction = GaussianMixture(n_components=cluster,random_state=0).fit(df_data_RFE_df).predict(df_data_RFE_df)
        my_accuracy_em_res = my_accuracy_cluster(df_orig_labels,my_em_prediction)
        endTime = datetime.now()

        # append accuracy
        my_accuracy_em_RFE[RFE_comp][cluster] = my_accuracy_em_res
        # append my_time array
        my_time_em_RFE[RFE_comp][cluster] = (endTime-startTime).total_seconds()

plot_time_feature_transform(my_time_kmeans_RFE,"k-means RFE Clusters vs Time")
plot_score_feature_transform(my_accuracy_kmeans_RFE,"k-means RFE Clusters vs Score")
# The title is also the file name of the saved chart.
plot_time_feature_transform(my_time_em_RFE,"EM RFE Clusters vs Time")
plot_score_feature_transform(my_accuracy_em_RFE,"EM RFE Clusters vs Score")

# Data in 2 component RFE
df_data_RFE = RFE(estimator,n_features_to_select=2)
df_data_RFE_data = df_data_RFE.fit_transform(df_orig_attributes,df_orig_labels)
df_data_RFE_df = pd.DataFrame(data = df_data_RFE_data)

plot_dataIn2('2 component RFE',df_orig_labels, df_data_RFE_df,"data_in_2_RFE", "Component")

print('k - means cluster and EM (using gaussian mixture) after RFE completed')

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import glob

# Show every PNG found in the current working directory, which is where the
# plotting code above saves its charts (relative file names). Sorting gives
# the same display order on every run.
plt.ion()  # interactive mode only needs to be switched on once
for image_path in sorted(glob.glob("*.png")):
    img = mpimg.imread(image_path)
    plt.figure()
    plt.axis('off')
    plt.imshow(img)
    plt.show()
    plt.close()