In [1]:
# importing libraries
import pandas as pd
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
#import statsmodels.api as sm
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import KBinsDiscretizer, FunctionTransformer
from sklearn import metrics, preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
# basic loadning
data_dir = "/var/some/data-source"
class_labels = ['Ec','Ei','Er','SBa','SBb','SBc','SBd','Sa','Sb','Sc','Sd','Se']

df = pd.read_csv(data_dir + "/categorized.csv")
include_columns = ["PETROR90_R", "PETROMAG_R", "PETROMAG_U", "PETROMAGERR_U", "PETROMAGERR_R", "PETROMAGERR_Z", "PSFMAG_R", "DEVMAGERR_R", 
                   "FRACDEV_R", "EXTINCTION_R", "ROWC_R", "COLC_R", "CMODELMAGERR_R", "REDSHIFT", "REDSHIFTERR", "REDSHIFT_SIMPLE_BIN", "PETROMAG_MR", 
                   "PETROMAGERR_MU", "PETROMAGERR_MR", "PETROMAGERR_MZ", "PETROR50_R_KPC_SIMPLE_BIN", "WVT_BIN"]
label_column = ["code"]
df = df[include_columns + label_column]

In [3]:
def handle_limits_by_groups(dframe, group_column, limit_values = {}):
    distinct = set(dframe[group_column].unique())
    
    for v in distinct:
        for col in limit_values:
            # get data indexes of limit column values per class
            inx = dframe[(dframe["code"] == v) & (dframe[col] == limit_values[col])].index
            # clean above index values to obtain the mean value 
            tmp_df = dframe.drop(inx)
            # set the mean value in the original frame for the identified index rows
            dframe.loc[inx, col] = tmp_df[col].mean()

    print("Limit adjustment completed")

def handle_outliers_by_groups(dframe, group_column, omit_columns):
    distinct = set(dframe[group_column].unique())
    
    for v in distinct:
        tmp_df = dframe[dframe["code"] == v]
        tmp_df = tmp_df.drop(columns=omit_columns)
        q1 = tmp_df.quantile(0.25)
        q3 = tmp_df.quantile(0.75)
        irq = q3 - q1
        lower_limit = q1 - (1.5 * irq)
        upper_limit = q3 + (1.5 * irq)
        
        # adjust the outlier value for each column
        for col in tmp_df.columns:
            tmp_df[col].astype('float64').dtypes
            dframe[col].astype('float64').dtypes
            tmp_df.loc[tmp_df[col] < lower_limit[col], col] = lower_limit[col]
            tmp_df.loc[tmp_df[col] > upper_limit[col], col] = upper_limit[col]
            dframe.loc[tmp_df.index, col] = tmp_df[col]

In [4]:
# handling min/max limits
limit_values = {"REDSHIFT_SIMPLE_BIN": 999, "PETROMAGERR_MU": 99999, "PETROMAGERR_MZ": 99999, "PETROR50_R_KPC_SIMPLE_BIN": 999}
handle_limits_by_groups(df, "code", limit_values)

# call and pass the dataset to handle outliers in the numerical columns    
handle_outliers_by_groups(df, "code", ["code"])

Limit adjustment completed


In [13]:
df = shuffle(df, random_state=42)
x_transformed = df[include_columns].copy()

std_scaler = preprocessing.StandardScaler()
scaled_values = std_scaler.fit_transform(x_transformed)
scaled_df = pd.DataFrame(scaled_values, columns=list(x_transformed.columns))
scaled_df.head(5)

Unnamed: 0,PETROR90_R,PETROMAG_R,PETROMAG_U,PETROMAGERR_U,PETROMAGERR_R,PETROMAGERR_Z,PSFMAG_R,DEVMAGERR_R,FRACDEV_R,EXTINCTION_R,...,CMODELMAGERR_R,REDSHIFT,REDSHIFTERR,REDSHIFT_SIMPLE_BIN,PETROMAG_MR,PETROMAGERR_MU,PETROMAGERR_MR,PETROMAGERR_MZ,PETROR50_R_KPC_SIMPLE_BIN,WVT_BIN
0,-0.349052,0.898176,0.015079,-0.009445,-0.336718,0.898775,1.355446,1.298422,0.522898,-0.930417,...,1.547509,-0.449955,0.598195,-0.362203,0.856324,-0.018554,-0.351792,0.873234,0.172139,0.943152
1,2.154686,-0.585153,0.600913,2.071444,0.30587,0.042899,-0.154116,-0.57781,0.907868,-0.314822,...,-0.247034,1.996847,1.690042,2.695954,-2.099228,2.093059,0.233084,0.020455,2.65315,-1.385832
2,-0.058494,0.770113,1.667843,3.07785,0.757149,-0.219062,0.24113,0.220722,0.907868,-0.654375,...,0.61356,2.012272,0.418802,3.674564,-1.317395,3.087429,0.507068,-0.240627,1.324932,-1.385832
3,1.807467,-0.967228,-0.776244,-0.295254,1.805542,0.042262,-1.362582,-0.940028,0.907868,-1.074928,...,-0.637405,0.892494,0.07965,0.86106,-1.517603,-0.305183,1.576426,0.019815,1.525418,0.532155
4,-1.475226,0.993973,1.103378,-0.029839,-0.455181,-0.738046,-0.421742,0.156467,0.755862,0.55812,...,0.368336,-0.192269,0.504127,-0.11755,0.524563,-0.038874,-0.444518,-0.737067,-1.18114,-0.666587


In [18]:
# Uniform strategy is used to form 1000 bins
# Notice x-axis values were transformed to positive and y-axis relatively scaled

kbins = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform', subsample=None)
x_discrete = kbins.fit_transform(scaled_df)
x_discrete = pd.DataFrame(x_discrete, columns=list(x_transformed.columns))
x_discrete

Unnamed: 0,PETROR90_R,PETROMAG_R,PETROMAG_U,PETROMAGERR_U,PETROMAGERR_R,PETROMAGERR_Z,PSFMAG_R,DEVMAGERR_R,FRACDEV_R,EXTINCTION_R,...,CMODELMAGERR_R,REDSHIFT,REDSHIFTERR,REDSHIFT_SIMPLE_BIN,PETROMAG_MR,PETROMAGERR_MU,PETROMAGERR_MR,PETROMAGERR_MZ,PETROR50_R_KPC_SIMPLE_BIN,WVT_BIN
0,23.0,73.0,48.0,34.0,12.0,27.0,63.0,45.0,85.0,18.0,...,48.0,24.0,60.0,23.0,49.0,31.0,6.0,23.0,43.0,56.0
1,64.0,44.0,58.0,78.0,20.0,18.0,42.0,21.0,99.0,32.0,...,25.0,68.0,74.0,81.0,8.0,77.0,14.0,14.0,92.0,0.0
2,28.0,71.0,77.0,99.0,26.0,15.0,48.0,31.0,99.0,24.0,...,36.0,68.0,58.0,99.0,19.0,99.0,17.0,11.0,66.0,0.0
3,58.0,37.0,34.0,28.0,40.0,18.0,26.0,16.0,99.0,14.0,...,21.0,48.0,53.0,46.0,16.0,25.0,31.0,14.0,70.0,46.0
4,5.0,75.0,67.0,33.0,10.0,10.0,39.0,31.0,94.0,51.0,...,33.0,29.0,59.0,27.0,44.0,31.0,5.0,6.0,16.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239095,24.0,72.0,53.0,37.0,11.0,28.0,74.0,64.0,0.0,15.0,...,60.0,52.0,43.0,51.0,28.0,34.0,5.0,25.0,77.0,33.0
239096,17.0,80.0,56.0,39.0,16.0,23.0,60.0,45.0,33.0,96.0,...,33.0,59.0,66.0,55.0,24.0,36.0,10.0,19.0,63.0,17.0
239097,24.0,42.0,47.0,27.0,6.0,6.0,26.0,18.0,99.0,97.0,...,22.0,14.0,59.0,13.0,45.0,24.0,2.0,3.0,16.0,23.0
239098,42.0,48.0,30.0,13.0,8.0,12.0,63.0,27.0,85.0,54.0,...,28.0,11.0,65.0,9.0,54.0,10.0,3.0,8.0,26.0,28.0


In [19]:
# split the data into train/test sets
train_X, test_X, train_y, test_y = train_test_split(x_discrete[include_columns], df["code"], test_size=0.2, random_state=42)
test_X

Unnamed: 0,PETROR90_R,PETROMAG_R,PETROMAG_U,PETROMAGERR_U,PETROMAGERR_R,PETROMAGERR_Z,PSFMAG_R,DEVMAGERR_R,FRACDEV_R,EXTINCTION_R,...,CMODELMAGERR_R,REDSHIFT,REDSHIFTERR,REDSHIFT_SIMPLE_BIN,PETROMAG_MR,PETROMAGERR_MU,PETROMAGERR_MR,PETROMAGERR_MZ,PETROR50_R_KPC_SIMPLE_BIN,WVT_BIN
31833,19.0,63.0,48.0,17.0,21.0,14.0,38.0,34.0,68.0,32.0,...,28.0,12.0,34.0,9.0,59.0,14.0,14.0,11.0,14.0,21.0
10542,39.0,64.0,50.0,40.0,15.0,27.0,52.0,36.0,99.0,20.0,...,41.0,59.0,52.0,55.0,21.0,37.0,9.0,24.0,93.0,30.0
201211,10.0,81.0,76.0,69.0,26.0,17.0,49.0,35.0,86.0,97.0,...,35.0,40.0,69.0,37.0,36.0,68.0,19.0,13.0,27.0,3.0
34881,45.0,54.0,52.0,36.0,15.0,12.0,41.0,23.0,99.0,41.0,...,27.0,48.0,56.0,46.0,22.0,34.0,9.0,9.0,65.0,40.0
198984,8.0,76.0,67.0,32.0,10.0,9.0,45.0,35.0,97.0,40.0,...,39.0,23.0,61.0,23.0,50.0,29.0,4.0,6.0,14.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125381,27.0,65.0,53.0,36.0,15.0,19.0,47.0,37.0,46.0,24.0,...,27.0,29.0,42.0,27.0,40.0,34.0,9.0,15.0,41.0,57.0
176753,20.0,55.0,55.0,45.0,10.0,13.0,30.0,23.0,99.0,29.0,...,27.0,26.0,55.0,23.0,39.0,42.0,4.0,9.0,26.0,31.0
203876,23.0,55.0,54.0,33.0,7.0,9.0,35.0,20.0,99.0,57.0,...,25.0,46.0,48.0,41.0,23.0,30.0,3.0,5.0,45.0,37.0
71972,41.0,53.0,43.0,31.0,11.0,15.0,49.0,69.0,17.0,28.0,...,62.0,4.0,66.0,4.0,72.0,28.0,5.0,11.0,14.0,4.0


In [26]:
# preparing for deep learning
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, AveragePooling1D

train_label = pd.get_dummies(train_y).values
test_label = pd.get_dummies(test_y).values

no_of_features = train_X.shape[1]
no_of_labels = len(set(train_y))

In [28]:
model = Sequential()

model.add(Conv1D(128, kernel_size=2, activation='relu', input_shape=(no_of_features,1)))
model.add(AveragePooling1D(pool_size=2))

model.add(Conv1D(256, kernel_size=2, activation='relu', input_shape=(no_of_features,)))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(512, kernel_size=2, activation='relu', input_shape=(no_of_features,)))
model.add(MaxPooling1D(pool_size=3))
model.add(Flatten())

model.add(Dense(1024, activation='relu'))
model.add(Dense(no_of_labels, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(train_X, train_label, epochs=10, batch_size=32, validation_data=(test_X, test_label))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#predictions
predicted_y = model.predict(test_X)
predicted_y