In [0]:
!pip install factor-analyzer

In [0]:
!pip install scikit-plot

In [0]:
!pip install nimfa

In [0]:
!nvidia-smi -L

In [0]:
!nvidia-smi

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from scipy import stats
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Activation
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from keras.layers import LeakyReLU
from sklearn import svm
from keras.utils import to_categorical
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
import seaborn as sb
from factor_analyzer.factor_analyzer import calculate_kmo
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from scipy import stats
import time
import nimfa

**Data Standardization**

In [0]:
# Module-level RobustScaler shared by data_norm; both centering and scaling are enabled.
data_scaler = RobustScaler(with_centering=True, with_scaling=True)


def data_norm(df):
    """Refit the shared ``data_scaler`` on ``df`` and return the transformed values.

    Every call refits the scaler, so its learned statistics always reflect
    the most recent input.
    """
    return data_scaler.fit(df).transform(df)

**Feature Selection using MRMR**

In [0]:
!pip install pymrmr

In [0]:
import pymrmr

**Fetching Data For Feature Selection**

In [0]:
url = 'https://raw.githubusercontent.com/rohandongare-nci/18120199-Data/master/Feature%20Selection/Feature%20Selection.csv'
sdss_feature_selection = pd.read_csv(url)

**Removing ID values (camera-related columns are kept for feature selection)**

In [0]:
# `sdss_1` is never defined in this notebook; the only frame loaded at this point
# is `sdss_feature_selection`. Work on a copy so the in-place column drop in the
# next cell does not modify the frame the mRMR columns are selected from.
sdss = sdss_feature_selection.copy()

In [0]:
unwanted_columns = ['objid','specobjid']
sdss.drop(unwanted_columns, axis=1, inplace=True)
sdss.head()

PyMRMR requires the target variable to be the first column in the dataframe

In [0]:
# 'class' is listed first because pymrmr expects the target in the first column.
# Take an explicit copy: later cells assign into this frame, and assigning into a
# column subset of another DataFrame triggers pandas' SettingWithCopyWarning.
sdss_mrmr = sdss_feature_selection[['class','ra', 'dec', 'u', 'g', 'r', 'i', 'z','redshift', 'plate',
       'mjd', 'fiberid','camcol','run','rerun','field']].copy()

In [0]:
sdss_mrmr.columns.get_loc('class')

In [0]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
enc = LabelEncoder()
sdss_mrmr['class']=enc.fit_transform(sdss_mrmr['class'])

In [0]:
# The target was already label-encoded to integers in the previous cell.
# One-hot encoding it would produce an (n_samples, n_classes) matrix, which cannot
# be stored in the single 'class' column, so keep the integer label instead.
sdss_mrmr['class'] = sdss_mrmr['class'].astype(int)

**MIQ MRMR Results**

In [0]:
miq_results=pymrmr.mRMR(sdss_mrmr, 'MIQ', 15)

In [0]:
miq_results

**MID MRMR Results**

In [0]:
# sdss_mrmr holds the target plus 15 candidate features, so 15 is the most that
# can be ranked (the original asked for 16). This also matches the MIQ run.
mid_results=pymrmr.mRMR(sdss_mrmr, 'MID', 15)

In [0]:
mid_results

**Fetching Data that will be used to optimize the meta learner**

In [0]:
opt_meta='https://raw.githubusercontent.com/rohandongare-nci/18120199-Data/master/Meta%20Learner%20Optimization/Optimize%20Meta%20Learner.csv'

In [0]:
sdss_meta_opt=pd.read_csv(opt_meta)

**NNMF**

In [0]:
sdss=sdss_meta_opt

**Removing camera related and ID values**

In [0]:
unwanted_columns = ['camcol','run','rerun','objid','specobjid','field']
sdss.drop(unwanted_columns, axis=1, inplace=True)
sdss.head()

**Correlation Plot**

In [0]:
import matplotlib.pyplot as plt
# Correlation is only defined for numeric columns. The frame still carries the
# 'class' label column (presumably strings), which recent pandas versions refuse
# to correlate, so restrict the input to numeric dtypes explicitly.
corr_plot = sdss.select_dtypes(include='number').corr()
#Plot figsize
fig, axis = plt.subplots(figsize=(7, 7))
#Generate Color Map
colormap = sb.diverging_palette(950, 10, as_cmap=True)
#Generate Heat Map on the axes created above, allow annotations and place floats in map
sb.heatmap(corr_plot, cmap=colormap, annot=True, fmt=".2f", ax=axis)
plt.show()

In [0]:
sdss_colorbands=sdss[['u','g','r','i','z']]

In [0]:
u=sdss[['u']]
g=sdss[['g']]
r=sdss[['r']]
i=sdss[['i']]
z=sdss[['z']]

In [0]:
u=np.array(u)
g=np.array(g)
r=np.array(r)
i=np.array(i)
z=np.array(z)

In [0]:
u = u.flatten()
g = g.flatten()
r = r.flatten()
i = i.flatten()
z = z.flatten()

In [0]:
sdss_spectroscopic=sdss[['plate','mjd']]

In [0]:
sdss_both=sdss[['plate','mjd','u','g','r','i','z']]

**Bartlett's test of Sphericity**

In [0]:
#Checking the Bartletts hypothesis for both photometric and spectroscopic data together
chi_square_value,p_value= calculate_bartlett_sphericity(sdss_both)
chi_square_value, p_value

In [0]:
#Checking the Bartletts hypothesis for photometric data
chi_square_value,p_value= calculate_bartlett_sphericity(sdss_colorbands)
chi_square_value, p_value

In [0]:
#Checking the Bartletts hypothesis for spectroscopic data (plate, mjd)
chi_square_value,p_value= calculate_bartlett_sphericity(sdss_spectroscopic)
chi_square_value, p_value

**Using scipy stats module**

In [0]:
import scipy.stats as st

In [0]:
st.bartlett(u,g,r,i,z)

**KMO test**

In [0]:
total_kmo_values,kmo_model=calculate_kmo(sdss_both)
kmo_model

In [0]:
#KMO test
total_kmo_values,kmo_model=calculate_kmo(sdss_colorbands)
kmo_model

In [0]:
#KMO test 
total_kmo_values,kmo_model=calculate_kmo(sdss_spectroscopic)
kmo_model

**Selecting the appropriate number of components**

In [0]:
sdss_nimfa_colour=np.array(sdss_colorbands)

In [0]:
sdss_nimfa_colour

In [0]:
sdss_nimfa_spec=np.array(sdss_spectroscopic)

In [0]:
sdss_nimfa_both=np.array(sdss_both)

**Using both spectroscopic and photometric data together**

**Rank 2**

In [0]:
nmf_both_2 = nimfa.Nmf(sdss_nimfa_both, rank=2, max_iter=30, update='euclidean',objective='fro')
# Run the rank-2 factorization configured on the line above. The original called
# nmf_colour_2, the photometric-only model that is not defined until a later cell.
both_sdss_nmf = nmf_both_2()

In [0]:
sdss_band_colour = both_sdss_nmf.summary()
print('Residual Sum Of Squares: %5.3f' % sdss_band_colour['rss'])
print('Explained Variance: %5.3f' % sdss_band_colour['evar'])
print('Cophenetic correlation: %5.3f' % sdss_band_colour['cophenetic'])

**Rank 3**

In [0]:
nmf_both_3 = nimfa.Nmf(sdss_nimfa_both, rank=3, max_iter=30, update='euclidean',objective='fro')
both_sdss_nmf = nmf_both_3()

In [0]:
sdss_band_colour = both_sdss_nmf.summary()
print('Residual Sum Of Squares: %5.3f' % sdss_band_colour['rss'])
print('Explained Variance: %5.3f' % sdss_band_colour['evar'])
print('Cophenetic correlation: %5.3f' % sdss_band_colour['cophenetic'])

**Rank 4**

In [0]:
nmf_both_4 = nimfa.Nmf(sdss_nimfa_both, rank=4, max_iter=30, update='euclidean',objective='fro')
both_sdss_4 = nmf_both_4()

In [0]:
sdss_band_colour = both_sdss_4.summary()
print('Residual Sum Of Squares: %5.3f' % sdss_band_colour['rss'])
print('Evar: %5.3f' % sdss_band_colour['evar'])
print('Cophenetic correlation: %5.3f' % sdss_band_colour['cophenetic'])

**Rank 5**

In [0]:
nmf_both_5 = nimfa.Nmf(sdss_nimfa_both, rank=5, max_iter=30, update='euclidean',objective='fro')
both_sdss_5 = nmf_both_5()

In [0]:
sdss_band_colour = both_sdss_5.summary()
print('Residual Sum Of Squares: %5.3f' % sdss_band_colour['rss'])
print('Evar: %5.3f' % sdss_band_colour['evar'])
print('Cophenetic correlation: %5.3f' % sdss_band_colour['cophenetic'])

**Rank 6**

In [0]:
# Rank 6 factorization; the original passed rank=5 and so repeated the previous cell.
nmf_both_6 = nimfa.Nmf(sdss_nimfa_both, rank=6, max_iter=30, update='euclidean',objective='fro')
both_sdss_6 = nmf_both_6()

In [0]:
sdss_band_colour = both_sdss_6.summary()
print('Residual Sum Of Squares: %5.3f' % sdss_band_colour['rss'])
print('Evar: %5.3f' % sdss_band_colour['evar'])
print('Cophenetic correlation: %5.3f' % sdss_band_colour['cophenetic'])

**Using only the photometric data**

**Rank 2**

In [0]:
nmf_colour_2 = nimfa.Nmf(sdss_nimfa_colour, rank=2, max_iter=2, update='euclidean',objective='fro')
colour_sdss_nmf = nmf_colour_2()

In [0]:
sdss_band_colour2 = colour_sdss_nmf.summary()
print('Residual Sum Of Squares: %5.3f' % sdss_band_colour2['rss'])
print('Explained Variance: %5.3f' % sdss_band_colour2['evar'])
print('Cophenetic correlation: %5.3f' % sdss_band_colour2['cophenetic'])

**Rank 3**

In [0]:
color_3 = nimfa.Nmf(sdss_nimfa_colour, rank=3, max_iter=10, update='euclidean',objective='fro')
sdss_color_3 = color_3()

In [0]:
# `nmf_fit` is not defined anywhere in this notebook; show the rank-3 fit result
# produced by the previous cell instead.
print(sdss_color_3)

In [0]:
color_rank_3 = sdss_color_3.summary()
print('Residual Sum Of Squares: %5.3f' % color_rank_3['rss'])
print('Explained Variance: %5.3f' % color_rank_3['evar'])
print('Iterations: %d' % color_rank_3['n_iter'])
print('Cophenetic correlation: %5.3f' % color_rank_3['cophenetic'])

**Rank 4**

In [0]:
color_4 = nimfa.Nmf(sdss_nimfa_colour, rank=4, max_iter=10, update='euclidean',objective='fro')
sdss_color_4 = color_4()

In [0]:
color_rank_4 = sdss_color_4.summary()
print('Residual Sum Of Squares: %f' % color_rank_4['rss'])
print('Explained Variance: %f' % color_rank_4['evar'])
# Same 'cophenetic' summary entry as in the other rank cells, labelled consistently.
print('Cophenetic correlation: %5.3f' % color_rank_4['cophenetic'])

**NNMF**

In [0]:
from sklearn.decomposition import NMF
nnmf_model = NMF(n_components=3, init='random', random_state=18120199)

In [0]:
nnmf_components = nnmf_model.fit_transform(sdss_colorbands)

In [0]:
sdss = pd.concat((sdss, pd.DataFrame(nnmf_components)), axis=1)

In [0]:
print(sdss.head(10))

In [0]:
sdss.rename({0: 'color_component_1', 1: 'color_component_2', 2: 'color_component_3'}, axis=1, inplace = True)

In [0]:
sdss_colour_c=sdss[['color_component_1','color_component_2','color_component_3']]

In [0]:
enc = LabelEncoder()
X=sdss[['ra','dec','color_component_1','color_component_2','color_component_3','redshift','fiberid','plate','mjd']]
y = enc.fit_transform(sdss['class'])
X.head()

**Spectroscopic Data** 

In [0]:
nnmf_spec_model = NMF(n_components=1, init='random', random_state=18120199)

In [0]:
nnmf_spec_components = nnmf_spec_model.fit_transform(sdss_spectroscopic)

In [0]:
sdss = pd.concat((sdss, pd.DataFrame(nnmf_spec_components)), axis=1)

In [0]:
sdss.rename({0: 'spectroscopic_component'}, axis=1, inplace = True)

In [0]:
sdss

In [0]:
enc = LabelEncoder()
X=sdss[['ra','dec','spectroscopic_component','color_component_1','color_component_2','color_component_3','redshift','fiberid']]
#y=sdss[['class']]
y = enc.fit_transform(sdss['class'])
X.head()

**NNMF both spectroscopic and photometric together**

In [0]:
from sklearn.decomposition import NMF
nnmf_model = NMF(n_components=4, init='random', random_state=18120199)

In [0]:
nnmf_components_both = nnmf_model.fit_transform(sdss_both)

In [0]:
sdss = pd.concat((sdss, pd.DataFrame(nnmf_components_both)), axis=1)

In [0]:
print(sdss.head(10))

In [0]:
sdss.rename({0: 'nmf_component_1', 1: 'nmf_component_2', 2: 'nmf_component_3', 3: 'nmf_component_4'}, axis=1, inplace = True)

In [0]:
enc = LabelEncoder()
X=sdss[['ra','dec','nmf_component_1','nmf_component_2','nmf_component_3','nmf_component_4','redshift','fiberid']]
#y=sdss[['class']]
y = enc.fit_transform(sdss['class'])
X.head()

**Data Normalization**

In [0]:
X= preprocessing.normalize(X, norm='l2')

**Power Transformation with standardization**

In [0]:
power_transform_yj = preprocessing.PowerTransformer(method='yeo-johnson', standardize=True)

In [0]:
X=power_transform_yj.fit_transform(X)

In [0]:
x_plot=pd.DataFrame(X)

In [0]:
x_plot

In [0]:
sb.kdeplot(x_plot[5], shade=True)

In [0]:
# Draw one density curve per transformed feature. Iterate over the column labels;
# the original indexed x_plot with the loop variable before it existed.
for col in x_plot.columns:
    sb.kdeplot(x_plot[col], shade=True)

**Without NNMF**

In [0]:
enc = LabelEncoder()
# Baseline feature set: the raw columns only, none of the NMF components.
X=sdss[['ra','dec','u','g','r','i','z','redshift','plate','mjd','fiberid']]
# Integer-encode the class labels. The intermediate DataFrame assignment to y was
# overwritten immediately and has been removed.
y = enc.fit_transform(sdss['class'])
X.head()

In [0]:
print(X)

**Random Undersampling**

In [0]:
from collections import Counter
Counter(y)

In [0]:
from imblearn.under_sampling import RandomUnderSampler

In [0]:
undersampler = RandomUnderSampler(random_state=18120199,replacement=False)
X_under, y_under = undersampler.fit_resample(X, y)

In [0]:
Counter(y_under)

In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(X_under, y_under, test_size = 0.25, random_state = 18120199)

In [0]:
X_test.shape

**Stratified K-Fold**

In [0]:
k = 10
# random_state only matters when shuffle=True, and recent scikit-learn versions raise
# a ValueError when a seed is combined with shuffle=False. The folds stay unshuffled
# (and therefore deterministic), exactly as before; only the ignored seed is dropped.
str_kf = StratifiedKFold(n_splits=k, shuffle=False)

**Defining a function for stacking the models**

In [0]:
def get_stack_pred(classifier):
  """Generate stacking features for one base learner.

  The global ``X_under`` / ``y_under`` are split 75/25 (fixed seed) into a training
  part and a holdout part. The global ``str_kf`` splitter is then run over the
  training part; in every fold the classifier is refit on the in-fold training rows
  and used to predict both the in-fold test rows and the whole holdout part.

  Args:
    classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit
      once per fold, so it is left fitted on the last fold's training rows.

  Returns:
    Tuple ``(train_pred, holdout_pred)`` of column vectors:
      * ``train_pred``: shape ``(n_train, 1)``; each row holds the prediction made
        by the fold in which that row was an in-fold test row.
      * ``holdout_pred``: shape ``(n_holdout, 1)``; per-row mode of the holdout
        predictions made across all folds.
  """
  # Positional indexing with the fold indices below needs plain arrays; depending
  # on library versions the resampled data may arrive as pandas objects.
  x_train, x_test, y_train, y_test = train_test_split(
      np.asarray(X_under), np.asarray(y_under), test_size = 0.25, random_state = 18120199)
  # Size the per-fold buffer from the splitter itself so it cannot drift from it.
  n_folds = str_kf.get_n_splits()
  train_pred = np.empty((x_train.shape[0],))
  holdout_by_fold = np.empty((n_folds, x_test.shape[0]))
  for j, (train_index, test_index) in enumerate(str_kf.split(x_train, y_train)):
    infoldx_test = x_train[test_index]
    infoldy_test = y_train[test_index]
    # Refit on this fold's training rows only.
    classifier.fit(x_train[train_index], y_train[train_index])
    # In-fold test prediction: computed once, stored and scored.
    infold_pred = np.asarray(classifier.predict(infoldx_test)).ravel()
    train_pred[test_index] = infold_pred
    infold_accuracy = metrics.accuracy_score(infoldy_test, infold_pred)
    print(f"In Fold classifier accuracy: {infold_accuracy}")
    # Holdout prediction: computed once, scored and stored as this fold's row.
    holdout_pred = np.asarray(classifier.predict(x_test)).ravel()
    out_of_fold_accuracy = metrics.accuracy_score(y_test, holdout_pred)
    print(f"Out Of Fold accuracy: {out_of_fold_accuracy}")
    holdout_by_fold[j, :] = holdout_pred
  # Majority vote across folds for every holdout row.
  holdout_mode = stats.mode(holdout_by_fold, axis=0)[0]
  # Return column vectors so the outputs of several base learners can be
  # concatenated side by side, one column per learner.
  return train_pred.reshape(-1,1), np.asarray(holdout_mode).reshape(-1,1)

**Keras Classifier**

In [0]:
# Function to create model, required for KerasClassifier
def base_model(n_features=9):
  """Build and compile the dense network used as the neural-network base learner.

  Args:
    n_features: number of input columns expected by the first layer. Defaults to 9,
      the width that was previously hard-coded, so existing callers are unaffected.
      It must match the number of columns in the feature matrix being fitted.

  Returns:
    A compiled ``Sequential`` model: three Dense blocks (1024, 512, 512 units), each
    followed by LeakyReLU(alpha=0.001) and Dropout(0.2), then a 3-unit softmax
    output, compiled with Adam (lr=1e-3), categorical cross-entropy and accuracy.
  """
  model_sdss_base = Sequential()
  # Input block: only this layer needs to know the feature count.
  model_sdss_base.add(Dense(1024, input_shape=(n_features,),kernel_initializer='random_uniform'))
  model_sdss_base.add(LeakyReLU(alpha=0.001))
  model_sdss_base.add(Dropout(rate=0.2))
  # Two identical hidden blocks.
  for hidden_units in (512, 512):
    model_sdss_base.add(Dense(hidden_units))
    model_sdss_base.add(LeakyReLU(alpha=0.001))
    model_sdss_base.add(Dropout(rate=0.2))
  # One output unit per class.
  model_sdss_base.add(Dense(3, activation='softmax'))
  adamopt=optimizers.Adam(lr=10**-3)
  model_sdss_base.compile(loss='categorical_crossentropy',
                    optimizer=adamopt, 
                    metrics=['accuracy'])
  return model_sdss_base

In [0]:
nn_els = EarlyStopping(monitor='acc', mode='max', verbose=1, patience=5)
nn_mck = ModelCheckpoint('best_observed_model.h5', monitor='acc', mode='max', verbose=1, save_best_only=True)
nn_model = KerasClassifier(build_fn=base_model, epochs=30, batch_size=300, verbose=1,callbacks=[nn_els,nn_mck])

In [0]:
nn_train, nn_test = get_stack_pred(nn_model)

In [0]:
# NOTE(review): this statement is identical to the one in the preceding cell.
# Running both repeats the whole stacking run for the neural network and
# overwrites nn_train / nn_test with the second run's predictions.
nn_train, nn_test = get_stack_pred(nn_model)

In [0]:
class_names = ['Galaxy', 'Quasar', 'Star']

In [0]:
from sklearn.metrics import confusion_matrix
cm_nn = confusion_matrix(Y_test, nn_test)
print(cm_nn)

In [0]:
# target_names belongs to classification_report; the original passed it to print().
print(classification_report(Y_test, nn_test, target_names=class_names))

In [0]:
df_nn_cm = pd.DataFrame(cm_nn,index=['Galaxy','Quasar','Star'],
                  columns = ['Galaxy','Quasar','Star'])
sb.heatmap(df_nn_cm, annot=True,fmt="g",cmap="Blues")

**Binomial test to check if the output of the base learner was not a result of pure chance**

In [0]:
stats.binom_test(58870, n=60615, p=0.973)

In [0]:
stats.binom_test(58870, n=60615, p=0.97,alternative="greater")

**Random Forest was run only to compare its performance with Extra Trees**

In [0]:
sdss_rf = RandomForestClassifier(criterion='gini',max_features='sqrt',min_samples_leaf=36,n_estimators=50,min_samples_split=2,n_jobs=-1,random_state=18120199)

In [0]:
# `get_predictions1` does not exist in this notebook; the stacking helper defined
# above is get_stack_pred, which every other base learner uses.
rf_train, rf_test = get_stack_pred(sdss_rf)

In [0]:
from sklearn.metrics import confusion_matrix
cm_rf = confusion_matrix(Y_test, rf_test)
print(cm_rf)

**Extra Tree Classifier**

In [0]:
from sklearn.ensemble import ExtraTreesClassifier

In [0]:
et_base = ExtraTreesClassifier(n_estimators=29,max_features= 0.95,criterion= 'entropy',min_samples_split= 2,
                            max_depth= 50, min_samples_leaf= 1)      

In [0]:
start_et=time.time()
et_train, et_test=get_stack_pred(et_base)
end_et=time.time()

In [0]:
et_time=end_et-start_et
print(et_time)

In [0]:
from sklearn.metrics import confusion_matrix
cm_et = confusion_matrix(Y_test, et_test)
print(cm_et)

In [0]:
df_et_cm = pd.DataFrame(cm_et,index=['Galaxy','Quasar','Star'],
                  columns = ['Galaxy','Quasar','Star'])
sb.heatmap(df_et_cm, annot=True,fmt="g",cmap="Blues")

In [0]:
class_name = ['Galaxy', 'Quasar', 'Star']
# Keyword corrected from the misspelled `taget_names`, which raised a TypeError.
print(classification_report(Y_test, et_test, target_names=class_name))

**Binomial Test**

In [0]:
stats.binom_test(59034, n=60615, p=0.97, alternative="greater")

In [0]:
stats.binom_test(59034, n=60615, p=0.973)

In [0]:
stats.binom_test(59265, n=60615, p=0.970,alternative="greater")

**Quadratic Discriminant Analysis**

In [0]:
sdss_qda = QuadraticDiscriminantAnalysis(tol=0.0001,reg_param=0.005)

In [0]:
qda_train, qda_test = get_stack_pred(sdss_qda)

In [0]:
from sklearn.metrics import confusion_matrix
cm_qda = confusion_matrix(Y_test, qda_test)
print(cm_qda)

In [0]:
#Plotting confusion matrix for QDA
df_qda_cm = pd.DataFrame(cm_qda,index=['Galaxy','Quasar','Star'],
                  columns = ['Galaxy','Quasar','Star'])
sb.heatmap(df_qda_cm, annot=True,fmt="g",cmap="Blues")

In [0]:
print(classification_report(Y_test, qda_test))

**Binomial Test**

In [0]:
stats.binom_test(58092, n=60615, p=0.956, alternative="greater")

In [0]:
stats.binom_test(58092, n=60615, p=0.957)

In [0]:
y1_test = np.asarray(Y_test)
sdss_qda.fit(X_train,Y_train)
misclassified = np.where(y1_test != sdss_qda.predict(X_test))

In [0]:
X_test[misclassified]

**XGBoost**

In [0]:
xgb_model = XGBClassifier(objective="multi:softprob", random_state=18120199,n_estimators=30,n_jobs=-1,learning_rate=0.2,max_depth=10,scale_pos_weight=1,subsample=0.6,gamma=0.001,reg_alpha=0.005,min_child_weight=89)

In [0]:
xgb_model_1=XGBClassifier(objective="multi:softprob",learning_rate=0.00331,colsample_bylevel=0.6097,colsample_bynode=1,colsample_bytree=0.735,gamma=4.117,max_delta_step=0,max_depth=6,min_child_weight=89,n_estimators=5000, reg_alpha=0.169,subsample=0.602,reg_lambda=3.06,scale_pos_weight=1)

In [0]:
start=time.time()
xgb_train, xgb_test = get_stack_pred(xgb_model)
end=time.time()

In [0]:
xgb_time=end-start
print(xgb_time)

In [0]:
from sklearn.metrics import confusion_matrix
cm_xgb = confusion_matrix(Y_test, xgb_test)
print(cm_xgb)

In [0]:
df_xgb_cm = pd.DataFrame(cm_xgb,index=['Galaxy','Quasar','Star'],
                  columns = ['Galaxy','Quasar','Star'])
sb.heatmap(df_xgb_cm, annot=True,fmt="g",cmap="Blues")

In [0]:

print(classification_report(Y_test, xgb_test))

**Binomial Test**

In [0]:
stats.binom_test(59056, n=60615, p=0.973, alternative="greater") 

In [0]:
stats.binom_test(59056, n=60615, p=0.974)

**SVM**

In [0]:
sdss_svm = svm.SVC(gamma=0.4, decision_function_shape='ovr',shrinking=True,kernel='rbf',C=7.64)

In [0]:
start_svm=time.time()
svm_train, svm_test = get_stack_pred(sdss_svm)
end_svm=time.time()

In [0]:
# Elapsed wall-clock time of the SVM stacking run, reported like the other learners.
# The original rebound the name `time` to a float, which shadowed the time module
# and broke any later time.time() call.
svm_time=end_svm-start_svm
print(svm_time)

In [0]:
from sklearn.metrics import confusion_matrix
cm_svm = confusion_matrix(Y_test, svm_test)
print(cm_svm)

In [0]:
df_svm_cm = pd.DataFrame(cm_svm,index=['Galaxy','Quasar','Star'],
                  columns = ['Galaxy','Quasar','Star'])
sb.heatmap(df_svm_cm, annot=True,fmt="g",cmap="Blues")

In [0]:
print(classification_report(Y_test, svm_test))

**Binomial Test**

In [0]:
stats.binom_test(58949, n=60615, p=0.971, alternative="greater")

In [0]:
stats.binom_test(58949, n=60615, p=0.972)

**Cochran's Q statistical test to check whether the outputs of the base learners are uncorrelated**

In [0]:
!pip install mlxtend 

In [0]:
nn_f=nn_test.flatten('F')
et_f=et_test.flatten('F')
qda_f=qda_test.flatten('F')
xgb_f=xgb_test.flatten('F')
svm_f=svm_test.flatten('F')

In [0]:
# The flattened neural-network predictions are stored in nn_f (see the previous cell).
print(nn_f)

In [0]:
from mlxtend.evaluate import cochrans_q

In [0]:
# cochrans_q takes the true labels first, followed by one prediction vector per
# model. The original passed the neural-network predictions in the label slot,
# so the other four learners were scored against the network instead of the truth.
chi_square, pvalue = cochrans_q(np.asarray(Y_test).ravel(), nn_f, et_f, qda_f, xgb_f, svm_f)
print('Chisquare value: %.3f' % chi_square)
print('pvalue: %f' % pvalue)

**Kruskal-Wallis H test**

In [0]:
from scipy import stats

In [0]:
# The stacked predictions are (n, 1) column vectors; kruskal expects one
# one-dimensional sample per group, so flatten each before testing.
stats.kruskal(nn_test.ravel(), qda_test.ravel(), svm_test.ravel(), xgb_test.ravel(), et_test.ravel())

**Wilcoxon signed-rank test**

In [0]:
#prediction result arrays need to be flattened before conducting tests on them 
nn_f=nn_test.flatten('F')
et_f=et_test.flatten('F')
qda_f=qda_test.flatten('F')
xgb_f=xgb_test.flatten('F')
svm_f=svm_test.flatten('F')

In [0]:
#Wilcoxon signed rank test between Neural Network and Extra trees
from scipy.stats import wilcoxon
wilcox_rank_nn_et, pvalue_nn_et = wilcoxon(nn_f,et_f)
print(wilcox_rank_nn_et)
print(pvalue_nn_et)

In [0]:
#Wilcoxon signed rank test between Neural Network and SVM
wilcox_rank_nn_svm, pvalue_nn_svm = wilcoxon(nn_f,svm_f)
print(wilcox_rank_nn_svm)
print(pvalue_nn_svm)

In [0]:
#Wilcoxon signed rank test between Neural Network and QDA
wilcox_rank_nn_qda, pvalue_nn_qda = wilcoxon(nn_f,qda_f)
print(wilcox_rank_nn_qda)
print(pvalue_nn_qda)

In [0]:
#Wilcoxon signed rank test between Neural Network and XGBoost
wilcox_rank_nn_xgb, pvalue_nn_xgb = wilcoxon(nn_f,xgb_f)
print(wilcox_rank_nn_xgb)
print(pvalue_nn_xgb)

In [0]:
#Wilcoxon signed rank test between Extra trees and QDA
wilcox_rank_et_qda, pvalue_et_qda = wilcoxon(qda_f,et_f)
print(wilcox_rank_et_qda)
print(pvalue_et_qda)

In [0]:
#Wilcoxon signed rank test between Extra trees and XGB
wilcox_rank_et_xgb, pvalue_et_xgb = wilcoxon(xgb_f,et_f)
print(wilcox_rank_et_xgb)
print(pvalue_et_xgb)

In [0]:
#Wilcoxon signed rank test between Extra trees and SVM
wilcox_rank_et_svm, pvalue_et_svm = wilcoxon(svm_f,et_f)
print(wilcox_rank_et_svm)
print(pvalue_et_svm)

In [0]:
#Wilcoxon signed rank test between XGB and SVM
wilcox_rank_xgb_svm, pvalue_xgb_svm = wilcoxon(xgb_f,svm_f)
print(wilcox_rank_xgb_svm)
print(pvalue_xgb_svm)

In [0]:
#Wilcoxon signed rank test between XGB and QDA
wilcox_rank_xgb_qda, pvalue_xgb_qda = wilcoxon(xgb_f,qda_f)
print(wilcox_rank_xgb_qda)
print(pvalue_xgb_qda)

In [0]:
#Wilcoxon signed rank test between SVM and QDA
wilcox_rank_svm_qda, pvalue_svm_qda = wilcoxon(svm_f,qda_f)
print(wilcox_rank_svm_qda)
print(pvalue_svm_qda)

**Combining Classifiers**

In [0]:
train_meta=np.concatenate((nn_train,et_train,qda_train,xgb_train,svm_train,),axis=1)
test_meta=np.concatenate((nn_test,et_test,qda_test,xgb_test,svm_test),axis=1)

In [0]:
train_meta.shape

In [0]:
X_train.shape

In [0]:
Y_train.shape

In [0]:
test_meta.shape

In [0]:
Y_test.shape

**Perceptron as Meta-Learner**

In [0]:
from sklearn.linear_model import Perceptron

In [0]:
single_percep = Perceptron(tol=1e-3, random_state=0)
single_percep.fit(train_meta,Y_train)

In [0]:
single_percep.score(test_meta,Y_test)

In [0]:
print(Y_train)

**Meta Learner Optimization**

**Saving files to be uploaded to Git, as the optimization function does not accept values from outside the data function**

In [0]:
from google.colab import files

In [0]:
col=['nn','et','qda','xgb','svm']

In [0]:
train_meta_pd=pd.DataFrame(train_meta,columns=col)
test_meta_pd=pd.DataFrame(test_meta,columns=col)
Y_train_pd=pd.DataFrame(Y_train)
Y_test_pd=pd.DataFrame(Y_test)

In [0]:
train_meta_pd.to_csv('train_metapd.csv',index = False)
test_meta_pd.to_csv('test_metapd.csv',index = False)
Y_train_pd.to_csv('ytrainpd.csv',index = False)
Y_test_pd.to_csv('ytestpd.csv',index = False)

In [0]:
files.download('train_metapd.csv')

In [0]:
files.download('test_metapd.csv')

In [0]:
files.download('ytrainpd.csv')

In [0]:
files.download('ytestpd.csv')

**Optimizing the meta-learner using Bayesian Optimization**

In [0]:
!pip install hyperas
!pip install hyperopt

In [0]:
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform
from keras import utils as cat_utils

Using TensorFlow backend.


In [0]:
# Data provider passed to hyperas' optim.minimize via data=meta_opt_data.
# It downloads the base-learner predictions (features) and the true labels
# (targets) for the train and validation sets, one-hot encodes the targets into
# 3 classes, and returns (xtrain, ytrain, xval, yval).
# NOTE(review): hyperas presumably works from this function's source text, which
# would explain why everything is kept local and why the returned names match the
# parameters of stack_opt_model. Confirm before restructuring.
def meta_opt_data():
    import keras
    from keras import utils as cat_utils
    from sklearn.preprocessing import LabelEncoder, PowerTransformer
    from keras.models import Sequential
    import time
    from keras.layers import Dense
    from keras.layers import Dropout
    from keras.layers import Activation
    import pandas as pd
    #importing files from my github repository for optimizing the meta learner
    train_meta = 'https://raw.githubusercontent.com/rohandongare-nci/18120199-Data/master/New-Base-learner-data/train_meta_base_op.csv'
    y_train='https://raw.githubusercontent.com/rohandongare-nci/18120199-Data/master/New-Base-learner-data/y_train.csv'
    test_meta='https://raw.githubusercontent.com/rohandongare-nci/18120199-Data/master/New-Base-learner-data/test_meta_base_op.csv'
    y_test='https://raw.githubusercontent.com/rohandongare-nci/18120199-Data/master/New-Base-learner-data/y_test.csv'
    # Features: one column per base learner (assumed from the file names; TODO confirm)
    xtrain = pd.read_csv(train_meta)
    ytrain=pd.read_csv(y_train)
    xval=pd.read_csv(test_meta)
    yval=pd.read_csv(y_test)
    # One-hot encode the integer class labels into 3 columns
    ytrain = cat_utils.to_categorical(ytrain, 3)
    yval = cat_utils.to_categorical(yval, 3)
    return xtrain, ytrain, xval, yval

In [0]:
def stack_opt_model(xtrain, ytrain, xval, yval):
    # hyperas model template for the meta-learner. Each double-brace expression
    # is a search-space placeholder that hyperas substitutes per trial.
    #   xtrain / xval: base-learner predictions, 5 columns (see input_shape below).
    #   ytrain / yval: one-hot targets with 3 classes (matches the softmax layer).
    # Returns the dict hyperopt expects: negated validation accuracy as the loss
    # (hyperopt minimises), the status flag and the trained model.
    model_meta_opt = Sequential()
    model_meta_opt.add(Dense({{choice([64,128,256,512,1028])}}, input_shape=(5,)))
    model_meta_opt.add(Activation({{choice(['relu', 'sigmoid','tanh'])}}))
    model_meta_opt.add(Dropout({{uniform(0, 1)}}))
    # Optional second hidden block, itself part of the search space.
    if {{choice(['one', 'two'])}} == 'two':
      model_meta_opt.add(Dense({{choice([64,128,256,512,1028])}}))
      model_meta_opt.add(Activation({{choice(['relu', 'sigmoid','tanh'])}}))
      model_meta_opt.add(Dropout({{uniform(0, 1)}}))
    model_meta_opt.add(Dense(3))
    model_meta_opt.add(Activation('softmax'))
    model_meta_opt.compile(loss='categorical_crossentropy',
                  optimizer={{choice(['rmsprop', 'adam', 'sgd'])}},
                  metrics=['accuracy'])
    # `epochs` is the Keras 2 name of the old `nb_epoch` argument and is what the
    # rest of this notebook already uses.
    model_meta_opt.fit(xtrain, ytrain,
              batch_size={{choice([150,300])}},
              epochs=20,
              verbose=1,
              validation_data=(xval, yval))
    meta_score, meta_acc = model_meta_opt.evaluate(xval, yval, verbose=1)
    print('Test accuracy:', meta_acc)
    return {'loss': -meta_acc, 'status': STATUS_OK, 'model': model_meta_opt}

In [0]:
from google.colab import files
uploaded = files.upload()

In [0]:
#xtrain, ytrain, xval, yval = get_data()
best_meta_run, best_meta_model = optim.minimize(stack_opt_model,
                                      data=meta_opt_data,
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      trials=Trials(),
                                      notebook_name='optimize_meta_learner')

In [0]:
print(best_meta_run)

In [0]:
best_meta_model