In [1]:
from collections import defaultdict
import pandas
from sklearn import ensemble
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
import numpy as np
from tensorflow import keras
import sklearn.metrics as mx
from matplotlib import pyplot as plt
import xgboost as xgb

In [2]:
def score_model(X_train, X_test, y_train, y_test, clf):
    """Fit ``clf`` on the training split and return its accuracy on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``clf.fit``.
    X_test, y_test : held-out features and labels used for scoring.
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    The value of ``sklearn.metrics.accuracy_score(y_test, predictions)``.
    """
    clf.fit(X_train, y_train)
    return mx.accuracy_score(y_test, clf.predict(X_test))

def ablation_test(df, label_type):
    """Leave-one-feature-out ablation study for three classifiers.

    The frame is cleaned with ``remove_unusable_features``, split 80/20 into
    train and test sets, and each classifier (Gaussian NB, RBF SVC, KNN) is
    scored once on all features to obtain a baseline accuracy. Every feature
    is then removed in turn, the classifiers are re-fitted on the remaining
    columns, and the change in accuracy relative to the baseline is recorded.

    Parameters
    ----------
    df : DataFrame containing the features and the ``label_type`` column.
    label_type : name of the label column to predict.

    Returns
    -------
    None. For each classifier a list of ``(feature, accuracy_delta)`` pairs is
    printed, sorted by delta in descending order. A positive delta means the
    accuracy went up when that feature was removed.
    """
    df = remove_unusable_features(df, label_type)
    X = df.drop([label_type], axis=1)
    y = df[label_type]

    classifiers = {
        'gnb': GaussianNB(),
        'svc': SVC(kernel='rbf', gamma=1, C=0.1, random_state=0),
        'neigh': KNeighborsClassifier(n_neighbors=6, p=2, weights='uniform'),
    }

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Baseline accuracy with every feature present.
    # NOTE: this is a plain function, so the module-level score_model is called
    # directly (the previous `self.__score_model__` raised NameError).
    base_score = {
        name: score_model(X_train, X_test, y_train, y_test, clf)
        for name, clf in classifiers.items()
    }

    n_features = X_train.shape[1]
    deltas = {name: dict() for name in classifiers}
    for i in range(n_features):
        # Boolean mask selecting every column except the i-th (positional, so
        # it does not depend on column labels).
        cols = [ndx != i for ndx in range(n_features)]
        X_train_reduced = X_train.iloc[:, cols]
        X_test_reduced = X_test.iloc[:, cols]
        for name, clf in classifiers.items():
            ablated = score_model(X_train_reduced, X_test_reduced, y_train, y_test, clf)
            deltas[name][X.columns[i]] = ablated - base_score[name]

    ranked = {
        name: sorted(per_feature.items(), key=lambda kv: kv[1], reverse=True)
        for name, per_feature in deltas.items()
    }

    print('Based on NB: ', ranked['gnb'])
    print('Based on SVC : ', ranked['svc'])
    print('Based on KNN: ', ranked['neigh'])

    

In [3]:
def remove_unusable_features(df, label_type):
    """Return a copy of ``df`` without unusable columns and incomplete rows.

    Three things are removed:

    * for a recognised ``label_type``, the replication-outcome columns other
      than the label itself (all of them must exist, otherwise ``drop`` raises
      ``KeyError``); an unrecognised ``label_type`` skips this step;
    * a fixed list of bibliographic / index columns, where present;
    * every row that still contains a missing value.

    The resulting shape is printed, and the cleaned frame is returned. The
    frame passed in is not modified.
    """
    outcome_columns_by_label = {
        'pvalue.label': ['P.value.R', 'Direction.R', 'O.within.CI.R', 'Meta.analysis.significant'],
        'O.within.CI.R': ['P.value.R', 'Direction.R', 'Meta.analysis.significant', 'pvalue.label'],
        'Meta.analysis.significant': ['P.value.R', 'Direction.R', 'O.within.CI.R', 'pvalue.label'],
    }
    if label_type in outcome_columns_by_label:
        df = df.drop(outcome_columns_by_label[label_type], axis=1)

    metadata_columns = ['DOI', '1st.author.O', 'Senior.author.O', 'Authors.O', 'Study.Title.O',
                        'Unnamed: 0', 'new_feature_301', 'Unnamed: 0.1']
    # errors='ignore' drops only the listed columns that actually exist.
    cleaned = df.drop(columns=metadata_columns, errors='ignore').dropna()
    print('Shape is: ', cleaned.shape)
    return cleaned


In [4]:
def get_baseline(df, label_type):
    """Print the class balance of ``label_type`` as a majority-class baseline.

    Rows whose label equals 1 are counted as "true"; every other row
    (including missing values) is counted as "false".

    Parameters
    ----------
    df : DataFrame containing the ``label_type`` column.
    label_type : name of the label column.

    Returns
    -------
    None. The row count and the two percentages are printed. For an empty
    frame only the row count and a short notice are printed, because the
    percentages are undefined.
    """
    total_val = len(df)
    print("Total rows is= ", total_val)
    if total_val == 0:
        # Avoid ZeroDivisionError: there is no class balance to report.
        print("No rows available; class percentages are undefined.")
        return

    # Vectorised count instead of a Python-level iterrows loop; int() keeps
    # the printed numbers plain Python floats.
    total_true = int((df[label_type] == 1).sum())
    total_false = total_val - total_true
    print("total_true % is= ", (total_true / total_val) * 100)
    print("total_false % is=", (total_false / total_val) * 100)

In [5]:
def select_best_features_chi2(df, label_type):
    """Rank the features of ``df`` by their chi-squared score against the label.

    Parameters
    ----------
    df : DataFrame containing the features and the ``label_type`` column.
    label_type : name of the label column.

    Returns
    -------
    (final_features, df)
        ``final_features`` is a DataFrame with columns ``'Specs'`` (feature
        name) and ``'Score'`` (chi-squared score), holding the top
        ``n_features - 2`` rows by score. ``df`` is the cleaned frame returned
        by ``remove_unusable_features``.

    Side effects
    ------------
    Sets the global pandas option ``display.max_rows`` so the whole ranking
    can be displayed.
    """
    df = remove_unusable_features(df, label_type)
    X = df.drop([label_type], axis=1)
    cols = X.columns
    # Rescale every feature to [0, 1] before scoring (chi2 rejects negative inputs).
    X = MinMaxScaler().fit_transform(X)
    y = df[label_type]
    # Only the per-feature scores are read below, so no selection is needed.
    # k='all' yields the same scores_ as k=10 but also works when fewer than
    # 10 features remain after cleaning.
    fit = SelectKBest(score_func=chi2, k='all').fit(X, y)
    featureScores = pandas.concat([pandas.DataFrame(cols), pandas.DataFrame(fit.scores_)], axis=1)
    featureScores.columns = ['Specs', 'Score']
    # Keep all but the two lowest-ranked rows.
    final_features = featureScores.nlargest(featureScores.shape[0] - 2, 'Score')
    pandas.set_option('display.max_rows', final_features.shape[0] + 1)
    return final_features, df


In [6]:
def _build_network(input_dim):
    """Create and compile the fully connected binary classifier used by ``modelling``."""
    model = keras.Sequential([
        keras.layers.Dense(16, input_dim=input_dim, activation='sigmoid'),
        keras.layers.Dense(8, activation='sigmoid'),
        keras.layers.Dense(16, activation='sigmoid'),
        keras.layers.Dense(8, activation='sigmoid'),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def modelling(df, label_type, label_dict):
    """Evaluate several classifiers on a fixed feature subset for one label.

    The frame is cleaned with ``remove_unusable_features``, the class balance
    is printed via ``get_baseline``, and five scikit-learn / XGBoost models are
    scored with 10-fold stratified cross-validation. A small Keras network is
    trained and evaluated on the same folds.

    Parameters
    ----------
    df : DataFrame containing the ten hard-coded ``new_feature_*`` columns and
        the ``label_type`` column.
    label_type : name of the label column to predict.
    label_dict : mapping from model name to a list (e.g. ``defaultdict(list)``);
        the mean score of every model is appended under the keys
        ``'Naive_Bayes'``, ``'SVC'``, ``'KNN'``, ``'Random_Forest'``,
        ``'XGBoost'`` and ``'Neural_Network'``.

    Returns
    -------
    None. Scores are printed and accumulated in ``label_dict``.
    """
    print('------------------------', 'Socre for: ', label_type, '------------------------')
    df = remove_unusable_features(df, label_type)
    get_baseline(df, label_type)
    # X = df.drop([label_type], axis=1)
    X = df[['new_feature_63', 'new_feature_93', 'new_feature_78', 'new_feature_139', 'new_feature_1',
            'new_feature_75', 'new_feature_292', 'new_feature_42', 'new_feature_111', 'new_feature_183']]

    y = df[label_type]
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    # (result key in label_dict, name used in the printed report, estimator)
    estimators = [
        ('Naive_Bayes', 'NB', GaussianNB()),
        ('SVC', 'SVC', SVC(kernel='rbf', gamma=0.9, C=1, random_state=0)),
        ('KNN', 'KNN', KNeighborsClassifier(n_neighbors=5, p=2, weights='uniform')),
        # max_features='sqrt' is what 'auto' meant for a forest classifier;
        # 'auto' is no longer accepted by recent scikit-learn releases.
        ('Random_Forest', 'Random Forest',
         ensemble.RandomForestClassifier(random_state=0, n_estimators=10, max_features='sqrt', max_depth=10,
                                         min_samples_split=2, min_samples_leaf=1, bootstrap=True)),
        ('XGBoost', 'XGB',
         xgb.XGBClassifier(max_depth=7, objective='binary:logistic', learning_rate=1, colsample_bytree=1,
                           reg_alpha=5, booster='gbtree', random_state=0)),
    ]

    cv_scores = {
        key: np.mean(cross_val_score(estimator, X, y, cv=skf, n_jobs=1))
        for key, _, estimator in estimators
    }
    for key, display_name, _ in estimators:
        print("Cross Validation Score of %s is: %.2f" % (display_name, cv_scores[key]))

    # Train a fresh network on every fold and collect its test accuracy.
    acc_arr = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model = _build_network(X.shape[1])
        model.fit(X_train, y_train, epochs=150, batch_size=10, verbose=0)
        _, accuracy = model.evaluate(X_test, y_test)
        acc_arr.append(accuracy)
    print('Accuracy of this neural network model is: %.2f' % np.mean(acc_arr))
    print(acc_arr, '\n\n')

    # Record results only after every model has been evaluated successfully.
    for key, _, _ in estimators:
        label_dict[key].append(cv_scores[key])
    label_dict['Neural_Network'].append(np.mean(acc_arr))

In [7]:
def plot_feature_graph(df, label_type, cols, clf):
    """Plot cross-validated accuracy of ``clf`` against the number of top features used.

    For every ``i`` from 10 up to (but not including) the number of rows in
    ``cols``, the first ``i`` feature names in ``cols['Specs']`` are selected
    from ``df`` and ``clf`` is scored with 10-fold stratified cross-validation.

    Parameters
    ----------
    df : DataFrame containing the feature columns and the ``label_type`` column.
    label_type : name of the label column.
    cols : DataFrame with a ``'Specs'`` column of feature names, in the order
        in which they should be added.
    clf : estimator passed to ``cross_val_score``.

    Returns
    -------
    None. A scatter plot is shown, titled ``<label_type>_<estimator class name>``.
    """
    labels = df[label_type]
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    sctr_plot = dict()
    for i in range(10, cols.shape[0]):
        lst = cols['Specs'].head(i)
        sctr_plot[i] = np.mean(cross_val_score(clf, df[lst], labels, cv=skf, n_jobs=1))

    # Materialise the dict views as lists: views are not sequences and are not
    # reliably converted to numeric arrays by the plotting call.
    feature_counts = list(sctr_plot.keys())
    accuracies = list(sctr_plot.values())
    plt.scatter(feature_counts, accuracies)
    plt.title(label_type + '_' + str(clf).split('(')[0])
    plt.xlabel("Number of Features")
    plt.ylabel("Accuracy")
    plt.show()


In [8]:
# Label columns that are each modelled in turn.
all_labels = ['pvalue.label', 'O.within.CI.R', 'Meta.analysis.significant']
fileName='data/final_network_data.xlsx'
# read_excel has no `encoding` parameter (passing one raises TypeError on
# current pandas), so the file is read without it.
df = pandas.read_excel(fileName)

In [9]:
# Collect one score per model for every label; modelling() appends to the
# lists in label_dict in place.
label_dict = defaultdict(list)
for label in all_labels:
    modelling(df, label, label_dict)

Shape is:  (60, 319)
Total rows is=  60
total_true % is=  41.66666666666667
total_false % is= 58.333333333333336
------------------------ Socre for:  pvalue.label ------------------------
Cross Validation Score of NB is: 0.75
Cross Validation Score of SVC is: 0.80
Cross Validation Score of KNN is: 0.72
Cross Validation Score of Random Forest is: 0.78
Cross Validation Score of XGB is: 0.77
Accuracy of this neural network model is: 0.78
[0.6666667, 0.8333333, 0.6666667, 0.8333333, 0.8333333, 1.0, 0.6666667, 0.6666667, 0.6666667, 1.0] 


Shape is:  (60, 319)
------------------------ Socre for:  O.within.CI.R ------------------------
Cross Validation Score of NB is: 0.75
Cross Validation Score of SVC is: 0.75
Cross Validation Score of KNN is: 0.70
Cross Validation Score of Random Forest is: 0.68
Cross Validation Score of XGB is: 0.63
Accuracy of this neural network model is: 0.73
[0.6666667, 0.8333333, 0.33333334, 0.6666667, 0.8333333, 0.8333333, 0.6666667, 1.0, 0.8333333, 0.6666667] 


Sh

In [15]:
print('Aggregated values of all the models are:')
# Each per-label score list in label_dict is overwritten in place by its mean.
for model_name in list(label_dict):
    label_dict[model_name] = np.mean(label_dict[model_name])
    print(model_name, ': %.2f' % label_dict[model_name])

Aggregated values of all the models are:
Naive_Bayes : 0.77
SVC : 0.77
KNN : 0.69
Random_Forest : 0.75
XGBoost : 0.72
Neural_Network : 0.76


In [10]:
# Various Tests
# print(features)
# ablation_test(df, label_type)

In [11]:
# for i in all_labels:
#     gnb = GaussianNB()
#     svc = SVC(kernel='rbf', gamma=0.9, C=1, random_state=0)
#     neigh = KNeighborsClassifier(n_neighbors=5, p=2, weights='uniform')
#     forest = ensemble.RandomForestClassifier(random_state=0, n_estimators=10, max_features='auto', max_depth=10,
#                                              min_samples_split=2, min_samples_leaf=1, bootstrap=True)

#     xgboost = xgb.XGBClassifier(max_depth=7, objective='binary:logistic', learning_rate=1, colsample_bytree=1, reg_alpha=5,
#                                 booster='gbtree')
#     for j in [gnb, svc, neigh, forest, xgboost]:
#         features, df = select_best_features_chi2(df, i)
#         plot_feature_graph(df, i, features, j)