# Student ID: 2487190G

# ANOVA Test (Initial run)

In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import statistics
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE, SelectKBest
from sklearn.feature_selection import f_classif
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut

In [2]:
# Labels and features are both read with header=None (no header row is consumed);
# the single label column is named 'labels'.
data_labels_df = pd.read_csv("labels.csv", header=None).set_axis(['labels'], axis=1)
data_features_df = pd.read_csv("data.csv", header=None)

# The feature names are the column labels obtained by parsing feature_names.csv.
data_columns_list = pd.read_csv("feature_names.csv").columns.tolist()

# Attach the feature names to the feature matrix.
data_features_df = data_features_df.set_axis(data_columns_list, axis=1)

In [3]:
#helper function to generate a final validation dataset for the final classifier comparison
#it uses the same concept as LeaveOneGroupOut

def _pop_random_group(pool_df, candidate_rows=None):
    '''
    Remove one randomly chosen group from pool_df (in place) and return its rows.

    Parameters:
    pool_df => dataframe with a 'groups' column; the chosen group's rows are dropped from it
    candidate_rows => optional subset of pool_df; when given, the group id is drawn
                      only from the groups present in this subset

    Returns a dataframe with ALL rows of the chosen group (taken from pool_df, not
    from candidate_rows, so no row of the group is lost).

    Raises IndexError (from random.choice) when there is no candidate group.
    '''
    source = pool_df if candidate_rows is None else candidate_rows
    # list(set(...)) is kept as-is so that the candidate order handed to
    # random.choice (and therefore the pick for a given seed) does not change.
    group_id = random.choice(list(set(source['groups'].values.tolist())))
    picked = pool_df[pool_df['groups'] == group_id]
    pool_df.drop(picked.index, inplace=True)
    return picked


def custom_train_test_split(x, y, split_group_size=3, random_state=123,
                            n_groups=18, samples_per_group=10):
    '''
    Split the data into train and test sets by whole groups (LeaveOneGroupOut
    concept): random groups are moved to the test set, never random rows.

    Parameters:
    x => Feature (dataframe)
    y => Label (dataframe) with a 'labels' column holding 0/1
    split_group_size => number of groups wanted in the test set (at least 2)
    random_state => value passed to random.seed(); note this seeds the global
                    `random` module state
    n_groups => number of groups in the data (default 18)
    samples_per_group => number of consecutive rows belonging to each group (default 10)

    ----------------------------------------------------------------------------
    Rows are assigned to groups by position: the first `samples_per_group` rows
    are group 0, the next ones group 1, and so on. x and y are joined by index,
    so they are expected to share the same index (presumably the default
    0..N-1 index; verify against the caller).

    The test set always contains at least one group picked among rows labelled 1
    and one group picked among rows labelled 0; any remaining groups are picked
    at random from what is left.

    -----------------------------------------------------------------------------
    Returns X_train_df, X_test_df, Y_train_df, Y_test_df (all with a fresh
    0..N-1 index) and `groups`, the number of groups left in the train set.

    Raises ValueError when split_group_size is not in [2, n_groups] or when x / y
    do not have n_groups * samples_per_group rows. Raises IndexError (from
    random.choice) when one of the two classes has no group to pick from.
    '''
    if not 2 <= split_group_size <= n_groups:
        raise ValueError(
            f"split_group_size must be between 2 and {n_groups}, got {split_group_size}"
        )
    expected_rows = n_groups * samples_per_group
    if len(x) != expected_rows or len(y) != expected_rows:
        raise ValueError(
            f"x and y must both have {expected_rows} rows "
            f"(got {len(x)} and {len(y)})"
        )

    random.seed(random_state)

    # group id of every row: 0,0,...,0,1,1,...,1,... (samples_per_group times each)
    groups_df = pd.DataFrame(
        {'groups': np.repeat(np.arange(n_groups), samples_per_group)}
    )

    # features first, then the target column, then the groups column (last)
    main_data_df = pd.concat([x, y, groups_df], axis=1)

    # Objective is to always have both classes in the test data:
    test_pieces = []

    # one group chosen among the rows of the POSITIVE class
    test_pieces.append(
        _pop_random_group(main_data_df, main_data_df[main_data_df['labels'] == 1])
    )
    # one group chosen among the remaining rows of the NEGATIVE class
    test_pieces.append(
        _pop_random_group(main_data_df, main_data_df[main_data_df['labels'] == 0])
    )
    # two groups are already selected; the remaining group(s) are a free random pick
    for _ in range(split_group_size - 2):
        test_pieces.append(_pop_random_group(main_data_df))

    # the most recently picked group goes on top, as each new pick used to be
    # concatenated above the previous ones
    test_df = pd.concat(test_pieces[::-1], axis=0)

    # main_data_df no longer holds the test groups, so it is the train set.
    # The last two columns are the target label and the groups column.
    X_train_df = main_data_df.iloc[:, :-2].reset_index(drop=True)
    Y_train_df = main_data_df.iloc[:, -2].reset_index(drop=True)
    X_test_df = test_df.iloc[:, :-2].reset_index(drop=True)
    Y_test_df = test_df.iloc[:, -2].reset_index(drop=True)

    # number of groups left for training; fed to LeaveOneGroupOut by the caller
    groups = n_groups - split_group_size

    return X_train_df, X_test_df, Y_train_df, Y_test_df, groups



#custom_train_test_split(x=data_features_df,y=data_labels_df,split_group_size=2,random_state=123)




### The cell below performs cross-validation to determine the best feature combination from ANOVA:

In [4]:
# The custom splitting ensures that the held-out test set always has both classes.
# This is important because otherwise the accuracy measured on it would suffer.
start_time = time.time()

# X_test_val / Y_test_val are not used in this cell.
X_train_val, X_test_val, Y_train_val, Y_test_val, groups = custom_train_test_split(
    x=data_features_df, y=data_labels_df, split_group_size=3, random_state=123
)

# One group id per training row for LeaveOneGroupOut: 0 x10, 1 x10, ...
# (10 consecutive rows per group is hard-coded here).
group_cv = sorted(list(range(groups)) * 10)

logistic_clf = LogisticRegression()

logo_cv = LeaveOneGroupOut()

# Two independent lists; a chained assignment (a = b = []) would make both
# names refer to the same list object.
avg_accuracy_list_logistic = []
avg_score_list_logistic = []

ANOVA_selected_columns = dict()

ANOVA_ranks = list(range(10, 433, 10))
labels_train = Y_train_val.values

for rank in ANOVA_ranks:

    # NOTE(review): SelectKBest is fitted on the whole training set BEFORE the
    # leave-one-group-out split, so every held-out group has already influenced
    # the feature selection and the CV accuracies below are likely optimistic.
    # Fitting the selector inside each fold (e.g. via a Pipeline) would avoid
    # this, but would change the recorded results - left as-is, confirm intent.
    ANOVA = SelectKBest(f_classif, k=rank)
    new_features_ANOVA = ANOVA.fit_transform(X_train_val, Y_train_val)

    # keeps track of the columns chosen by SelectKBest for this rank
    ANOVA_selected_columns[rank] = ANOVA.get_support(indices=True)

    accuracy_list = []

    for train, test in logo_cv.split(new_features_ANOVA, labels_train, group_cv):

        X_train_cv, X_test_cv = new_features_ANOVA[train], new_features_ANOVA[test]
        Y_train_cv, Y_test_cv = labels_train[train], labels_train[test]

        logistic_clf.fit(X_train_cv, Y_train_cv)

        y_predict = logistic_clf.predict(X_test_cv)

        # Fraction of correct predictions. For 0/1 labels this equals
        # (tp+tn)/(tp+tn+fp+fn); int(...) keeps the result a plain Python float.
        accuracy = int(np.sum(y_predict == Y_test_cv)) / len(y_predict)
        accuracy_list.append(accuracy)                          # per-fold history

    # average accuracy over all folds for this number of features
    avg_accuracy_list_logistic.append(statistics.mean(accuracy_list))

# index() returns the first maximum, so ties resolve to the smallest rank
best_rank = ANOVA_ranks[avg_accuracy_list_logistic.index(max(avg_accuracy_list_logistic))]


print(f"The average accuracy collected at every iteration:{avg_accuracy_list_logistic}")
print("\n")
print(f"The best rank is {best_rank} at highest accuracy ({max(avg_accuracy_list_logistic):.3f}) from LOGO cross-validation ")
print("\n")

elapsed_time = time.time() - start_time

print("Total time taken to run ANOVA test:\n")
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
# The string below is deliberately the last expression of the cell: the notebook
# displays it as the cell's output.
'''The accuracy formula accuracy=(tp+tn)/(tp+tn+fp+fn) or the accuracy_score(Y_test_cv.tolist(),y_predict.tolist())
give the same result, so using anyone of them'''

The average accuracy collected at every iteration:[0.74, 0.7066666666666667, 0.7666666666666667, 0.7733333333333333, 0.7733333333333333, 0.76, 0.7733333333333333, 0.7933333333333333, 0.7866666666666666, 0.78, 0.78, 0.7533333333333333, 0.7533333333333333, 0.7666666666666667, 0.78, 0.7733333333333333, 0.7666666666666667, 0.7666666666666667, 0.78, 0.7666666666666667, 0.7533333333333333, 0.76, 0.76, 0.7733333333333333, 0.7866666666666667, 0.7866666666666667, 0.7866666666666667, 0.7866666666666667, 0.78, 0.8, 0.8066666666666666, 0.8133333333333334, 0.8266666666666667, 0.8266666666666667, 0.8333333333333334, 0.8266666666666667, 0.8133333333333334, 0.8133333333333334, 0.8066666666666666, 0.8066666666666666, 0.8133333333333334, 0.8133333333333334, 0.8066666666666666]


The best rank is 350 at highest accuracy (0.833) from LOGO cross-validation 


Total time taken to run ANOVA test:

00:00:03


'The accuracy formula accuracy=(tp+tn)/(tp+tn+fp+fn) or the accuracy_score(Y_test_cv.tolist(),y_predict.tolist())\ngive the same result, so using anyone of them'

# The best feature combination has 350 features in total. Moving on to Mutual Information