In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
import os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from pandas import ExcelWriter
from sklearn.ensemble import RandomForestClassifier
import openpyxl

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def process_data(data_train,data_val,cv):  
    x_column_list = data_train.drop(columns=['y_b']).columns  
    percent_label=[round(100*len(np.where(data_train['y_b']==0)[0])/len(data_train)),round(100*len(np.where(data_val['y_b']==0)[0])/len(data_val))]  
    #Classification RF  
    pipeRF = Pipeline([('classifier', [RandomForestClassifier()])])  
    param_grid = [  
    {'classifier' : [RandomForestClassifier()], 
    'classifier__n_estimators': [100, 200],
    'classifier__min_samples_split': [8, 10],
    'classifier__min_samples_leaf': [3, 4, 5],
     'classifier__max_depth': [80, 90],
    'classifier__criterion':('gini','entropy'),  
    'classifier__class_weight':('balanced','auto')}]  
    clf = GridSearchCV(pipeRF, param_grid = param_grid, cv = cv, n_jobs=-1, scoring='f1_weighted')  
    # Fit on data  
    clf.fit(data_train[x_column_list],data_train['y_b'])  
    best_clf=clf.best_estimator_  
    y_valid=best_clf.predict(data_val[x_column_list])  
    report_All = classification_report(data_val['y_b'],y_valid,output_dict=True)  
    dAll=pd.DataFrame(report_All).transpose()  
    return dAll

In [4]:
cv=RepeatedKFold(n_splits=10,n_repeats=3, random_state=100)

In [13]:
data_path2 ='/Users/rosa/Desktop/ALLWork/Madison/Project/Soil-nn/Code/python code local/Main Data Files/disease_response'

In [14]:
path_response='/Users/rosa/Desktop/ALLWork/Madison/Project/Soil-nn/Code/python code local/Main Data Files/disease_response/'  

In [7]:
path_list2 = []
#reading files in folder response
for root, dirs, files in os.walk(data_path2, topdown=False):
    for path in dirs:
        path_list2.append(path)
#reading sheet name       
wb = openpyxl.load_workbook(data_path2+'/'+path+'/feature_selection.xlsx')
sheet_list = wb.sheetnames

results_dic = dict.fromkeys(sheet_list)

for sheet_name in results_dic.keys():
    temp_df = pd.DataFrame(columns=path_list2, index=range(0,700))
    for folder in path_list2:
        data_temp = pd.read_excel(data_path2+'/'+folder+'/feature_selection.xlsx', sheet_name=sheet_name)
        temp_df[folder].iloc[
            range(0, len(data_temp['Unnamed: 0'].values))] = data_temp['Unnamed: 0'].values

    results_dic[sheet_name] = temp_df
    
for key in results_dic.keys():
    results_dic[key] = results_dic[key].iloc[range(0,max(30, round(len(data_temp)/3)))]

In [8]:
data_path = '/Users/rosa/Desktop/ALLWork/Madison/Project/Soil-nn/Code/python code local/Main Data Files/response_netcomi'

path_list = []

for root, dirs, files in os.walk(data_path, topdown=False):
    for path in dirs:
        path_list.append(path)

In [9]:
list_level = ['Class', 'Family', 'Genus', 'Order', 'Phylum']

all_data = dict.fromkeys(dirs)

for folder in path_list:
    
    all_data[folder] = dict.fromkeys(list_level)
    temp_path = data_path+'/'+folder
    temp_file_list = os.listdir(temp_path)
    for file in temp_file_list:
        if 'Count' in file:
            level = file.split('_')[-1][:-4]
            data_temp = pd.read_csv(data_path+'/'+folder+'/'+file,index_col=0)
            data_temp.sort_values(by='dif_close', ascending=False, inplace=True)
            #all_data[folder][level] = data_temp.iloc[range(0,max(40, round(len(data_temp)/3)))]
            all_data[folder][level] = data_temp.iloc[range(0,max(30, round(len(data_temp)/3)))]
            #print(level,":",max(30 ,round(len(data_temp)/3)))

print('Done.')

Done.


In [11]:
list_level = ['Class', 'Family', 'Genus', 'Order', 'Phylum']
SELECTED_FEATURE = dict.fromkeys(dirs)
writer= pd.ExcelWriter(path_response+'important_features_score'+'.xlsx', engine='xlsxwriter') 
for col in range(0,len(list_level)):
    level = list_level[col]
    print(level)
    SELECTED_FEATURE[level] = dict.fromkeys(dirs)
    response_list = path_list
    feature_list = []
    for response in response_list:
        feature_list.append(all_data[response][level].index.union(results_dic[level][response][results_dic[level][response].notnull()].values))
    feature_list = [item for subitem in feature_list for item in subitem]
    feature_list = np.unique(feature_list)
    print(len(feature_list))
    matrix_df = pd.DataFrame(columns = response_list, index = feature_list)
    for response in response_list:
        for feature in feature_list:
                if (feature in all_data[response][level].index) & (feature in results_dic[level][response].values):
                    matrix_df[response].loc[feature] = 3
                elif(feature in all_data[response][level].index) &(feature not in results_dic[level][response].values):
                    matrix_df[response].loc[feature] = 2##NetComi
                elif (feature not in all_data[response][level].index) &(feature in results_dic[level][response].values):
                    matrix_df[response].loc[feature] = 1##ML
                else:
                    matrix_df[response].loc[feature] = 0##NotMLnotNetcomi
            
        #print(matrix_df[response])
#         ree=re[matrix_df[response]==3]
#         print(ree)
#         SELECTED_FEATURE[response][level]= ree.index     
        SELECTED_FEATURE[level][response] = matrix_df[response][matrix_df[response]==3].index
        print(SELECTED_FEATURE[level][response].shape[0])
    matrix_df['Sum'] = 0
    for index in matrix_df.index:
        matrix_df['Sum'].loc[index] = sum(matrix_df.loc[index].values)
    
    matrix_df.sort_values(by='Sum', ascending=False, inplace=True)
    matrix_df.to_excel(writer, sheet_name=level, index=True)
writer.save()     
    #matrix_df=matrix_df.iloc[1:30,:]
  

Class
78
7
8
8
Family
145
9
8
9
Genus
239
5
8
12
Order
132
9
6
10
Phylum
41
22
20
22


In [None]:
for file_response in os.listdir(path_response):  
    if (file_response != '.DS_Store') & (file_response != 'Icon\r'):  
        print(file_response)  
        path_r= path_response+file_response  
        os.chdir(path_r)  
        for re in os.listdir(path_r):  
            if re[0:8] == 'response':  
                response = pd.read_csv(path_response+file_response+'/'+re)  
                response.rename(columns={'Column1':'Link_ID',response.columns[1]:'y_b'}, inplace=True)  
                #response.drop(columns=response.columns[2], inplace=True)  
                #response=response.drop(columns='Variety2')  
                path_x = '/Users/rosa/Desktop/ALLWork/Madison/Project/Soil-nn/Code/python code local/Main Data Files/normalized_data_sklearn/'  
                writer= pd.ExcelWriter(path_r+'/'+'classification_RF_FS'+'.xlsx', engine='xlsxwriter')   
                for file_folder in os.listdir(path_x):  
                    if (file_folder[-4:] != '.csv') & (file_folder != '.DS_Store')& (file_folder != 'Icon\r'):          
                        path = path_x+file_folder  
                        os.chdir(path)  
                        file_list = []  
                        tRF=pd.DataFrame()  
                        tcluster=pd.DataFrame()  
                        k=0  
                        for file in os.listdir(path):  
                            if (file[0] != 't') & (file[-4:] == '.csv') & (file != '.DS_Store')& (file_folder != 'Icon\r'):  
                                print(file)  
                                file_list.append(file)  
                                data_temp = pd.read_csv(file) 
                                data_temp.rename(columns={'Unnamed: 0':'Link_ID'}, inplace=True) 
                                A = [item for item in SELECTED_FEATURE[file_folder][file_response]]
                                A.insert(0, "Link_ID")
                                data_temp = data_temp[A]
                                data_temp.rename(columns={'Unnamed: 0':'Link_ID'}, inplace=True)  
                                data=pd.merge(response,data_temp,on='Link_ID')
                                data.drop(columns = 'Link_ID',inplace=True)  
                                data_train,data_val = train_test_split(data,train_size=0.8, random_state=42)  
                                output = process_data(data_train,data_val,cv)  
                                tRF[k]=pd.DataFrame(output['f1-score'].values)         
                                k=k+1  
                        tRF.to_excel(writer, sheet_name=file_folder, index=True)  
                writer.save()                      

no_tuber_scab
 1 _ 3 .csv
 4 _ 1 .csv
 5 _ 1 .csv
 1 _ 2 .csv
 1 _ 4 .csv
 3 _ 1 .csv
 2 _ 1 .csv
 2 _ 2 .csv
 3 _ 2 .csv
 4 _ 4 .csv
 5 _ 4 .csv
 2 _ 3 .csv
 3 _ 3 .csv
 1 _ 1 .csv
 3 _ 4 .csv
 2 _ 4 .csv
 5 _ 3 .csv
 4 _ 3 .csv
 5 _ 2 .csv
 4 _ 2 .csv
 1 _ 3 .csv
 4 _ 1 .csv
 5 _ 1 .csv
 1 _ 2 .csv
 1 _ 4 .csv
 3 _ 1 .csv
 2 _ 1 .csv
 2 _ 2 .csv
 3 _ 2 .csv
 4 _ 4 .csv
 5 _ 4 .csv
 2 _ 3 .csv
 3 _ 3 .csv
 1 _ 1 .csv
 3 _ 4 .csv
 2 _ 4 .csv
 5 _ 3 .csv
 4 _ 3 .csv
 5 _ 2 .csv
 4 _ 2 .csv
 1 _ 3 .csv
 4 _ 1 .csv
 5 _ 1 .csv
 1 _ 2 .csv
 1 _ 4 .csv
 3 _ 1 .csv
 2 _ 1 .csv
 2 _ 2 .csv
 3 _ 2 .csv
 4 _ 4 .csv
 5 _ 4 .csv
 2 _ 3 .csv
 3 _ 3 .csv
 1 _ 1 .csv
 3 _ 4 .csv
 2 _ 4 .csv
 5 _ 3 .csv
 4 _ 3 .csv
 5 _ 2 .csv
 4 _ 2 .csv
 1 _ 3 .csv
 4 _ 1 .csv
 5 _ 1 .csv
 1 _ 2 .csv
 1 _ 4 .csv
 3 _ 1 .csv
 2 _ 1 .csv
 2 _ 2 .csv
 3 _ 2 .csv
 4 _ 4 .csv
 5 _ 4 .csv
 2 _ 3 .csv
 3 _ 3 .csv
 1 _ 1 .csv
 3 _ 4 .csv
 2 _ 4 .csv
 5 _ 3 .csv
 4 _ 3 .csv
 5 _ 2 .csv
 4 _ 2 .csv
 1 _ 3 .csv
 4 _ 1 .csv
 5

In [24]:
list_level = ['Class', 'Family', 'Genus', 'Order', 'Phylum']
SELECTED_FEATURE = dict.fromkeys(dirs)
SELECTED_FEATURE_0 = dict.fromkeys(dirs)
for col in range(0,len(list_level)):
    level = list_level[col]
    print(level)
    SELECTED_FEATURE[level] = dict.fromkeys(dirs)
    SELECTED_FEATURE_0[level] = dict.fromkeys(dirs)
    response_list = path_list
    feature_list = []
    for response in response_list:
        feature_list.append(all_data[response][level].index.union(results_dic[level][response][results_dic[level][response].notnull()].values))
    feature_list = [item for subitem in feature_list for item in subitem]
    feature_list = np.unique(feature_list)
    print(len(feature_list))
    matrix_df = pd.DataFrame(columns = response_list, index = feature_list)
    for response in response_list:
        for feature in feature_list:
                if (feature in all_data[response][level].index) & (feature in results_dic[level][response].values):
                    matrix_df[response].loc[feature] = 3
                elif(feature in all_data[response][level].index) &(feature not in results_dic[level][response].values):
                    matrix_df[response].loc[feature] = 2##NetComi
                elif (feature not in all_data[response][level].index) &(feature in results_dic[level][response].values):
                    matrix_df[response].loc[feature] = 1##ML
                else:
                    matrix_df[response].loc[feature] = 0##NotMLnotNetcomi
            
        #print(matrix_df[response])
#         ree=re[matrix_df[response]==3]
#         print(ree)
#         SELECTED_FEATURE[response][level]= ree.index     
        SELECTED_FEATURE[level][response] = matrix_df[response][matrix_df[response]==3].index
        print(SELECTED_FEATURE[level][response].shape[0])
        SELECTED_FEATURE_0[level][response] = matrix_df[response][matrix_df[response]==0].index[0:SELECTED_FEATURE[level][response].shape[0]]
        #print(SELECTED_FEATURE_0[level][response].shape[0])
    
    matrix_df['Sum'] = 0
    for index in matrix_df.index:
        matrix_df['Sum'].loc[index] = sum(matrix_df.loc[index].values)
    
    matrix_df.sort_values(by='Sum', ascending=False, inplace=True)
    #matrix_df=matrix_df.iloc[1:30,:]
  

Class
93
8
12
11
9
10
10
10
10
7
Family
186
11
10
13
13
14
7
13
10
8
Genus
306
7
11
12
11
11
10
9
11
12
Order
168
11
14
9
11
12
8
11
10
10
Phylum
42
21
18
21
19
20
20
21
21
21


In [None]:
path_response='/Users/rosa/Desktop/ALLWork/Madison/Project/Soil-nn/Code/python code local/Main Data Files/response_sklearn/'  
for file_response in os.listdir(path_response):  
    if (file_response != '.DS_Store') & (file_response != 'Icon\r'):  
        print(file_response)  
        path_r= path_response+file_response  
        os.chdir(path_r)  
        for re in os.listdir(path_r):  
            if re[0:8] == 'response':  
                response = pd.read_csv(path_response+file_response+'/'+re)  
                response.rename(columns={'Column1':'Link_ID','x1':'y_b'}, inplace=True)  
                response.drop(columns=response.columns[2], inplace=True)  
                response=response.drop(columns='Variety2')  
                path_x = '/Users/rosa/Desktop/ALLWork/Madison/Project/Soil-nn/Code/python code local/Main Data Files/normalized_data_sklearn/'  
                writer= pd.ExcelWriter(path_r+'/'+'classification_RF_FS_notImportant'+'.xlsx', engine='xlsxwriter')   
                for file_folder in os.listdir(path_x):  
                    if (file_folder[-4:] != '.csv') & (file_folder != '.DS_Store')& (file_folder != 'Icon\r'):          
                        path = path_x+file_folder  
                        os.chdir(path)  
                        file_list = []  
                        tRF=pd.DataFrame()  
                        tcluster=pd.DataFrame()  
                        k=0  
                        for file in os.listdir(path):  
                            if (file[0] != 't') & (file[-4:] == '.csv') & (file != '.DS_Store')& (file_folder != 'Icon\r'):  
                                print(file)  
                                file_list.append(file)  
                                data_temp = pd.read_csv(file) 
                                data_temp.rename(columns={'Unnamed: 0':'Link_ID'}, inplace=True) 
                                A = [item for item in SELECTED_FEATURE_0[file_folder][file_response]]
                                A.insert(0, "Link_ID")
                                data_temp = data_temp[A]
                                data_temp.rename(columns={'Unnamed: 0':'Link_ID'}, inplace=True)  
                                data=pd.merge(response,data_temp,on='Link_ID')
                                
                                data.drop(columns = 'Link_ID',inplace=True)  
                                data_train,data_val = train_test_split(data,train_size=0.8, random_state=42)  
                                output = process_data(data_train,data_val,cv)  
                                tRF[k]=pd.DataFrame(output['f1-score'].values)         
                                k=k+1  
                        tRF.to_excel(writer, sheet_name=file_folder, index=True)  
                writer.save()                      

yield_per_meter
 1 _ 3 .csv
 4 _ 1 .csv
 5 _ 1 .csv
 1 _ 2 .csv
 1 _ 4 .csv
 3 _ 1 .csv
 2 _ 1 .csv
 2 _ 2 .csv
 3 _ 2 .csv
 4 _ 4 .csv
 5 _ 4 .csv
 2 _ 3 .csv
 3 _ 3 .csv
 1 _ 1 .csv
 3 _ 4 .csv
 2 _ 4 .csv
 5 _ 3 .csv
 4 _ 3 .csv
 5 _ 2 .csv
 4 _ 2 .csv
 1 _ 3 .csv
 4 _ 1 .csv
 5 _ 1 .csv
 1 _ 2 .csv
 1 _ 4 .csv
 3 _ 1 .csv
 2 _ 1 .csv
 2 _ 2 .csv
 3 _ 2 .csv
 4 _ 4 .csv
 5 _ 4 .csv
 2 _ 3 .csv
 3 _ 3 .csv
 1 _ 1 .csv
 3 _ 4 .csv
 2 _ 4 .csv
 5 _ 3 .csv
 4 _ 3 .csv
 5 _ 2 .csv
 4 _ 2 .csv
 1 _ 3 .csv
 4 _ 1 .csv
 5 _ 1 .csv
 1 _ 2 .csv
 1 _ 4 .csv
 3 _ 1 .csv
 2 _ 1 .csv
 2 _ 2 .csv
 3 _ 2 .csv
 4 _ 4 .csv
 5 _ 4 .csv
 2 _ 3 .csv
 3 _ 3 .csv
 1 _ 1 .csv
 3 _ 4 .csv
 2 _ 4 .csv
 5 _ 3 .csv
 4 _ 3 .csv
 5 _ 2 .csv
 4 _ 2 .csv
 1 _ 3 .csv
 4 _ 1 .csv
 5 _ 1 .csv
 1 _ 2 .csv
 1 _ 4 .csv
 3 _ 1 .csv
 2 _ 1 .csv
 2 _ 2 .csv
 3 _ 2 .csv
 4 _ 4 .csv
 5 _ 4 .csv
 2 _ 3 .csv
 3 _ 3 .csv
 1 _ 1 .csv
 3 _ 4 .csv
 2 _ 4 .csv
 5 _ 3 .csv
 4 _ 3 .csv
 5 _ 2 .csv
 4 _ 2 .csv
 1 _ 3 .csv
 4 _ 1 .csv
