# Visualization - For Origin & flavo

Author: Sandra Godinho Silva \
Creation date: 23/09/2020 \
Version: 0.4

In [13]:
import os
#os.chdir("drive/Feature_selection/")
#os.chdir("/drive/My Drive/Chapter 3 - Flavobacteriaceae genomes/Data&Code/5_Visualization2/")

In [14]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt

# Functions

In [15]:
def LoadParameters():
    """Read the feature-selection summary table from the working directory.

    Returns:
        The contents of ``Security_evolution_origin_flavo.csv`` as a
        pandas DataFrame.
    """
    return pd.read_csv("Security_evolution_origin_flavo.csv")

In [16]:
def textToList(hashtags):
    """Split the text form of a list (e.g. "['a', 'b']") into its items.

    Square brackets at either end are stripped, every single quote and
    every space is removed, and what is left is split on commas.
    """
    unwrapped = hashtags.strip('[]')
    # One pass that deletes both the quote and the space characters.
    compact = unwrapped.translate(str.maketrans('', '', "' "))
    return compact.split(',')

In [17]:
def LoadingData(parameters, file, class_name):
    """
    Load the original dataset (df_original) and build a dataframe holding
    only its selected features plus metadata columns (df_FS).

    Args:
        parameters: DataFrame with the columns 'Dataset', 'Class' and
            'Selected attributes' (the latter holding the text form of a list).
        file: Path of the CSV file to load; its base name without extension
            is matched against the 'Dataset' column.
        class_name: Value matched against the 'Class' column.

    Returns:
        Tuple ``(df_original, df_FS, dataset)`` where ``dataset`` is a
        one-element list with the part of the file name before the first
        underscore.

    Raises:
        IndexError: If no row of ``parameters`` matches the dataset name and
            class.
        KeyError: If selected features or the metadata columns are missing
            from the dataset.
    """
    name = file.split("/")[-1].split(".")[0]
    dataset = name.split("_")[0:1]
    print(name)
    df_original = pd.read_csv(file, low_memory=False)

    # Combine both conditions in a single mask and use the function's own
    # `class_name` argument rather than a notebook-level global.
    matches = parameters[
        (parameters['Dataset'] == name) & (parameters['Class'] == class_name)
    ]
    if matches.empty:
        raise IndexError(
            "No parameters row for dataset '" + str(name)
            + "' and class '" + str(class_name) + "'"
        )

    # The first matching row carries the attribute list; its last entry is
    # the class column, which is dropped.
    features = textToList(matches['Selected attributes'].iloc[0])[:-1]

    metadata = ["Origin", "Genus", "Family"]
    try:
        # Prefer keeping the accession column when the dataset has one.
        df_FS = df_original[features + ["Assembly accession"] + metadata]
    except KeyError:
        df_FS = df_original[features + metadata]

    return df_original, df_FS, dataset

In [18]:
def PrepareData(df):
    """Split a dataframe into its non-object and object-typed columns.

    Returns:
        Tuple ``(df_matrix, df_categorical)``: the columns whose dtype is not
        ``object`` with missing values replaced by 0, and the columns whose
        dtype is ``object``, left untouched.
    """
    df_categorical = df.select_dtypes(include=['object'])
    df_matrix = df.select_dtypes(exclude=['object']).fillna(0)
    return df_matrix, df_categorical

In [19]:
def print_best_worst(scores, n_top=10):
    """Return the highest-scoring entries as a table.

    Args:
        scores: Iterable of ``(score, identifier)`` pairs.
        n_top: Maximum number of rows to return (default 10).

    Returns:
        DataFrame with columns "ID" and "Score", sorted from best to worst
        and indexed from 1. It has fewer than ``n_top`` rows when fewer
        scores are supplied, instead of failing.
    """
    # Pairs are compared as tuples, so ties on the score fall back to the
    # identifier, exactly as a plain descending sort of the pairs does.
    ranked = sorted(scores, reverse=True)[:max(n_top, 0)]
    positions = range(1, len(ranked) + 1)
    return pd.DataFrame(
        {
            "ID": pd.Series([pair[1] for pair in ranked], index=positions, dtype=object),
            "Score": pd.Series([pair[0] for pair in ranked], index=positions, dtype=object),
        },
        columns=["ID", "Score"],
    )

In [20]:
def FeatureRanking(df_FS, df_matrix, df_categorical, class_, file_name):
    """Rank features by random-forest importance and annotate the best ones.

    Args:
        df_FS: Not used by this function; kept for call compatibility.
        df_matrix: Feature columns used as the classifier input.
        df_categorical: DataFrame that holds the target column.
        class_: Name of the target column in ``df_categorical``.
        file_name: Dataset name; when it contains "Pfam" or "Cog" the ranking
            is merged with the matching description table.

    Returns:
        The table produced by ``print_best_worst``, left-merged with
        ``pfam_map`` or ``cog_map`` when applicable.

    NOTE(review): ``pfam_map`` and ``cog_map`` are not defined in the visible
    part of this notebook; they must exist at module level before a "Pfam" or
    "Cog" dataset is ranked.
    """
    from sklearn.ensemble import RandomForestClassifier

    X = df_matrix
    y = df_categorical[class_]

    # The forest is seeded through `random_state`; seeding Python's `random`
    # module has no effect on scikit-learn, so runs were not repeatable.
    clf = RandomForestClassifier(n_estimators=500, random_state=40)
    clf.fit(X, y)

    # (importance, column name) pairs, in column order.
    scores = list(zip(clf.feature_importances_, X.columns))

    df_scores = print_best_worst(scores)
    if "Pfam" in file_name:
        df_rank = pd.merge(df_scores, pfam_map, how="left", left_on="ID", right_on="index")
    elif "Cog" in file_name:
        df_rank = pd.merge(df_scores, cog_map, how="left", left_on="ID", right_on="COG")
    else:
        df_rank = df_scores.copy()

    return df_rank

## Plot

In [21]:
def Plot(df_FS, df_rank, class_, file_path, file_name):
    """Draw and save one bar plot per ranked feature, comparing Origin groups.

    For every "ID" in ``df_rank`` the mean of that column in ``df_FS`` is
    computed per "Origin" value and drawn as a bar chart, which is saved as
    ``file_path + <ID> + ".png"``.

    Args:
        df_FS: DataFrame with the ranked feature columns and an "Origin"
            column.
        df_rank: Ranking table with an "ID" column, plus "name" (for "Cog"
            datasets) or "PFAM_desc" (for "Pfam" datasets) used in titles.
        class_: Not used by this function; kept for call compatibility.
        file_path: Prefix (normally a directory ending in "/") for the images.
        file_name: Dataset name; selects the y-axis label and title source.

    Returns:
        The result of ``plt.show()``.
    """
    from matplotlib.ticker import PercentFormatter

    file_name = file_name.replace("_clan", "")
    palette = {"Marine": "navy", "Not marine": "forestgreen"}

    # The y-axis label and the title's description column depend only on the
    # dataset name, so they are resolved once instead of once per feature.
    as_percent = "PA" in file_name
    if as_percent:
        y_label = "Mean Presence"
    elif "abund" in file_name:
        y_label = "Relative abundance"
    else:
        y_label = "Mean counts"

    if "Cog" in file_name:
        desc_column = "name"
    elif "Pfam" in file_name:
        desc_column = "PFAM_desc"
    else:
        desc_column = None

    for _, row in df_rank.iterrows():
        feature = row["ID"]
        orf = df_FS[[feature, "Origin"]].groupby(["Origin"]).mean().reset_index()
        orf["Origin"] = orf["Origin"].str.replace("Non_marine", "Not marine", regex=False)

        # catplot creates its own figure, so no plt.figure() call is made
        # here (it would only leave an empty figure behind per feature).
        # `hue` mirrors `x` because seaborn deprecates `palette` without it.
        g = sns.catplot(
            data=orf, kind="bar",
            x='Origin', y=feature,
            hue='Origin', palette=palette,
            dodge=False, legend=False,
            height=4,
            aspect=2,  # width = aspect * height
        )

        if desc_column is None:
            title = feature
        else:
            title = feature + " - " + str(row[desc_column])

        for ax in g.axes.flat:
            ax.set_xlabel('', fontsize=15)
            ax.set_ylabel(y_label, fontsize=15)
            if as_percent:
                # Mean presence is a fraction in [0, 1]; show it as a percentage.
                ax.yaxis.set_major_formatter(PercentFormatter(1))
            ax.set_xticklabels(ax.get_xticklabels(), fontsize=15, ha="center")
            ax.set_title(title, fontsize=16, loc="center", wrap=True)

        name_fig = file_path + str(feature) + ".png"
        g.savefig(name_fig, bbox_inches='tight')

    return plt.show()

In [22]:
def RankTable(file_name, df_rank, df_FS=None, file_path=None):
    """Write a CSV with the per-Origin mean of every ranked feature.

    Args:
        file_name: Dataset name used in the output file name.
        df_rank: Ranking table with an "ID" column naming columns of ``df_FS``.
        df_FS: DataFrame with the ranked feature columns and an "Origin"
            column. When omitted, the module-level ``df_FS`` is used.
        file_path: Prefix (normally a directory ending in "/") for the output
            file. When omitted, the module-level ``file_path`` is used.

    Raises:
        NameError: If an argument is omitted and the module-level variable of
            the same name does not exist.

    The result is written to ``file_path + "Rank_10_" + file_name + ".csv"``:
    one row per ranked feature, one column per Origin value, followed by the
    remaining columns of ``df_rank``.
    """
    if df_FS is None or file_path is None:
        # Fall back to the notebook-level variables for callers that pass
        # only (file_name, df_rank).
        module_vars = globals()
        try:
            if df_FS is None:
                df_FS = module_vars["df_FS"]
            if file_path is None:
                file_path = module_vars["file_path"]
        except KeyError as err:
            raise NameError("name " + repr(err.args[0]) + " is not defined") from None

    output = file_path + "Rank_10_" + str(file_name) + ".csv"

    rows = []
    for feature in df_rank["ID"]:
        orf = df_FS[[feature, "Origin"]].groupby(["Origin"]).mean().reset_index()
        orf["Origin"] = orf["Origin"].str.replace("Non_marine", "Not marine", regex=False)
        # Transposed: a single row labelled with the feature name.
        rows.append(orf.set_index("Origin").T)

    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    df = pd.concat(rows) if rows else pd.DataFrame()
    df = df.reset_index()

    merge = pd.merge(df, df_rank, how="left", left_on="index", right_on="ID")
    merge = merge.drop(columns="ID")
    print(output)
    merge.to_csv(output, index=False)

## Map Merops

In [23]:
# Load the MEROPS identifier table and derive family codes from `desc`.
map_merops = pd.read_csv("merops_ids.csv")

# `desc` starts with a bracketed code such as "[S01.152]"; keep the text
# before the first "]" and drop the opening bracket. `regex=False` makes the
# "[" a literal on every pandas version (its default changed in pandas 2.0).
merops_code = (
    map_merops["desc"]
    .str.split("]", expand=True)[0]
    .str.replace("[", "", regex=False)
)
# Part before the first "." (e.g. "S01") and its first character (e.g. "S").
map_merops["subfamily_merops"] = merops_code.str.split(".", expand=True)[0]
map_merops["family_merops"] = merops_code.str[0]
map_merops.head()

Unnamed: 0,ID,name,organism,desc,source,subfamily_merops,family_merops
0,MER0000002,chymotrypsin A,cattle-type,,,,
1,MER0000004,chymotrypsin B,Bos taurus,[S01.152]#S01A#{peptidase unit: 16-245},CTRB_BOVIN,S01,S
2,MER0000009,proteolytic lectin,Anopheles-type,,,,
3,MER0000012,serine peptidase 6,Drosophila melanogaster,,,,
4,MER0000013,LP12217p,Drosophila melanogaster,,,,


In [24]:
# Preview of the per-Origin means joined with the MEROPS descriptions.
# NOTE(review): `mean` is only assigned inside the implementation loop
# further down, so running this cell before that loop raises the NameError
# recorded in the output below.
df = mean.set_index("Origin").T
df = pd.merge(df, map_merops, how="left", left_index=True, right_on="ID")
df.head()

NameError: name 'mean' is not defined

# Implementation

In [27]:
import glob
import os
# NOTE(review): `files` is bound to the function object itself (it is never
# called) and `l_caz_pep` stays empty; neither is used in this cell.
files = os.listdir
l_caz_pep = []

# Collect the dataset files: names containing "csv" but none of the markers
# that identify summary, metadata-only, combined or identifier tables.
l = []
for file in glob.glob("*"):
    if "csv" not in file:
        continue
    if any(marker in file for marker in ("Security", "Metadata", "All", "ids")):
        continue
    l.append(file)


l

['cazymes_metadata.csv',
 'cazymes_PA_metadata.csv',
 'caz_merops_metadata.csv',
 'caz_merops_PA_metadata.csv',
 'merops_metadata.csv',
 'merops_PA_metadata.csv']

In [28]:
# Preview the first rows of the table returned by LoadParameters().
df = LoadParameters()
df.head()

Unnamed: 0,Dataset,Class,Unnamed: 2,Beginning,After pre-processing,CfsSubsetEval,InfoGainAttributeEval,Parameter selection,Parameters,Evaluation,Training cross-validation,Test,Selected attributes
0,,,Nr Instances,,,,,Best threshold,,F-measure,,,
1,,,Nr Attributes,,,,,Best nr features,,Accuracy,,,
2,cazymes_resumed_metadata,Origin,,1256.0,1256.0,1256.0,1256.0,,0.0,,0.751435,0.723054,"['CBM77', 'PL1', 'PL10_1', '3.2.1.21', 'GT35',..."
3,cazymes_resumed_metadata,Origin,,218.0,214.0,19.0,19.0,,5.0,,75.298805,73.015873,
4,cazymes_resumed_PA_metadata,Origin,,1256.0,1256.0,1256.0,1256.0,,0.0,,0.706193,0.697677,"['CBM77', 'PL1', 'PL10_1', '3.2.1.21', 'GT3', ..."


In [29]:
from pathlib import Path

class_name = "Origin"
class_ = "Origin"
parameters = LoadParameters()

# The CSV outputs go to these folders; create them up front so that
# to_csv() does not fail on a fresh checkout.
for out_dir in ("Mean_byOrigin", "Reduced_datasets"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

for file in l:
    print("========== New dataset ==========")
    print("Dataset: " + str(file) + ", Class: " + str(class_name))

    file_name = file.split("/")[-1].split(".")[0]
    print(file_name)

    output_mean = "Mean_byOrigin/" + str(file_name) + "_mean_Origin.csv"
    output_mean_desc = "Mean_byOrigin/" + str(file_name) + "_mean_Origin_desc.csv"
    file_path = "FS_byOrigin/" + file_name + "/"
    output_FS = "Reduced_datasets/" + str(file_name) + "_FS.csv"

    Path(file_path).mkdir(parents=True, exist_ok=True)

    # Reduced dataset: selected features plus metadata columns.
    df_original, df_FS, dataset = LoadingData(parameters, file, class_name)
    df_FS.to_csv(output_FS, index=False)

    df_matrix, df_categorical = PrepareData(df_FS)

    # Mean of every numeric feature per Origin. `numeric_only=True` keeps the
    # text metadata columns out of the mean, which pandas >= 2.0 would
    # otherwise reject with a TypeError.
    mean = df_FS.groupby(["Origin"]).mean(numeric_only=True)
    mean = mean.reset_index()
    mean.to_csv(output_mean, index=False)

    # Same means with one row per feature, joined with the MEROPS table on
    # the feature identifier.
    df = mean.set_index("Origin").T
    df = pd.merge(df, map_merops, how="left", left_index=True, right_on="ID")
    df.to_csv(output_mean_desc, index=False)

    # Optional ranking / plotting steps, currently disabled:
    #df_rank = FeatureRanking(df_FS, df_matrix, df_categorical, class_, file_name)
    #print("Plot:")
    #Plot(df_FS, df_rank, class_, file_path, file_name)
    #RankTable(file_name, df_rank)


Dataset: cazymes_metadata.csv, Class: Origin
cazymes_metadata
cazymes_metadata
Dataset: cazymes_PA_metadata.csv, Class: Origin
cazymes_PA_metadata
cazymes_PA_metadata
Dataset: caz_merops_metadata.csv, Class: Origin
caz_merops_metadata
caz_merops_metadata
Dataset: caz_merops_PA_metadata.csv, Class: Origin
caz_merops_PA_metadata
caz_merops_PA_metadata
Dataset: merops_metadata.csv, Class: Origin
merops_metadata
merops_metadata
Dataset: merops_PA_metadata.csv, Class: Origin
merops_PA_metadata
merops_PA_metadata
