##### Imports 

In [1]:
import pandas as pd
import numpy as np
import csv
import logging
#from functions_py.mephys_funcs import read_file, merge_dataframes, filter_date, drop_cols, drop_nans, \
#create_cond_df, create_container_df, filter_df

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(context = "notebook", style = "ticks", font="verdana") # font_scale = 1.35)

##### Logging Set-up

In [2]:
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

##### Pandas Display Settings

In [3]:
pd.set_option("display.max_colwidth",150) #Expands the number of characters shown in the columns
pd.set_option('display.max_columns', None)

##### Lists/Paths

In [4]:
fields_jem = ["date", "organism_name", "container", "rigOperator", "status", "roi",
              "extraction.postPatch", "extraction.endPipetteR"]
fields_ephys = ["name", "vrest", "ri", "sag", 
                "tau", "upstroke_downstroke_ratio_long_square", "latency", "f_i_curve_slope"]
fields_shiny = ["patch.date", "cell_name", "sample_id", "cell_specimen_project",
                "subclass_label", "topLeaf_label", "broad_class_label", "VISp_cluster",
                "marker_sum_norm_label", "Norm_Marker_Sum.0.4_label", "Tree_call"]

In [5]:
#path_jem = "Z:/Patch-Seq/compiled-jem-data/jem_metadata.csv"
#path_shiny = "//allen/programs/celltypes/workgroups/rnaseqanalysis/shiny/patch_seq/star/mouse_patchseq_VISp_current/mapping.df.with.bp.40.lastmap.csv"

path_jem = "C:/Users/kumar/Documents/GitHub/analysis_projects/csv/jem_metadata_wFAILURE.csv"
path_ephys = "C:/Users/kumar/Documents/GitHub/analysis_projects/csv/ephys_mIVSCC_MET.csv"
path_shiny = "C:/Users/kumar/Documents/GitHub/analysis_projects/csv/Mouse_VISp_ctx_shiny.csv"

In [6]:
r_users = ["kristenh", "lindsayn", "ramr", "katherineb", "jessicat"]

In [7]:
b_colors={"RSP": "#a1d99b", "VISp": "#9ecae1"}
s_colors={"RSP": "#41ab5d", "VISp": "#4292c6"}

##### Functions

In [8]:
def read_file(path, fields=None):
    """Reads file in as pandas dataframe by using pd.read_csv
    Args:
        path: path of file location
    Return:
        df: a pandas dataframe
    """
    global df
    df = pd.read_csv(path, usecols=fields)
    LOGGER.info("Read file in as a pandas dataframe")
    return df

In [9]:
def merge_dataframes(left_df, right_df, left_col, right_col, join_how):
    """Merges two dataframes together into one dataframe
    Args:
        left_df: a pandas dataframe on the left
        right_df: a pandas dataframe on the right
        left_col: a column from the left dataframe
        right_col: a column from the right dataframe
    Return:
        merge_df: a merged pandas dataframe
    """
    merge_df = pd.merge(left = left_df,
                        right = right_df,
                        left_on = left_col,
                        right_on = right_col, 
                        how = join_how)
    LOGGER.info("Merged two pandas dataframe into one dataframe")
    return merge_df

In [10]:
def drop_cols(df, drop_col):
    """Drop unnessary columns from dataframe
    Args:
        df: a pandas dataframe
        drop_col(lst): column names to drop from dataframe
    Return:
        df: a pandas dataframe without certain columns
    """
    LOGGER.info("Dropped columns: %s", drop_col)
    df.drop(columns=drop_col, inplace=True)
    return df

In [11]:
def drop_nans(df, drop_na_col):
    """Drop Nans from selected columns
    Args:
        df: a pandas dataframe
        drop_na_col(lst): column names to drop NaNs from 
    Return:
        df: a pandas dataframe without NaNs in certain columns
    """
    LOGGER.info("Dropped NaNs from these columns: %s", drop_na_col)
    df.dropna(subset=drop_na_col, inplace=True)
    return df

In [12]:
def filter_date_range(df, date_col):
    """Filters and sorts the date column by specific date range in the dataframe.
    Args:
        df: a pandas dataframe
        date_col(string): column name with date information
    Returns:
        df: a pandas dataframe with a filtered date range
    """
    start_date = "2019-01-01"
    end_date = "2020-12-31"

    mask = (df[date_col] > start_date) & (df[date_col] <= end_date)
    df = df.loc[mask]
    df.sort_values([date_col], inplace=True)
    LOGGER.info("Filtered dataframe to only display 2019-2020 data")
    return df

In [13]:
def filter_df(df, fil_col, fil_val):
    """Creates a dataframe based on values from a single column
     Args:
        df: a pandas dataframe
        fil_col(string): column name from dataframe
        fil_val(string): values to restrict dataframe by
    Return:
        df: a pandas dataframe created by values from a single column
    """
    df = df[df[fil_col] == fil_val]
    return df

In [14]:
def create_container_col(df, col_label):
    """Creates container label based on rig operator names
    Args:
        df: a pandas dataframe
        col_label(string): a column name with the container label information
    Return:
        df: a pandas dataframe with a new column with container labels
    """
    rig_user_dictionary ={"kristenh" : "P1", "rustym": "P2", 
                          "lindsayn": "P8", 
                          "ramr": "PA", 
                          "dijonh": "PB",
                          "katherineb": "PE", 
                          "jessicat": "PF"} 

    df["patch_container_label"] = df[col_label].map(rig_user_dictionary)
    LOGGER.info("Created a patch_container_label column to show(ex.'PA')")
    return df

In [15]:
def create_cond_df(df, col, val):
    """Creates a dataframe based on values from a single column
     Args:
        df: a pandas dataframe
        col(string): column name from dataframe
        val(list): values to restrict dataframe by
    Return:
        df: a pandas dataframe created by values from a single column
    """
    df = df[df[col].str.contains("|".join(val))]
    LOGGER.info("Created a conditional dataframe based on a list of values")
    return df

##### Main

In [16]:
jem = read_file(path_jem, fields_jem) #20843 rows
ephys = read_file(path_ephys, fields_ephys) #8541 rows
shiny = read_file(path_shiny, fields_shiny) #10674 rows

jem = filter_df(jem, "status", "SUCCESS") #13325 rows
jem = filter_date_range(jem, "date") #6335 rows
shiny = filter_date_range(shiny, "patch.date") #3050 rows

merge_sj = merge_dataframes(shiny, jem, "sample_id", "container", "inner") #3051 rows (even if how=left)
merge_all = merge_dataframes(merge_sj, ephys, "cell_name", "name", "inner") #2787 rows

merge_all = create_container_col(merge_all, "rigOperator")

drop_nans_list = ["date"]
merge_all = drop_nans(merge_all, drop_nans_list)

drop_cols_list = ["sample_id", "patch.date", "status", "name", "cell_specimen_project", "organism_name"]
merge_all = drop_cols(merge_all, drop_cols_list)

merge_all.set_index("date", inplace=True)


merge_all["region"] = pd.np.where(merge_all.roi.str.contains("RSP"), "RSP", 
                                  pd.np.where(merge_all.roi.str.contains("VISp"), "VISp", 
                                  pd.np.where(merge_all.roi.str.contains("TEa"), "TEa",
                                  pd.np.where(merge_all.roi.str.contains("CTXsp"), "CLA",
                                  pd.np.where(merge_all.roi.str.contains("MO"), "MO",
                                  pd.np.where(merge_all.roi.str.contains("ORB"), "ORB",
                                  pd.np.where(merge_all.roi.str.contains("SSp"), "SSp",
                                  pd.np.where(merge_all.roi.str.contains("HY"), "HY", "Unknown"))))))))

rsp = create_cond_df(merge_all, "region", ["RSP"])
ssp = create_cond_df(merge_all, "region", ["SSp"])
orb = create_cond_df(merge_all, "region", ["ORB"])
ctxsp = create_cond_df(merge_all, "region", ["CLA"])
mo = create_cond_df(merge_all, "region", ["MO"])
visp = create_cond_df(merge_all, "region", ["VISp"])

INFO:__main__:Read file in as a pandas dataframe
INFO:__main__:Read file in as a pandas dataframe
INFO:__main__:Read file in as a pandas dataframe
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
INFO:__main__:Filtered dataframe to only display 2019-2020 data
INFO:__main__:Filtered dataframe to only display 2019-2020 data
INFO:__main__:Merged two pandas dataframe into one dataframe
INFO:__main__:Merged two pandas dataframe into one dataframe
INFO:__main__:Created a patch_container_label column to show(ex.'PA')
INFO:__main__:Dropped NaNs from these columns: ['date']
INFO:__main__:Dropped columns: ['sample_id', 'patch.date', 'status', 'name', 'cell_specimen_project', 'organism_name']
INFO:__main__:Created a conditional dataframe based on a list of values
INFO:__main__:Created a conditional dataframe based on a list of values
INFO:_

##### Writing to Excel File

In [17]:
csv_path = "C:/Users/kumar/Documents/Github/analysis_projects/csv/"
excel_path = "C:/Users/kumar/Documents/Github/analysis_projects/excel/"
plot_path = "C:/Users/kumar/Documents/Github/analysis_projects/plot/"

In [18]:
writer = pd.ExcelWriter(excel_path + "mephys_final_home.xlsx")
merge_all.to_excel(writer, "All", freeze_panes=(1,0))
merge_sj.to_excel(writer, "Shiny_Jem", freeze_panes=(1,0))
shiny.to_excel(writer, "Shiny", freeze_panes=(1,0))
jem.to_excel(writer, "Jem", freeze_panes=(1,0))
ephys.to_excel(writer, "Ephys", freeze_panes=(1,0))
writer.save()

##### Create Region Specific dfs

In [19]:
merge_all

Unnamed: 0_level_0,cell_name,Tree_call,subclass_label,broad_class_label,topLeaf_label,marker_sum_norm_label,Norm_Marker_Sum.0.4_label,VISp_cluster,container,extraction.endPipetteR,extraction.postPatch,rigOperator,roi,vrest,ri,sag,tau,upstroke_downstroke_ratio_long_square,latency,f_i_curve_slope,patch_container_label,region
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2019-01-03 11:13:33 -0800,Vip-IRES-Cre;Ai14-433569.03.01.01,Core,Vip,GABAergic,43_Vip,0.966119,True,Vip Crispld2 Kcne4,P1S4_190103_002_A01,11400.0,nucleus_present,kristenh,VISp2/3,-54.746689,266.968727,0.233464,16.772145,2.077782,0.02914,0.420388,P1,VISp
2019-01-03 11:13:33 -0800,Vip-IRES-Cre;Ai14-433569.03.01.02,Core,Vip,GABAergic,45_Vip,0.966677,True,Vip Pygm C1ql1,P1S4_190103_003_A01,4000.0,nucleus_present,kristenh,VISp2/3,-71.890958,245.312572,0.030711,14.143285,3.756587,0.02856,0.148936,P1,VISp
2019-01-03 11:27:09 -0800,Vip-IRES-Cre;Ai14-433569.04.02.01,Core,Vip,GABAergic,49_Vip,0.954358,True,Vip Lmo1 Myl1,P8S4_190103_351_A01,3000.0,nucleus_present,lindsayn,VISp2/3,-65.221985,210.875049,0.057213,10.071189,3.611655,0.01336,0.057895,P8,VISp
2019-01-04 10:15:51 -0800,Sst-IRES-Cre;Ai14-434645.05.01.01,PoorQ,L6 Car3,Glutamatergic,260_L6 Car3,0.534642,True,n15,PBS4_190104_501_A01,2000.0,nucleus_present,dijonh,VISp2/3,-64.367810,283.593744,0.130827,24.642843,1.766052,0.04120,0.396552,PB,VISp
2019-01-04 10:15:51 -0800,Sst-IRES-Cre;Ai14-434645.05.01.02,PoorQ,Sst,GABAergic,68_Sst,0.979294,True,n91,PBS4_190104_502_A01,1100.0,nucleus_present,dijonh,VISp4,-60.750129,207.499996,0.095045,21.534646,2.584531,0.03168,0.285000,PB,VISp
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-02-03 13:42:56 -0800,Penk-IRES2-Cre-neo;Slc17a6-IRES2-FlpO-508094-Ai65.08.09.02,PoorQ,Sncg,GABAergic,24_Ndnf HPF,0.533483,True,n59,P8S4_200203_355_A01,2000.0,nucleus_present,lindsayn,HYVMH,-70.219828,307.593733,0.079629,32.108982,3.848269,0.03590,0.225581,P8,HY
2020-02-03 13:42:56 -0800,Penk-IRES2-Cre-neo;Slc17a6-IRES2-FlpO-508094-Ai65.08.09.01,PoorQ,L5 ET CTX,Glutamatergic,255_L5 ET RSP-ACA,0.598285,True,n4,P8S4_200203_354_A01,2000.0,nucleus_present,lindsayn,HYVMH,-65.118483,350.093812,0.118779,23.240435,2.928350,0.02342,0.503458,P8,HY
2020-02-03 13:42:56 -0800,Penk-IRES2-Cre-neo;Slc17a6-IRES2-FlpO-508094-Ai65.08.09.03,PoorQ,L5 ET CTX,Glutamatergic,252_L5 ET CTX,0.625656,True,n59,P8S4_200203_356_A01,50.0,nucleus_present,lindsayn,HYVMH,-65.103513,342.687517,0.024383,19.791295,3.384998,0.01894,0.508028,P8,HY
2020-02-03 15:14:30 -0800,Oxtr-T2A-Cre;Ai14-508855.06.02.02,Core,Sst,GABAergic,80_Sst,1.023069,True,n91,P8S4_200203_358_A01,1800.0,nucleus_present,lindsayn,RSPv5,-63.742029,161.749989,0.142305,14.693643,1.586746,0.01774,0.041379,P8,RSP
