##### Imports 

In [1]:
import pandas as pd
import csv
import logging
#from functions_py.mephys_funcs import read_file, merge_dataframes, filter_date, drop_cols, drop_nans, \
#create_cond_df, create_container_df, filter_df

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(context = "notebook", style = "ticks", font="verdana") # font_scale = 1.35)

In [2]:
#Fix roi search so you can filter for new mouse regions

In [3]:
#Check the dfs to figure out if accurate

##### Logging Set-up

In [4]:
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

##### Pandas Display Settings

In [5]:
pd.set_option("display.max_colwidth",150) #Expands the number of characters shown in the columns
pd.set_option('display.max_columns', None)

##### Lists/Paths

In [6]:
fields_jem = ["date", "organism_name", "container", "rigOperator", "status", "roi",
              "extraction.postPatch", "extraction.endPipetteR"]
fields_ephys = ["name", "vrest", "ri", "sag", 
                "tau", "upstroke_downstroke_ratio_long_square", "latency", "f_i_curve_slope"]
fields_shiny = ["patch.date", "cell_name", "sample_id", "cell_specimen_project",
                "subclass_label", "topLeaf_label", "broad_class_label", "VISp_cluster",
                "marker_sum_norm_label", "Norm_Marker_Sum.0.4_label", "Tree_call"]

In [7]:
#path_jem = "Z:/Patch-Seq/compiled-jem-data/jem_metadata.csv"
#path_shiny = "//allen/programs/celltypes/workgroups/rnaseqanalysis/shiny/patch_seq/star/mouse_patchseq_VISp_current/mapping.df.with.bp.40.lastmap.csv"

path_jem = "C:/Users/kumar/Documents/GitHub/analysis_projects/csv/jem_metadata_wFAILURE.csv"
path_ephys = "C:/Users/kumar/Documents/GitHub/analysis_projects/csv/ephys_mIVSCC_MET.csv"
path_shiny = "C:/Users/kumar/Documents/GitHub/analysis_projects/csv/Mouse_VISp_ctx_shiny.csv"

In [8]:
drop_cols_list = ["name", "patched_cell_container", "cell_name_label", "sample_id"]
drop_nans_list = ["date", "name", "rigOperator"]

In [9]:
r_users = ["kristenh", "lindsayn", "ramr", "katherineb", "jessicat"]

In [10]:
b_colors={"RSP": "#a1d99b", "VISp": "#9ecae1"}
s_colors={"RSP": "#41ab5d", "VISp": "#4292c6"}

##### Functions

In [11]:
def read_file(path, fields=None):
    """Reads file in as pandas dataframe by using pd.read_csv
    Args:
        path: path of file location
    Return:
        df: a pandas dataframe
    """
    global df
    df = pd.read_csv(path, usecols=fields)
    LOGGER.info("Read file in as a pandas dataframe")
    return df

In [12]:
def merge_dataframes(left_df, right_df, left_col, right_col, join_how):
    """Merges two dataframes together into one dataframe
    Args:
        left_df: a pandas dataframe on the left
        right_df: a pandas dataframe on the right
        left_col: a column from the left dataframe
        right_col: a column from the right dataframe
    Return:
        merge_df: a merged pandas dataframe
    """
    merge_df = pd.merge(left = left_df,
                        right = right_df,
                        left_on = left_col,
                        right_on = right_col, 
                        how = join_how)
    LOGGER.info("Merged two pandas dataframe into one dataframe")
    return merge_df

In [13]:
def drop_cols(df, drop_col):
    """Drop unnessary columns from dataframe
    Args:
        df: a pandas dataframe
        drop_col(lst): column names to drop from dataframe
    Return:
        df: a pandas dataframe without certain columns
    """
    LOGGER.info("Dropped columns: %s", drop_col)
    df.drop(columns=drop_col, inplace=True)
    return df

In [14]:
def drop_nans(df, drop_na_col):
    """Drop Nans from selected columns
    Args:
        df: a pandas dataframe
        drop_na_col(lst): column names to drop NaNs from 
    Return:
        df: a pandas dataframe without NaNs in certain columns
    """
    LOGGER.info("Dropped NaNs from these columns: %s", drop_na_col)
    df.dropna(subset=drop_na_col, inplace=True)
    return df

In [15]:
def filter_date_range(df, date_col):
    """Filters and sorts the date column by specific date range in the dataframe.
    Args:
        df: a pandas dataframe
        date_col(string): column name with date information
    Returns:
        df: a pandas dataframe with a filtered date range
    """
    start_date = "2019-01-01"
    end_date = "2020-12-31"

    mask = (df[date_col] > start_date) & (df[date_col] <= end_date)
    df = df.loc[mask]
    df.sort_values([date_col], inplace=True)
    LOGGER.info("Filtered dataframe to only display 2019-2020 data")
    return df

In [16]:
def filter_df(df, fil_col, fil_val):
    """Creates a dataframe based on values from a single column
     Args:
        df: a pandas dataframe
        fil_col(string): column name from dataframe
        fil_val(string): values to restrict dataframe by
    Return:
        df: a pandas dataframe created by values from a single column
    """
    df = df[df[fil_col] == fil_val]
    return df

In [17]:
def create_container_df(df, container_col):
    """Creates container label based on original container column
    Args:
        df: a pandas dataframe
        container_col: a column name with the container label information
    Return:
        df: a pandas dataframe with a new column with container labels
    """
    df["container_label"] = df[container_col].str[0:2]
    LOGGER.info("Created a container_label column to show(ex.'PA')")
    return df

In [18]:
def create_cond_df(df, col, val):
    """Creates a dataframe based on values from a single column
     Args:
        df: a pandas dataframe
        col(string): column name from dataframe
        val(list): values to restrict dataframe by
    Return:
        df: a pandas dataframe created by values from a single column
    """
    df = df[df[col].str.contains("|".join(val))]
    LOGGER.info("Created a conditional dataframe based on a list of values")
    return df

##### Main

In [19]:
jem = read_file(path_jem, fields_jem)
ephys = read_file(path_ephys, fields_ephys)
shiny = read_file(path_shiny, fields_shiny)

INFO:__main__:Read file in as a pandas dataframe
INFO:__main__:Read file in as a pandas dataframe
INFO:__main__:Read file in as a pandas dataframe


In [20]:
shiny

Unnamed: 0,sample_id,cell_name,patch.date,cell_specimen_project,Tree_call,subclass_label,broad_class_label,topLeaf_label,marker_sum_norm_label,Norm_Marker_Sum.0.4_label,VISp_cluster
0,P6S4_170808_252_A01,Vipr2-IRES2-Cre;Slc32a1-T2A-FlpO;Ai65-338917.03.01.02,8/8/2017,T301x,Core,Meis2,GABAergic,2_Meis2,0.380674,False,Meis2 Adamts19
1,P2S4_171129_055_A01,Crh-IRES-Cre_ZJH;Sst-IRES-FlpO;Ai65-362357.04.01.02,11/29/2017,mIVSCC-MET,Core,Meis2,GABAergic,2_Meis2,0.717450,True,Meis2 Adamts19
2,P1S4_170526_001_A01,Slc32a1-IRES-Cre;Ai14-321584.04.01.01,5/26/2017,T301x,Core,Lamp5,GABAergic,3_Lamp5 Lhx6,0.506820,True,n76
3,P1S4_170621_009_A01,Slc32a1-IRES-Cre;Ai14-326815.04.02.05,6/21/2017,T301x,Core,Lamp5,GABAergic,3_Lamp5 Lhx6,0.904325,True,Lamp5 Lhx6
4,P1S4_190923_004_A01,Sncg-IRES2-FlpO-neo;Ai65F-487894.09.01.01,2019-09-23T13:25:38-07:00,mIVSCC-METx,Core,Lamp5,GABAergic,3_Lamp5 Lhx6,0.853235,True,Lamp5 Lhx6
...,...,...,...,...,...,...,...,...,...,...,...
10669,PXS4_180425_552_A01,Sst-IRES-Cre;Ai14-387688.09.06.01.02,4/25/2018,mMPATCH,PoorQ,Pvalb,GABAergic,102_Sst*,0.255656,False,n3
10670,PXS4_180425_556_A01,Sst-IRES-Cre;Ai14-387688.09.06.01.07,4/25/2018,mMPATCH,PoorQ,CR,Glutamatergic,1_CR,0.319404,False,n3
10671,PXS4_180606_154_A01,Ntsr1-Cre_GN220;Ai140;Sst-IRES-FlpO-394919-Ai65F.09.06.01.05,6/6/2018,mMPATCH,PoorQ,Sst,GABAergic,68_Sst,0.081369,False,n3
10672,PXS4_180724_903_A01,Vip-IRES-Cre;Ai14-403931.02.01.02,2018-07-24 11:59:12 -07:00,mIVSCC-MET,PoorQ,L6b CTX,Glutamatergic,338_L6b CTX,0.165219,False,n3


In [21]:
jem = filter_df(jem, "status", "SUCCESS")
jem = filter_date_range(jem, "date")
shiny = filter_date_range(shiny, "patch.date")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
INFO:__main__:Filtered dataframe to only display 2019-2020 data
INFO:__main__:Filtered dataframe to only display 2019-2020 data


In [22]:
merge_sj = merge_dataframes(shiny, jem, "sample_id", "container", "left")

INFO:__main__:Merged two pandas dataframe into one dataframe


In [23]:
#merge_sj = filter_date_range(merge_sj)

In [24]:
merge_all = merge_dataframes(merge_sj, ephys, "cell_name", "name", "inner")

INFO:__main__:Merged two pandas dataframe into one dataframe


In [25]:
#merge_all = filter_date_range(merge_all)

In [26]:
merge_all = create_container_df(merge_all, "container")

INFO:__main__:Created a container_label column to show(ex.'PA')


In [27]:
merge_all.container_label.unique()

array(['P1', 'P8', 'PB', 'PF', 'PA', 'PE', 'P2'], dtype=object)

In [28]:
merge_all

Unnamed: 0,sample_id,cell_name,patch.date,cell_specimen_project,Tree_call,subclass_label,broad_class_label,topLeaf_label,marker_sum_norm_label,Norm_Marker_Sum.0.4_label,VISp_cluster,container,date,extraction.endPipetteR,extraction.postPatch,rigOperator,roi,status,organism_name,name,vrest,ri,sag,tau,upstroke_downstroke_ratio_long_square,latency,f_i_curve_slope,container_label
0,P1S4_190103_002_A01,Vip-IRES-Cre;Ai14-433569.03.01.01,2019-01-03T11:13:33-08:00,mIVSCC-MET,Core,Vip,GABAergic,43_Vip,0.966119,True,Vip Crispld2 Kcne4,P1S4_190103_002_A01,2019-01-03 11:13:33 -0800,11400.0,nucleus_present,kristenh,VISp2/3,SUCCESS,Mouse,Vip-IRES-Cre;Ai14-433569.03.01.01,-54.746689,266.968727,0.233464,16.772145,2.077782,0.02914,0.420388,P1
1,P1S4_190103_003_A01,Vip-IRES-Cre;Ai14-433569.03.01.02,2019-01-03T11:13:33-08:00,mIVSCC-MET,Core,Vip,GABAergic,45_Vip,0.966677,True,Vip Pygm C1ql1,P1S4_190103_003_A01,2019-01-03 11:13:33 -0800,4000.0,nucleus_present,kristenh,VISp2/3,SUCCESS,Mouse,Vip-IRES-Cre;Ai14-433569.03.01.02,-71.890958,245.312572,0.030711,14.143285,3.756587,0.02856,0.148936,P1
2,P8S4_190103_351_A01,Vip-IRES-Cre;Ai14-433569.04.02.01,2019-01-03T11:27:09-08:00,mIVSCC-MET,Core,Vip,GABAergic,49_Vip,0.954358,True,Vip Lmo1 Myl1,P8S4_190103_351_A01,2019-01-03 11:27:09 -0800,3000.0,nucleus_present,lindsayn,VISp2/3,SUCCESS,Mouse,Vip-IRES-Cre;Ai14-433569.04.02.01,-65.221985,210.875049,0.057213,10.071189,3.611655,0.01336,0.057895,P8
3,PBS4_190104_501_A01,Sst-IRES-Cre;Ai14-434645.05.01.01,2019-01-04 10:15:51 -08:00,mIVSCC-MET,PoorQ,L6 Car3,Glutamatergic,260_L6 Car3,0.534642,True,n15,PBS4_190104_501_A01,2019-01-04 10:15:51 -0800,2000.0,nucleus_present,dijonh,VISp2/3,SUCCESS,Mouse,Sst-IRES-Cre;Ai14-434645.05.01.01,-64.367810,283.593744,0.130827,24.642843,1.766052,0.04120,0.396552,PB
4,PBS4_190104_502_A01,Sst-IRES-Cre;Ai14-434645.05.01.02,2019-01-04 10:15:51 -08:00,mIVSCC-MET,PoorQ,Sst,GABAergic,68_Sst,0.979294,True,n91,PBS4_190104_502_A01,2019-01-04 10:15:51 -0800,1100.0,nucleus_present,dijonh,VISp4,SUCCESS,Mouse,Sst-IRES-Cre;Ai14-434645.05.01.02,-60.750129,207.499996,0.095045,21.534646,2.584531,0.03168,0.285000,PB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2782,P8S4_200203_355_A01,Penk-IRES2-Cre-neo;Slc17a6-IRES2-FlpO-508094-Ai65.08.09.02,2020-02-03T13:42:56-08:00,mIVSCC-MET,PoorQ,Sncg,GABAergic,24_Ndnf HPF,0.533483,True,n59,P8S4_200203_355_A01,2020-02-03 13:42:56 -0800,2000.0,nucleus_present,lindsayn,HYVMH,SUCCESS,Mouse,Penk-IRES2-Cre-neo;Slc17a6-IRES2-FlpO-508094-Ai65.08.09.02,-70.219828,307.593733,0.079629,32.108982,3.848269,0.03590,0.225581,P8
2783,P8S4_200203_354_A01,Penk-IRES2-Cre-neo;Slc17a6-IRES2-FlpO-508094-Ai65.08.09.01,2020-02-03T13:42:56-08:00,mIVSCC-MET,PoorQ,L5 ET CTX,Glutamatergic,255_L5 ET RSP-ACA,0.598285,True,n4,P8S4_200203_354_A01,2020-02-03 13:42:56 -0800,2000.0,nucleus_present,lindsayn,HYVMH,SUCCESS,Mouse,Penk-IRES2-Cre-neo;Slc17a6-IRES2-FlpO-508094-Ai65.08.09.01,-65.118483,350.093812,0.118779,23.240435,2.928350,0.02342,0.503458,P8
2784,P8S4_200203_356_A01,Penk-IRES2-Cre-neo;Slc17a6-IRES2-FlpO-508094-Ai65.08.09.03,2020-02-03T13:42:56-08:00,mIVSCC-MET,PoorQ,L5 ET CTX,Glutamatergic,252_L5 ET CTX,0.625656,True,n59,P8S4_200203_356_A01,2020-02-03 13:42:56 -0800,50.0,nucleus_present,lindsayn,HYVMH,SUCCESS,Mouse,Penk-IRES2-Cre-neo;Slc17a6-IRES2-FlpO-508094-Ai65.08.09.03,-65.103513,342.687517,0.024383,19.791295,3.384998,0.01894,0.508028,P8
2785,P8S4_200203_358_A01,Oxtr-T2A-Cre;Ai14-508855.06.02.02,2020-02-03T15:14:30-08:00,mIVSCC-MET,Core,Sst,GABAergic,80_Sst,1.023069,True,n91,P8S4_200203_358_A01,2020-02-03 15:14:30 -0800,1800.0,nucleus_present,lindsayn,RSPv5,SUCCESS,Mouse,Oxtr-T2A-Cre;Ai14-508855.06.02.02,-63.742029,161.749989,0.142305,14.693643,1.586746,0.01774,0.041379,P8


In [29]:
csv_path = "C:/Users/kumar/Documents/Github/analysis_projects/csv/"
excel_path = "C:/Users/kumar/Documents/Github/analysis_projects/excel/"
plot_path = "C:/Users/kumar/Documents/Github/analysis_projects/plot/"

In [30]:
writer = pd.ExcelWriter(excel_path + "mephys_final_home.xlsx")
merge_all.to_excel(writer, "All", freeze_panes=(1,0))
merge_sj.to_excel(writer, "Shiny_Jem", freeze_panes=(1,0))
shiny.to_excel(writer, "Shiny", freeze_panes=(1,0))
jem.to_excel(writer, "Jem", freeze_panes=(1,0))
ephys.to_excel(writer, "Ephys", freeze_panes=(1,0))
writer.save()

In [31]:
rsp = create_cond_df(merge_all, "roi_major", ["RSPd", "RSPv"])
ssp = create_cond_df(merge_all, "roi_major", ["SSp"])
orb = create_cond_df(merge_all, "roi_major", ["ORB"])
ctxsp = create_cond_df(merge_all, "roi_major", ["CTXsp"])
mo = create_cond_df(merge_all, "roi_major", ["MOp", "MOs"])
visp = create_cond_df(merge_all, "roi_major", ["VISp"])

KeyError: 'roi_major'