##### Imports 

In [1]:
import pandas as pd
import csv
import logging
#from functions_py.mephys_funcs import read_file, merge_dataframes, filter_date, drop_cols, drop_nans, \
#create_cond_df, create_container_df, filter_df

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(context = "notebook", style = "ticks", font="verdana") # font_scale = 1.35)

##### Logging Set-up

In [4]:
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

##### Pandas Display Settings

In [5]:
pd.set_option("display.max_colwidth",150) #Expands the number of characters shown in the columns
pd.set_option('display.max_columns', None)

##### Lists/Paths

In [6]:
fields_jem = ["date", "organism_name", "container", "rigOperator", "status", "roi",
              "extraction.postPatch", "extraction.endPipetteR"]
fields_ephys = ["name", "vrest", "ri", "sag", 
                "tau", "upstroke_downstroke_ratio_long_square", "latency", "f_i_curve_slope"]
fields_shiny = ["patch.date", "cell_name", "sample_id", "cell_specimen_project",
                "subclass_label", "topLeaf_label", "broad_class_label", "VISp_cluster",
                "marker_sum_norm_label", "Norm_Marker_Sum.0.4_label", "Tree_call"]

In [7]:
#path_jem = "Z:/Patch-Seq/compiled-jem-data/jem_metadata.csv"
#path_shiny = "//allen/programs/celltypes/workgroups/rnaseqanalysis/shiny/patch_seq/star/mouse_patchseq_VISp_current/mapping.df.with.bp.40.lastmap.csv"

path_jem = "C:/Users/kumar/Documents/GitHub/analysis_projects/csv/jem_metadata_wFAILURE.csv"
path_ephys = "C:/Users/kumar/Documents/GitHub/analysis_projects/csv/ephys_mIVSCC_MET.csv"
path_shiny = "C:/Users/kumar/Documents/GitHub/analysis_projects/csv/Mouse_VISp_ctx_shiny.csv"

In [8]:
r_users = ["kristenh", "lindsayn", "ramr", "katherineb", "jessicat"]

In [9]:
b_colors={"RSP": "#a1d99b", "VISp": "#9ecae1"}
s_colors={"RSP": "#41ab5d", "VISp": "#4292c6"}

##### Functions

In [10]:
def read_file(path, fields=None):
    """Reads file in as pandas dataframe by using pd.read_csv
    Args:
        path: path of file location
    Return:
        df: a pandas dataframe
    """
    global df
    df = pd.read_csv(path, usecols=fields)
    LOGGER.info("Read file in as a pandas dataframe")
    return df

In [11]:
def merge_dataframes(left_df, right_df, left_col, right_col, join_how):
    """Merges two dataframes together into one dataframe
    Args:
        left_df: a pandas dataframe on the left
        right_df: a pandas dataframe on the right
        left_col: a column from the left dataframe
        right_col: a column from the right dataframe
    Return:
        merge_df: a merged pandas dataframe
    """
    merge_df = pd.merge(left = left_df,
                        right = right_df,
                        left_on = left_col,
                        right_on = right_col, 
                        how = join_how)
    LOGGER.info("Merged two pandas dataframe into one dataframe")
    return merge_df

In [12]:
def drop_cols(df, drop_col):
    """Drop unnessary columns from dataframe
    Args:
        df: a pandas dataframe
        drop_col(lst): column names to drop from dataframe
    Return:
        df: a pandas dataframe without certain columns
    """
    LOGGER.info("Dropped columns: %s", drop_col)
    df.drop(columns=drop_col, inplace=True)
    return df

In [13]:
def drop_nans(df, drop_na_col):
    """Drop Nans from selected columns
    Args:
        df: a pandas dataframe
        drop_na_col(lst): column names to drop NaNs from 
    Return:
        df: a pandas dataframe without NaNs in certain columns
    """
    LOGGER.info("Dropped NaNs from these columns: %s", drop_na_col)
    df.dropna(subset=drop_na_col, inplace=True)
    return df

In [14]:
def filter_date_range(df, date_col):
    """Filters and sorts the date column by specific date range in the dataframe.
    Args:
        df: a pandas dataframe
        date_col(string): column name with date information
    Returns:
        df: a pandas dataframe with a filtered date range
    """
    start_date = "2019-01-01"
    end_date = "2020-12-31"

    mask = (df[date_col] > start_date) & (df[date_col] <= end_date)
    df = df.loc[mask]
    df.sort_values([date_col], inplace=True)
    LOGGER.info("Filtered dataframe to only display 2019-2020 data")
    return df

In [15]:
def filter_df(df, fil_col, fil_val):
    """Creates a dataframe based on values from a single column
     Args:
        df: a pandas dataframe
        fil_col(string): column name from dataframe
        fil_val(string): values to restrict dataframe by
    Return:
        df: a pandas dataframe created by values from a single column
    """
    df = df[df[fil_col] == fil_val]
    return df

In [16]:
def create_container_df(df, container_col):
    """Creates container label based on original container column
    Args:
        df: a pandas dataframe
        container_col: a column name with the container label information
    Return:
        df: a pandas dataframe with a new column with container labels
    """
    df["container_label"] = df[container_col].str[0:2]
    LOGGER.info("Created a container_label column to show(ex.'PA')")
    return df

In [17]:
def create_cond_df(df, col, val):
    """Creates a dataframe based on values from a single column
     Args:
        df: a pandas dataframe
        col(string): column name from dataframe
        val(list): values to restrict dataframe by
    Return:
        df: a pandas dataframe created by values from a single column
    """
    df = df[df[col].str.contains("|".join(val))]
    LOGGER.info("Created a conditional dataframe based on a list of values")
    return df

##### Main

In [20]:
jem = read_file(path_jem, fields_jem) #20843 rows
ephys = read_file(path_ephys, fields_ephys) #8541 rows
shiny = read_file(path_shiny, fields_shiny) #10674 rows

jem = filter_df(jem, "status", "SUCCESS") #13325 rows
jem = filter_date_range(jem, "date") #6335 rows
shiny = filter_date_range(shiny, "patch.date") #3050 rows

merge_sj = merge_dataframes(shiny, jem, "sample_id", "container", "inner") #3051 rows (even if how=left)
merge_all = merge_dataframes(merge_sj, ephys, "cell_name", "name", "inner") #2787 rows

merge_all = create_container_df(merge_all, "container")

drop_nans_list = ["date"]
merge_all = drop_nans(merge_all, drop_nans_list)

drop_cols_list = ["sample_id", "patch.date", "status", "name", "cell_specimen_project", "organism_name"]
merge_all = drop_cols(merge_all, drop_cols_list)

merge_all.set_index("date", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
INFO:__main__:Filtered dataframe to only display 2019-2020 data
INFO:__main__:Filtered dataframe to only display 2019-2020 data


##### Writing to Excel File

In [29]:
csv_path = "C:/Users/kumar/Documents/Github/analysis_projects/csv/"
excel_path = "C:/Users/kumar/Documents/Github/analysis_projects/excel/"
plot_path = "C:/Users/kumar/Documents/Github/analysis_projects/plot/"

In [30]:
writer = pd.ExcelWriter(excel_path + "mephys_final_home.xlsx")
merge_all.to_excel(writer, "All", freeze_panes=(1,0))
merge_sj.to_excel(writer, "Shiny_Jem", freeze_panes=(1,0))
shiny.to_excel(writer, "Shiny", freeze_panes=(1,0))
jem.to_excel(writer, "Jem", freeze_panes=(1,0))
ephys.to_excel(writer, "Ephys", freeze_panes=(1,0))
writer.save()

##### Create Region Specific dfs

In [31]:
rsp = create_cond_df(merge_all, "roi_major", ["RSPd", "RSPv"])
ssp = create_cond_df(merge_all, "roi_major", ["SSp"])
orb = create_cond_df(merge_all, "roi_major", ["ORB"])
ctxsp = create_cond_df(merge_all, "roi_major", ["CTXsp"])
mo = create_cond_df(merge_all, "roi_major", ["MOp", "MOs"])
visp = create_cond_df(merge_all, "roi_major", ["VISp"])

KeyError: 'roi_major'

In [36]:
for x in merge_all["roi"]:
    if merge_all["roi"].str.contains("VISp"):
        x = "VISp"
    else:
        x = "Other"
    

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# Create the dictionary 
rig_user_dictionary ={'rig_user_name' : "P1", 'Poetry' : 800, 'Comedy' : 1200} 
  
# Add a new column named 'Price' 
df["container_label"] = df["rigOperator"].map(rig_user_dictionary) 

In [None]:
for x in df['Event']:
     if x =='Music':
        x = 1500
    else:
        x = 800