# Coding Project Analysis JEM Forms


### Data Output: Date, Specimen ID, Rig Operator, Layer, Cell Type Prediction (Human), Patch Duration

In [None]:
import os
import csv
import json
import fnmatch
import numpy as np
import pandas as pd

import JEM_post_patch as pp #post patch script
#post_patch_column = 'extraction.postPatch'
#post_patch_pipette_column = 'extraction.endPipetteR'

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from pandas.io.json import json_normalize
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
#from single_cell_ephys.lims_funcs import limsquery
#from single_cell_ephys.file_funcs import validated_input

In [None]:
pd.options.display.max_columns = None  #Displays all the columns 

In [None]:
#Used csv from 5/25/2018
# Load the human and mouse "shiny" tables; later cells merge them with the JEM
# metadata on their 'sample_id' column.
# NOTE(review): both paths are absolute and specific to one user's machine;
# they must be edited before running this notebook elsewhere.
shiny_human_df = pd.read_csv('C:/users/ramr/documents/github/personal-projects/csv/shiny_human.csv')
shiny_mouse_df = pd.read_csv('C:/users/ramr/documents/github/personal-projects/csv/shiny_mouse.csv')

In [None]:
#List of common functions(In progress)

def gen_fil(df, col, cond):
    '''
    Returns the rows of a dataframe whose column value equals a condition
    df: dataframe to filter
    col: name of the column that is compared
    cond: value a row must hold in col to be kept
    '''
    # Boolean mask, True for every row to keep.
    keep_rows = df[col] == cond
    return df[keep_rows]

def gen_cat(df, col):
    '''
    Returns one column of a dataframe converted to the 'category' dtype
    df: dataframe holding the column
    col: name of the column to convert

    The dataframe itself is not modified; the converted column is returned.
    '''
    column = df[col]
    return column.astype('category')

def cell_count(P_num, df=None):
    '''
    Counts the rows whose 'Lims tube id' contains P_num (one row = one cell)
    P_num: text to look for in 'Lims tube id' (e.g. a P# user code); it is
           passed to str.contains and is therefore treated as a regular
           expression
    df: dataframe with a 'Lims tube id' column; when omitted, the
        notebook-level dataframe named df1 is used (the original behaviour)

    Returns the number of matching rows.
    '''
    if df is None:
        # Backward compatible fallback: df1 must exist at notebook level.
        df = df1
    tube_ids = df['Lims tube id']
    # na=False: rows with a missing tube id count as "no match" instead of
    # producing NaN entries that cannot be used as a boolean mask.
    matches = tube_ids.str.contains(P_num, na=False)
    return tube_ids[matches].count()

def P_user(P_num):
    '''
    Prints the cell count for a user as "Total cells: <count>"
    P_num: text forwarded to cell_count (e.g. a P# user code)
    '''
    # A single pre-formatted string prints the same text under Python 2 and
    # Python 3; print("a", b) would display a tuple under Python 2.
    print("Total cells: %s" % cell_count(P_num))

In [None]:
def get_jsons(dirname, expt):
    """Return the paths of the ``*.<expt>.json`` files found in a directory.

    Parameters
    ----------
    dirname : string
        Directory to list (not searched recursively).
    expt : string
        Experiment tag that must appear between two dots right before the
        ``json`` extension, e.g. "PS" matches ``name.PS.json``.

    Returns
    -------
    list of string
        ``dirname`` joined with each matching file name, in the order
        returned by ``os.listdir``.
    """
    pattern = '*.%s.json' %expt
    return [os.path.join(dirname, entry)
            for entry in os.listdir(dirname)
            if fnmatch.fnmatch(entry, pattern)]

In [None]:
# Make a list of paths for every *.PS.json file in the Patch-Seq
# all-metadata-files directory on the 279 workgroup share.
# NOTE(review): requires the //allen network share to be reachable.
json_list = get_jsons("//allen/programs/celltypes/workgroups/279/Patch-Seq/all-metadata-files/", "PS")
#json_list

In [None]:
def flatten_attempts(slice_info, form_version):
    """Return flattened slice metadata dataframe.

    Parameters
    ----------
    slice_info : dict
        A dictionary of slices with nested pipette attempts.

    form_version : string
        A string containing the JEM form version.
        (Pre-version 2 contains IVSCC, PatchSeq and Electroporation arrays)
        Major version 2 or later reads the "pipettes" array; anything
        earlier reads "pipettesPatchSeqPilot".

    Returns
    -------
    attempts_slice_df : pandas dataframe or None
        A dataframe of all pipette attempts along with all slice metadata,
        one row per attempt with a 1-based "attempt" column. None is
        returned when flattening raises an IndexError.

    Raises
    ------
    KeyError
        If the expected pipette array is missing from ``slice_info``.
    """

    # Prefer the top-level pandas function where it exists; older pandas only
    # offers the name imported from pandas.io.json at the top of the file.
    normalize = getattr(pd, "json_normalize", None) or json_normalize

    # Compare the major version as a number: a plain string comparison would
    # rank "10.0.0" below "2".
    try:
        is_v2_or_later = int(str(form_version).split(".")[0]) >= 2
    except ValueError:
        # Non-numeric version text: keep the original string comparison.
        is_v2_or_later = str(form_version) >= "2"
    ps_array_name = "pipettes" if is_v2_or_later else "pipettesPatchSeqPilot"

    df = normalize(slice_info)
    try:
        attempts_df = normalize(slice_info[ps_array_name])
        # Tag every attempt with its slice so the two frames can be joined.
        attempts_df["limsSpecName"] = df["limsSpecName"].values[0]
        attempts_df["attempt"] = attempts_df.index.values + 1
        attempts_slice_df = pd.merge(df, attempts_df, how="outer", on="limsSpecName")
        # The nested array is now expanded into rows; drop the raw column.
        attempts_slice_df = attempts_slice_df.drop(ps_array_name, axis=1)
    except IndexError:
        # Best effort, as before: this slice contributes no rows.
        return None
    return attempts_slice_df

def is_field(df, colname):
    """Report whether ``df[colname]`` can be looked up without a KeyError.

    Parameters
    ----------
    df : a Pandas dataframe
        Any object indexed with ``[]`` works the same way, e.g. a dict.
    colname : string
        Column name (or key) to look up.

    Returns
    -------
    Boolean
        True when the lookup succeeds, False when it raises KeyError.
    """

    try:
        df[colname]
    except KeyError:
        return False
    else:
        return True

### json_df

In [None]:
# Flatten every JEM json file and stack the per-slice frames into json_df.
flat_frames = []

for json_path in json_list:
    # Only the parsing needs the file handle; close it before flattening.
    with open(json_path) as data_file:
        slice_info = json.load(data_file)
    # Files without a "formVersion" key are treated as version 1.0.0.
    if is_field(slice_info, "formVersion"):
        jem_version = slice_info["formVersion"]
    else:
        jem_version = "1.0.0"
    flat_df = flatten_attempts(slice_info, jem_version)
    # flatten_attempts can return None for a slice; such slices add no rows.
    if flat_df is not None:
        flat_frames.append(flat_df)

# Concatenate once instead of re-copying the growing frame for every file.
# An empty dataframe is kept as the result when nothing was loaded.
json_df = pd.concat(flat_frames, axis=0) if flat_frames else pd.DataFrame()

json_df.head()

In [None]:
# Keep only the first 8 characters of both timestamp strings
# (presumably the HH:MM:SS part - TODO confirm against the JEM form output).
for time_col in ('extraction.timeRetractionEnd', 'recording.timeWholeCellStart'):
    json_df[time_col] = json_df[time_col].str[:8]

In [None]:
# Patch duration = retraction end time minus whole-cell start time.
# NOTE(review): both columns hold time-only strings at this point, so a
# recording that spans midnight would presumably yield a negative duration.
json_df['patch_duration'] = pd.to_datetime(json_df['extraction.timeRetractionEnd']) - pd.to_datetime(json_df['recording.timeWholeCellStart'])

In [None]:
# Dividing the time difference by 60 yields a difference 1/60 as long, so the
# whole minutes of the real duration end up in the seconds field.
# NOTE(review): .dt.total_seconds() / 60 would give plain numeric minutes.
json_df['patch_duration'] = json_df['patch_duration']/60

In [None]:
# Store the scaled duration as text; it can no longer be used in arithmetic.
json_df['patch_duration'] = json_df['patch_duration'].astype('str')

In [None]:
#json_df1['patch_duration'] = json_df1['patch_duration'].str[6:]

In [None]:
json_df.info()

### Fixing up date column & adding post patch column

In [None]:
# Turn 'date' into a sorted datetime index, add the post-patch
# classification, then keep successful attempts from start_date onwards.
json_df['date'] = json_df['date'].str[:10] #Keep the first 10 characters (presumably YYYY-MM-DD), dropping the time
json_df['date'] = pd.to_datetime(json_df['date']) #Convert the date strings to datetime values
json_df.set_index('date', inplace = True)
json_df = pp.postpatch_reclass(json_df) #Post patch script; the 'post_patch' column is read by later cells
json_df.sort_index(inplace = True) #Sort by date so the date slice below works on an ordered index

#start_date = input('Enter the start date for your dataframe (YYYY-MM-DD): ')
start_date = '2017-10-01'
json_df1 = json_df[start_date:] #Rows dated start_date or later
json_df1 = gen_fil(json_df1, 'status', 'SUCCESS') #Keep only rows whose status is exactly 'SUCCESS'
json_df1.sort_index(inplace = True) #Sort the date index again just in case

In [None]:
json_df1.head()

In [None]:
json_df1.tail()

### Filter for Human & Mouse dataframe

In [None]:
# Flag specimens whose name starts with "H" followed by two digits
# (str.match anchors the pattern at the start of the string).
json_df1['human?'] = json_df1['limsSpecName'].str.match(r"H\d\d") #Creates new boolean column

# gen_fil keeps rows equal to True / False, so a row whose flag is missing
# (no limsSpecName) ends up in neither dataframe.
json_df2 = gen_fil(json_df1, 'human?', True) #HUMAN
json_df3 = gen_fil(json_df1, 'human?', False) #MOUSE
json_df1.head()

## HUMAN dataframe

In [None]:
# Build the human table: move the 'date' index back into a regular column,
# keep the columns of interest, then join with the shiny human table.
json_df2 = json_df2.reset_index()
human_df = json_df2[['date',
                     'limsSpecName',
                     'rigOperator',
                     'approach.manualRoi',
                     'manualRoi', 
                     'recording.humanCellTypePrediction',
                     'post_patch',
                     'extraction.tubeID']]

# The index was just reset, so this sort presumably leaves the order unchanged.
human_df.sort_index(inplace = True)
#approach.manualRoi data is available from 2017-10-10 to 2017-11-27
#manualRoi data is available from 2017-11-27 to present
#extraction.tubeID changes from ### to User_Date_###_A01 at 2017-11-27

# Keep only the shiny columns needed for the merge and the final table.
shiny_human_df = shiny_human_df.loc[:,['sample_id',
                                    'cell_name_label',
                                    'res_index_label']]

# Inner join: only cells whose tube ID appears as a shiny sample_id are kept.
final_H_df = pd.merge(left = human_df, 
                      right = shiny_human_df, 
                      left_on = 'extraction.tubeID', 
                      right_on = 'sample_id', 
                      how = 'inner')

In [None]:
final_H_df.head()

In [None]:
final_H_df.tail()

In [None]:
human_df['post_patch'].value_counts()

In [None]:
final_H_df['post_patch'].value_counts()

## MOUSE dataframe

In [None]:
# Build the mouse table: move the 'date' index back into a regular column,
# keep the columns of interest, then join with the shiny mouse table.
json_df3 = json_df3.reset_index()
mouse_df = json_df3[['date',
                     'limsSpecName',
                     'rigOperator',
                     'approach.manualRoi',
                     'manualRoi', 
                     'post_patch',
                     'extraction.tubeID']]

# The index was just reset, so this sort presumably leaves the order unchanged.
mouse_df.sort_index(inplace = True)
#approach.manualRoi data is available from 2017-10-02 to 2017-11-10
#manualRoi data is available from 2017-11-10 to present
#extraction.tubeID changes from ### to User_Date_###_A01 at 2017-11-10

# Keep only the shiny columns needed for the merge and the final table.
shiny_mouse_df = shiny_mouse_df.loc[:,['sample_id',
                                       'cell_name_label',
                                       'res_index_label']]

# Inner join: only cells whose tube ID appears as a shiny sample_id are kept.
final_M_df = pd.merge(left = mouse_df, 
                      right = shiny_mouse_df, 
                      left_on = 'extraction.tubeID', 
                      right_on = 'sample_id', 
                      how = 'inner')

In [None]:
final_M_df.head()

In [None]:
final_M_df.tail()

In [None]:
mouse_df['post_patch'].value_counts()

In [None]:
final_M_df['post_patch'].value_counts()

## Extra

In [None]:
def nuc_count(df, user='PA'):
    '''
    Prints, for one user, how many cells fall in each post patch category
    df: dataframe with 'post_patch' and 'extraction.tubeID' columns
    user: text that must occur in 'extraction.tubeID' for a row to be
          counted (default 'PA', the value previously hard-coded)

    One line is printed per category, in a fixed order. Nothing is returned.
    '''
    #user = str(input('Enter the user number (P#): '))

    # (printed label, text searched for in 'post_patch'), in output order.
    categories = (('Nucleated:', 'Nuc-high seal'),
                  ('Partial nucleated', 'Nuc-low seal'),
                  ('Outside-out:', 'No-high seal'),
                  ('No seal:', 'No-low seal'),
                  ('Entire cell:', 'Entire cell'))

    # na=False: rows with a missing value never match, instead of producing
    # NaN entries that cannot be combined into a boolean mask.
    by_user = df['extraction.tubeID'].str.contains(user, na=False)
    post_patch = df['post_patch']

    for label, category in categories:
        matches = post_patch.str.contains(category, na=False) & by_user
        # A single formatted string prints the same text under Python 2 and 3.
        print('%s %s' % (label, int(matches.sum())))

In [None]:
nuc_count(mouse_df)

In [None]:
nuc_count(final_M_df)

In [None]:
nuc_count(human_df)

In [None]:
nuc_count(final_H_df)

In [None]:
#Problem: index date column doesn't show dates after forming new combined df with shiny