In [1]:
# packages and scripts
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import re
import scripts.functions as sf

# Load the three raw datasets and tag every row with its experimental condition.

def _read_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # NOTE: pickle can execute arbitrary code on load; only read trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _dreadd_condition(animal):
    """Condition label for an animal ID in the DREADD dataset."""
    if animal in ('AMA369', 'AMA422', 'AMA423', 'AMA424'):
        return 'GH DREADDs'
    if animal in ('AMA370', 'AMA371'):
        return 'GH'
    # every other animal in this dataset is labelled ISO DREADDs
    return 'ISO DREADDs'


def _baseline_condition(animal):
    """Condition label for an animal ID in the baseline dataset (None if unlisted)."""
    if animal in ('AMA448', 'AMA449'):
        return 'GH'
    if animal in ('AMA450', 'AMA451', 'AMA391'):
        return 'ISO'
    return None


def _new_condition(animal):
    """Condition label for an animal ID in the new dataset."""
    return 'GH' if animal in ('AMA461', 'AMA459') else 'ISO'


# read the raw pickles
base_data = _read_pickle('data/raw/baseline_data.pkl')
dreadd_data = _read_pickle('data/raw/dreadd_data.pkl')
new_data = _read_pickle('data/raw/new_data.pkl')

# add the condition column, derived from the 'Animal' ID
dreadd_data['condition'] = dreadd_data['Animal'].apply(_dreadd_condition)
base_data['condition'] = base_data['Animal'].apply(_baseline_condition)
new_data['condition'] = new_data['Animal'].apply(_new_condition)

# the baseline data contains duplicate rows for AMA391, so drop exact duplicates
base_data = base_data.drop_duplicates()

# keep the datasets in a list so later cells can loop over them
datasets = [base_data, dreadd_data, new_data]


  base_data = pickle.load(f)
  dreadd_data = pickle.load(f)
  new_data = pickle.load(f)


In [2]:
# Pass every dataset through sf.consistent, then stack the results into one frame.
# (Presumably sf.consistent harmonises column names across datasets, since later
# cells read lowercase 'animal' where cell 1 used 'Animal'. Confirm in scripts/functions.py.)
datasets = [sf.consistent(dataset) for dataset in datasets]
binded_data = pd.concat(datasets)

## write the combined frame to a pickle file
with open('data/raw/binded_data.pkl', 'wb') as out_file:
    pickle.dump(binded_data, out_file)

In [3]:
# Remove Alfredo's mice (animal IDs containing 'GG'); work on a copy so that
# binded_data itself is left untouched.
is_alfredo_mouse = binded_data['animal'].str.contains('GG')
data = binded_data[~is_alfredo_mouse].copy()

# mice and blocks to loop over later ('NoRule' is not kept as a block)
mice = data['animal'].unique()
blocks = [block for block in data['block'].unique() if block != 'NoRule']

# discard rows that have no value in the 'correct?' column
data = data.dropna(subset=['correct?'])

# Make stimulus names consistent for the model.
# ORDER MATTERS: each entry is a regex applied to substrings of every cell,
# one after the other. E.g. 'Car' is a substring of 'Cardboard', so the
# 'Car' -> 'Card' step produces 'Carddboard', which the next step collapses
# to 'Card'; the final 'Cardd' -> 'Card' step repairs values that were
# already 'Card' before the 'Car' step doubled their 'd'.
stimulus_fixes = [
    ('Metal Strip', 'MetalStrip'),
    ('Metal strip', 'MetalStrip'),
    ('Poppuri', 'Popurri'),
    ('Car', 'Card'),
    ('Carddboard', 'Card'),
    ('WhBedd', 'WhBed'),
    ('Whbed', 'WhBed'),
    ('Turmeric', 'Tumeric'),
    ('Tumeric ', 'Tumeric'),
    ('Cardd', 'Card'),
]
for pattern, fixed_name in stimulus_fixes:
    data = data.replace(pattern, fixed_name, regex = True)

# run the cleaning function, then standardize the features
clean_data = sf.standardize_features(sf.clean_ied(data))

# to drop the 2nd day of each block, uncomment the next line
#clean_data = clean_data[clean_data['day'] == '1']

# save the clean tabular data in csv format
clean_data.to_csv('data/clean/clean_data.csv', index = False)

In [4]:
## Build the datalists used by the RL models, per group and for all mice

def _mice_where(row_mask):
    """Return the unique 'mouse' values of the clean_data rows selected by `row_mask`."""
    return clean_data.loc[row_mask, 'mouse'].unique()


def _datalist_for(mouse_ids):
    """Run sf.make_ast_model_data on clean_data once per mouse, in the given order."""
    return [sf.make_ast_model_data(clean_data, mouse) for mouse in mouse_ids]


## split the mice by group
# condition labels pooled into the GH and ISO control groups
ghc = ['GH', 'GH mCherry']
isoc = ['ISO', 'ISO mCherry']

condition = clean_data['condition']
mice_ghd = _mice_where(condition == 'GH DREADDs')
mice_ghc = _mice_where(condition.isin(ghc))
mice_isod = _mice_where(condition == 'ISO DREADDs')
mice_isoc = _mice_where(condition.isin(isoc))

# every mouse, regardless of group
mice = clean_data['mouse'].unique()

## wrangle the data into the per-mouse format the RL models consume
clean_model_data_ghd = _datalist_for(mice_ghd)
clean_model_data_isod = _datalist_for(mice_isod)
clean_model_data_ghc = _datalist_for(mice_ghc)
clean_model_data_isoc = _datalist_for(mice_isoc)

# every mouse
clean_model_data = _datalist_for(mice)

## write each datalist to its own pickle file (group files first, then all mice)
datalist_files = {
    'data/modelling/datalists/ghd_r0.pkl': clean_model_data_ghd,
    'data/modelling/datalists/isod_r0.pkl': clean_model_data_isod,
    'data/modelling/datalists/ghc_r0.pkl': clean_model_data_ghc,
    'data/modelling/datalists/isoc_r0.pkl': clean_model_data_isoc,
    'data/modelling/datalists/all_r0.pkl': clean_model_data,
}
for path, datalist in datalist_files.items():
    with open(path, 'wb') as file:
        pickle.dump(datalist, file)

In [5]:
# Run the project's sanity check on the all-mice datalist built in the previous cell.
# The recorded output below shows two "Passed" messages (last 8 choices rewarded,
# initial dimension coding); the exact checks live in scripts/functions.py.
sf.sanity_check(clean_model_data)

Passed: Last 8 choices are all rewarded!
Passed: Inital dimension coded correctly!
