In [None]:
#Running this cell imports the libraries needed to format the dataset.

import pandas as pd
import numpy as np

In [None]:
##Loading Mutation Dataset and Relevant Information for Processing
#
#Builds one row per variant of the library: the full amino-acid sequence
#(one column per residue index), plus the measured biophysical property
#normalized so that `variant_for_normalization` == 1.0.
#Result: df_seq_chen, indexed by 'Variant ID'.

variant_for_normalization = 'GCaMP6s' #@param {type:"string"}

variant_column_name ='GCaMP3 Variant' #@param {type:"string"}

biophysical_property = '1 AP !F/F0' #@param {type:"string"}

biophysical_property_final = '1 AP ∆F/F0' #@param {type:"string"}

mutations_column = 'Mutations added to GCaMP3' #@param {type:"string"}

base_variant_sequence = 'MRGSHHHHHHGMASMTGGQQMGRDLYDDDDKDLATMVDSSRRKWNKTGHAVRAIGRLSSLENVYIKADKQKNGIKANFKIRHNIEDGGVQLAYHYQQNTPIGDGPVLLPDNHYLSVQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKGGTGGSMVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYIQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNTRDQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGDGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK' #@param {type:"string"}

csv_file_path = 'backend_data/GCaMP6_lib.csv' #@param {type:"string"}

#read the mutation library once; chen_data is a working copy so that
#variant_library keeps the table exactly as it was loaded
variant_library = pd.read_csv(csv_file_path, header = 0)
chen_data = variant_library.copy()

#split every entry of the mutations column into a list of mutation tokens
#(e.g. 'K78H T381R' -> ['K78H', 'T381R']); the 'base' construct, and any
#entry that yields no tokens at all, is stored as [''] meaning "no mutations".
#A new Series is built instead of writing lists back into chen_data.
mutation_lists = []
for entry in chen_data[mutations_column]:
    txt_mut = str(entry)
    tokens = [] if txt_mut == 'base' else txt_mut.split()
    mutation_lists.append(tokens if tokens else [''])
mutation_data = pd.Series(mutation_lists, index = chen_data.index,
                          name = mutations_column)

#saves the mutation information with the variants to which they belong...
#in addition to the biophysical characteristic
mutation_df = pd.DataFrame({'Mutations': mutation_data,
                              'Variant ID': chen_data[variant_column_name],
                              biophysical_property_final:
                              chen_data[biophysical_property]})

#columns for output dataframe: one per residue of the base sequence
#(derived from the sequence length rather than hard-coded), then the
#variant key and the biophysical property
cols = list(np.arange(0, len(base_variant_sequence)))
cols.append('Variant ID')
cols.append(biophysical_property_final)

##Formatting Mutation Strings To Residue/Amino Acid Format

list_of_mutation_location = [] #residue indices, one list per variant
list_of_mutation_aa = [] #substituted amino acids, one list per variant
for variant_mutations in mutation_df['Mutations']:
    residue_list = []
    aa_change_list = []
    for mutation in variant_mutations:
        #all digits of the token joined together give the residue location
        numeric_string = "".join(filter(str.isdigit, mutation))
        if numeric_string == "":
            #token carries no residue number (e.g. the base construct)
            residue_list.append('')
            aa_change_list.append('')
        else:
            residue_list.append(int(numeric_string))
            #last character of the token is the amino acid mutated to
            aa_change_list.append(mutation[-1])
    #invariant: len(residue_list) == len(aa_change_list) == len(variant_mutations)
    list_of_mutation_location.append(residue_list)
    list_of_mutation_aa.append(aa_change_list)

#keep the parsed mutations as Series aligned with mutation_df's index
appending_list_mutation_location = pd.Series(list_of_mutation_location, index = mutation_df.index)
appending_list_mutation_aa = pd.Series(list_of_mutation_aa, index = mutation_df.index)

#apply each variant's mutations to a fresh copy of the base sequence.
#Rows are collected in a list and turned into a DataFrame once, instead of
#concatenating one single-row frame per variant (quadratic).
sequence_rows = []
for locations, amino_acids, variant_id, measured_value in zip(
        list_of_mutation_location,
        list_of_mutation_aa,
        mutation_df['Variant ID'],
        mutation_df[biophysical_property_final]):
    residues = list(base_variant_sequence)
    for position, amino_acid in zip(locations, amino_acids):
        #tokens without a residue number are skipped one by one, so a
        #digit-less token can neither crash the indexing nor hide the
        #valid mutations listed next to it
        if not isinstance(position, int):
            continue
        #the residue number is used directly as the list index
        #(index 78 of the base sequence is the K of K78H)
        residues[position] = amino_acid
    residues.append(variant_id) #the variant's primary key
    residues.append(measured_value) #the variant's dependent information
    sequence_rows.append(residues)

df_seq = pd.DataFrame(sequence_rows, columns = cols)

#renormalize the dataset so that variant_for_normalization == 1.0
reference_values = df_seq.loc[df_seq['Variant ID'] == variant_for_normalization,
                              biophysical_property_final]
if reference_values.empty:
    raise ValueError('Normalization variant ' + repr(variant_for_normalization)
                     + ' not found in column ' + repr(variant_column_name))
g6s_data = reference_values.values[0]
#divide the biophysical property column by the reference value
df_seq[biophysical_property_final] = df_seq[biophysical_property_final]/g6s_data

df_seq = df_seq.set_index('Variant ID')
df_seq_chen = df_seq
display(df_seq_chen)


#one output row per input variant, one column per residue plus the
#property column, and unique variant IDs
assert len(mutation_df) == len(df_seq)
assert len(df_seq.columns) == len(base_variant_sequence) + 1
assert len(np.unique(df_seq.index.values)) == len(df_seq)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,442,443,444,445,446,447,448,449,450,1 AP ∆F/F0
Variant ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCaMP3,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.064977
GCaMP5G,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.124107
GCaMP6s,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,1.000000
GCaMP6m,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.862248
GCaMP6f,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.565952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.067576
694,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.091618
695,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.019493
696,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.104613


In [None]:
##Loading Mutation Dataset and Relevant Information for Processing
#
#Builds one row per variant of the library: the full amino-acid sequence
#(one column per residue index), plus the measured biophysical property
#normalized so that `variant_for_normalization` == 1.0.
#Result: df_seq_dana, indexed by 'Variant ID'.

variant_for_normalization = 'GCaMP6s' #@param {type:"string"}

variant_column_name ='GCaMP6s variant' #@param {type:"string"}

biophysical_property = '1 AP _F/F0' #@param {type:"string"}

biophysical_property_final = '1 AP ∆F/F0' #@param {type:"string"}

mutations_column = 'Mutations added to GCaMP6s' #@param {type:"string"}

base_variant_sequence = 'MRGSHHHHHHGMASMTGGQQMGRDLYDDDDKDLATMVDSSRRKWNKTGHAVRAIGRLSSLENVYIKADKQKNGIKANFHIRHNIEDGGVQLAYHYQQNTPIGDGPVLLPDNHYLSVQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKGGTGGSMVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYIQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNLPDQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGDGTIDFPEFLTMMARKMKYRDTEEEIREAFGVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK' #@param {type:"string"}
csv_file_path = 'backend_data/gcamp7_lib.csv' #@param {type:"string"}

#read the mutation library (csv_file_path) once; dana_data is a working
#copy so that variant_library keeps the table exactly as it was loaded
variant_library = pd.read_csv(csv_file_path, header = 0)
dana_data = variant_library.copy()

#split every entry of the mutations column into a list of mutation tokens;
#the 'base' construct, and any entry that yields no tokens at all, is
#stored as [''] meaning "no mutations".
#A new Series is built instead of writing lists back into dana_data.
mutation_lists = []
for entry in dana_data[mutations_column]:
    txt_mut = str(entry)
    tokens = [] if txt_mut == 'base' else txt_mut.split()
    mutation_lists.append(tokens if tokens else [''])
mutation_data = pd.Series(mutation_lists, index = dana_data.index,
                          name = mutations_column)

#saves the mutation information with the variants to which they belong...
#in addition to the biophysical characteristic
mutation_df = pd.DataFrame({'Mutations': mutation_data,
                              'Variant ID': dana_data[variant_column_name],
                              biophysical_property_final:
                              dana_data[biophysical_property]})

#columns for output dataframe: one per residue of the base sequence
#(derived from the sequence length rather than hard-coded), then the
#variant key and the biophysical property
cols = list(np.arange(0, len(base_variant_sequence)))
cols.append('Variant ID')
cols.append(biophysical_property_final)

##Formatting Mutation Strings To Residue/Amino Acid Format

list_of_mutation_location = [] #residue indices, one list per variant
list_of_mutation_aa = [] #substituted amino acids, one list per variant
for variant_mutations in mutation_df['Mutations']:
    residue_list = []
    aa_change_list = []
    for mutation in variant_mutations:
        #all digits of the token joined together give the residue location
        numeric_string = "".join(filter(str.isdigit, mutation))
        if numeric_string == "":
            #token carries no residue number (e.g. the base construct)
            residue_list.append('')
            aa_change_list.append('')
        else:
            residue_list.append(int(numeric_string))
            #last character of the token is the amino acid mutated to
            aa_change_list.append(mutation[-1])
    #invariant: len(residue_list) == len(aa_change_list) == len(variant_mutations)
    list_of_mutation_location.append(residue_list)
    list_of_mutation_aa.append(aa_change_list)

#keep the parsed mutations as Series aligned with mutation_df's index
appending_list_mutation_location = pd.Series(list_of_mutation_location, index = mutation_df.index)
appending_list_mutation_aa = pd.Series(list_of_mutation_aa, index = mutation_df.index)

#apply each variant's mutations to a fresh copy of the base sequence.
#Rows are collected in a list and turned into a DataFrame once, instead of
#concatenating one single-row frame per variant (quadratic).
sequence_rows = []
for locations, amino_acids, variant_id, measured_value in zip(
        list_of_mutation_location,
        list_of_mutation_aa,
        mutation_df['Variant ID'],
        mutation_df[biophysical_property_final]):
    residues = list(base_variant_sequence)
    for position, amino_acid in zip(locations, amino_acids):
        #tokens without a residue number are skipped one by one, so a
        #digit-less token can neither crash the indexing nor hide the
        #valid mutations listed next to it
        if not isinstance(position, int):
            continue
        #the residue number is used directly as the list index
        residues[position] = amino_acid
    residues.append(variant_id) #the variant's primary key
    residues.append(measured_value) #the variant's dependent information
    sequence_rows.append(residues)

df_seq = pd.DataFrame(sequence_rows, columns = cols)

#renormalize this dataset so that variant_for_normalization == 1.0
reference_values = df_seq.loc[df_seq['Variant ID'] == variant_for_normalization,
                              biophysical_property_final]
if reference_values.empty:
    raise ValueError('Normalization variant ' + repr(variant_for_normalization)
                     + ' not found in column ' + repr(variant_column_name))
g6s_data = reference_values.values[0]
#divide the biophysical property column by the reference value
df_seq[biophysical_property_final] = df_seq[biophysical_property_final]/g6s_data


df_seq = df_seq.set_index('Variant ID')
df_seq_dana = df_seq
display(df_seq_dana)

#one output row per input variant, one column per residue plus the
#property column, and unique variant IDs
assert len(mutation_df) == len(df_seq)
assert len(df_seq.columns) == len(base_variant_sequence) + 1
assert len(np.unique(df_seq.index.values)) == len(df_seq)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,442,443,444,445,446,447,448,449,450,1 AP ∆F/F0
Variant ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCaMP3,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.20
GCaMP5G,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.37
GCaMP6s,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,1.00
GCaMP6f,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.67
10.699,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10.1608,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,1.59
10.1609,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,1.33
10.161,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,3.08
10.1611,M,R,G,S,H,H,H,H,H,H,...,E,F,V,Q,M,M,T,A,K,2.29


In [None]:
## Combining the Two Datasets:
#concatenate the two sequence libraries.
#pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
#with default arguments it stacks the rows and keeps both indexes unchanged.
#The first row of df_seq_dana is left out, exactly as before.
#NOTE(review): presumably because that variant is already in df_seq_chen - confirm.
combined_df = pd.concat([df_seq_chen, df_seq_dana.iloc[1:]])
#expose the index (variant IDs) as a regular column as well
combined_df['Variant ID'] = combined_df.index
print(len(combined_df))

1111


In [None]:
#show the combined library (a bare last expression renders the DataFrame)
combined_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,443,444,445,446,447,448,449,450,1 AP ∆F/F0,Variant ID
Variant ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCaMP3,M,R,G,S,H,H,H,H,H,H,...,F,V,Q,M,M,T,A,K,0.064977,GCaMP3
GCaMP5G,M,R,G,S,H,H,H,H,H,H,...,F,V,Q,M,M,T,A,K,0.124107,GCaMP5G
GCaMP6s,M,R,G,S,H,H,H,H,H,H,...,F,V,Q,M,M,T,A,K,1.000000,GCaMP6s
GCaMP6m,M,R,G,S,H,H,H,H,H,H,...,F,V,Q,M,M,T,A,K,0.862248,GCaMP6m
GCaMP6f,M,R,G,S,H,H,H,H,H,H,...,F,V,Q,M,M,T,A,K,0.565952,GCaMP6f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10.1608,M,R,G,S,H,H,H,H,H,H,...,F,V,Q,M,M,T,A,K,1.590000,10.1608
10.1609,M,R,G,S,H,H,H,H,H,H,...,F,V,Q,M,M,T,A,K,1.330000,10.1609
10.161,M,R,G,S,H,H,H,H,H,H,...,F,V,Q,M,M,T,A,K,3.080000,10.161
10.1611,M,R,G,S,H,H,H,H,H,H,...,F,V,Q,M,M,T,A,K,2.290000,10.1611


In [None]:
## Duplicate Values Check
#
#Variants whose full sequences are identical are collapsed into a single
#row whose property value is the mean over the group and whose
#'Variant ID' is the list of all variant IDs in the group.
#Result: df_seq, with no two rows sharing the same sequence.

#the residue columns (one per position of the base sequence)
cols_1 = list(np.arange(0,len(base_variant_sequence)))

#isolate rows whose sequence occurs more than once; .copy() makes this an
#independent frame so adding a column below does not write into a slice
#of combined_df (the source of the SettingWithCopyWarning)
duplicated_seq_df = combined_df[combined_df.duplicated(cols_1, keep = False)].copy()

if (len(duplicated_seq_df))>0:
    print('Found '+str(len(duplicated_seq_df))+ ' duplicated rows! Cleaning up data now!')

    #collapse the residue columns into one string per row to group on
    duplicated_seq_df['full seq'] = ["".join(residues) for residues in
                                     duplicated_seq_df[cols_1].to_numpy()]

    #column layout of the aggregated rows: residues, variant key, property
    aggregated_columns = cols_1 + ['Variant ID', biophysical_property_final]

    #one aggregated row per group of identical sequences
    #(groups are visited in sorted order of the sequence string)
    aggregated_rows = []
    for _, duplicated_sequences in duplicated_seq_df.groupby('full seq'):
        #the new variant ID is the list of all IDs sharing this sequence
        new_variant_id = list(duplicated_sequences['Variant ID'].values)
        #average performance over all the duplicated variants
        averaged_prop = np.mean(
            list(duplicated_sequences[biophysical_property_final].values))
        #the sequence itself (identical within the group) + appended information
        sequence_list = list(duplicated_sequences[cols_1].iloc[0].values)
        sequence_list.append(new_variant_id)
        sequence_list.append(averaged_prop)
        aggregated_rows.append(sequence_list)

    #build the aggregated frame once instead of concatenating inside the loop
    new_df = pd.DataFrame(aggregated_rows, columns = aggregated_columns)

    #pd.concat replaces DataFrame.append (removed in pandas 2.0). The
    #aggregated rows come last, so keep='last' retains them and drops the
    #original duplicated rows.
    #NOTE(review): aggregated rows keep new_df's integer index, not a variant ID.
    df_seq = pd.concat([combined_df, new_df])
    df_seq = df_seq.drop_duplicates(cols_1, keep = 'last')

    print('Cleaned up dataset:')
    print(df_seq)
else:
    print('No duplicate rows found!')
    df_seq = combined_df


#no sequence may appear twice; columns are residues + property + 'Variant ID'
assert len(df_seq[df_seq.duplicated(cols_1, keep = False)]) == 0
assert len(df_seq.columns) == len(base_variant_sequence) + 2

Found 58 duplicated rows! Cleaning up data now!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicated_seq_df['full seq'] = ["".join(list(duplicated_seq_df[cols_1]


Cleaned up dataset:
         0  1  2  3  4  5  6  7  8  9  ... 443 444 445 446 447 448 449 450  \
GCaMP3   M  R  G  S  H  H  H  H  H  H  ...   F   V   Q   M   M   T   A   K   
GCaMP6m  M  R  G  S  H  H  H  H  H  H  ...   F   V   Q   M   M   T   A   K   
3        M  R  G  S  H  H  H  H  H  H  ...   F   V   Q   M   M   T   A   K   
30       M  R  G  S  H  H  H  H  H  H  ...   F   V   Q   M   M   T   A   K   
31       M  R  G  S  H  H  H  H  H  H  ...   F   V   Q   M   M   T   A   K   
...     .. .. .. .. .. .. .. .. .. ..  ...  ..  ..  ..  ..  ..  ..  ..  ..   
20       M  R  G  S  H  H  H  H  H  H  ...   F   V   Q   M   M   T   A   K   
21       M  R  G  S  H  H  H  H  H  H  ...   F   V   Q   M   M   T   A   K   
22       M  R  G  S  H  H  H  H  H  H  ...   F   V   Q   M   M   T   A   K   
23       M  R  G  S  H  H  H  H  H  H  ...   F   V   Q   M   M   T   A   K   
24       M  R  G  S  H  H  H  H  H  H  ...   F   V   Q   M   M   T   A   K   

        1 AP ∆F/F0                         

In [None]:
#Data Saving
#write the cleaned, combined library to a CSV file (relative path)
output_csv_path = 'combined_dataset.csv'
df_seq.to_csv(output_csv_path)