In [1]:
import pandas as pd

In [2]:
mutations_ARF9_genotyped = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/ARF9/ARF9_TFBSoverlapping_genotyped.tsv'
mutations_ARF9_genotyped_only_mutated = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/ARF9/ARF9_TFBSoverlapping_genotyped_only_mutated.tsv'
mutations_ARF18_genotyped = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/ARF18/ARF18_TFBSoverlapping_genotyped.tsv'
mutations_ARF18_genotyped_only_mutated = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/ARF18/ARF18_TFBSoverlapping_genotyped_only_mutated.tsv'
mutations_DREB26_genotyped = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/DREB26/DREB26_TFBSoverlapping_genotyped.tsv'
mutations_DREB26_genotyped_only_mutated = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/DREB26/DREB26_TFBSoverlapping_genotyped_only_mutated.tsv'
mutations_NLP7_genotyped = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/NLP7/NLP7_TFBSoverlapping_genotyped.tsv'
mutations_NLP7_genotyped_only_mutated = f'../../data/CRISPR_library/pacbio/demultiplexed/Data_Package_Batch_04_04_2022/Sam_Witham_EI_SW_ENQ-5142_A_01_Additional_Barcode_Analysis/Variant_call/NLP7/NLP7_TFBSoverlapping_genotyped_only_mutated.tsv'

In [3]:
#read in files
def read_files(genotyped_file,genotyped_only_mutated_file):
    """read in the files to pandas"""
    #read in files
    genotyped_df = pd.read_table(genotyped_file,sep='\t', header=0)
    genotyped_only_mutated_df = pd.read_table(genotyped_only_mutated_file,sep='\t', header=0)
    return genotyped_df,genotyped_only_mutated_df


In [29]:
def count_mutations(genotyped_df,genotyped_only_mutated_df):
    """function to count number of genotyped plant lines and also number of plant lines containing mutations, and then number of mutations overall"""
    #count number of unique plant IDs
    genotyped_counts = genotyped_df.plant_ID.unique()
    genotyped_only_mutated_counts = genotyped_only_mutated_df.plant_ID.unique()
    #print counts
    print(f'total plant lines sequenced={len(genotyped_counts)} \ntotal mutated plant lines={len(genotyped_only_mutated_counts)}\ntotal mutations={len(genotyped_only_mutated_df)}')
    #count number of chimeric plant lines
    unique_line_mutated = genotyped_only_mutated_df.drop_duplicates(subset='plant_ID',keep='first')
    non_chimeric = unique_line_mutated[unique_line_mutated.genotype != 'chimeric']
    homozygous = unique_line_mutated[unique_line_mutated.genotype == 'homozygous']
    heterozygous = unique_line_mutated[unique_line_mutated.genotype == 'heterozygous']
    biallelic = unique_line_mutated[unique_line_mutated.genotype == 'biallelic']
    #print bold heading
    print('\033[1m'+'Genotypes'+'\033[0m')
    print(f'number of non-chimeric mutated plant lines = {len(non_chimeric)}')
    print(f'number of homozygous mutated plant lines = {len(homozygous)}')
    print(f'number of heterozygous mutated plant lines = {len(heterozygous)}')
    print(f'number of biallelic mutated plant lines = {len(biallelic)}')
    #count number of non-chimeric insertions, deletions, substitutions (if two insertions around same guide, counted as once)
    non_chimeric_all = genotyped_only_mutated_df[genotyped_only_mutated_df.genotype.str.contains('chimeric')]
    insertions = non_chimeric_all[non_chimeric_all.mutation_type.str.contains('insertion')]
    deletions = non_chimeric_all[non_chimeric_all.mutation_type.str.contains('deletion')]
    substitutions = non_chimeric_all[non_chimeric_all.mutation_type.str.contains('substitution')]
    mean_reads_guide_sites = genotyped_df.read_number.mean()
    mean_mutation_reads_guide_sites = genotyped_only_mutated_df.read_number.mean()
    #print bold heading
    print('\033[1m'+'Mutation types'+'\033[0m')
    print(f'number of insertions = {len(insertions)}')
    print(f'number of deletions = {len(deletions)}')
    #deletions over 10bp
    deletions.deletion_positions.str.len()
    deletions_over_10bp = deletions[deletions.deletion_positions.str.len()>10]
    print(f'deletions over 10bp = {len(deletions_over_10bp)}')
    print(f'number of substitutions = {len(substitutions)}')
    print(f'mean reads at guide sites including wt = {mean_reads_guide_sites}')
    print(f'mean reads at guide sites mutations only = {mean_mutation_reads_guide_sites}')
    
    #find number of mutations overlapping FIMO predicted TFBS
    insertions_overlapping_TFBS = insertions[insertions.insertion_overlapping_TFBS_AGI.notna()]
    deletions_overlapping_TFBS = deletions[deletions.deletion_overlapping_TFBS_AGI.notna()]
    substitutions_overlapping_TFBS = substitutions[substitutions.substitution_overlapping_TFBS_AGI.notna()]
    #print bold heading
    print('\033[1m'+'Overlapping TFBSs'+'\033[0m')
    print(f'number of insertions overlapping TFBS = {len(insertions_overlapping_TFBS)}')
    print(f'number of deletions overlapping TFBS = {len(deletions_overlapping_TFBS)}')
    print(f'number of substitutions overlapping TFBS = {len(substitutions_overlapping_TFBS)}')

    #how many within 7bp of cutsite
    #insertions_7bp = insertions[[[i for i in x if int(i) <8] for x in insertions.insertion_cut_site_distance.str.strip('][').str.split(', ')]]

    #print(insertions_7bp)


    return genotyped_counts, genotyped_only_mutated_counts, genotyped_only_mutated_df

In [30]:
ARF9_genotyped_df, ARF9_mutated_df = read_files(mutations_ARF9_genotyped,mutations_ARF9_genotyped_only_mutated)
ARF9_genotyped_plant_lines, ARF9_mutated_plant_lines,ARF9_mutations = count_mutations(ARF9_genotyped_df,ARF9_mutated_df)

total plant lines sequenced=468 
total mutated plant lines=147
total mutations=784
[1mGenotypes[0m
number of non-chimeric mutated plant lines = 144
number of homozygous mutated plant lines = 100
number of heterozygous mutated plant lines = 19
number of biallelic mutated plant lines = 25
[1mMutation types[0m
number of insertions = 577
number of deletions = 185
deletions over 10bp = 101
number of substitutions = 2
mean reads at guide sites including wt = 542.8478016575361
mean reads at guide sites mutations only = 403.78188775510205
[1mOverlapping TFBSs[0m
number of insertions overlapping TFBS = 220
number of deletions overlapping TFBS = 84
number of substitutions overlapping TFBS = 0


  result = np.asarray(values, dtype=dtype)


TypeError: unhashable type: 'list'

In [169]:
ARF18_genotyped_df, ARF18_mutated_df = read_files(mutations_ARF18_genotyped,mutations_ARF18_genotyped_only_mutated)


In [170]:
#filter ARF18 guide 14 since very repetitive and lots of sequencing errors
ARF18_genotyped_only_mutated_df = pd.read_table(mutations_ARF18_genotyped_only_mutated,sep='\t', header=0)
ARF18_mutated_plant_lines_filtered = ARF18_mutated_df[ARF18_mutated_df.guide != 'ARF18_guide14']
#count mutations
ARF18_genotyped_plant_lines, ARF18_mutated_plant_lines,ARF18_mutations = count_mutations(ARF18_genotyped_df,ARF18_mutated_plant_lines_filtered)

total plant lines sequenced=373 
total mutated plant lines=67
total mutations=276
[1mGenotypes[0m
number of non-chimeric mutated plant lines = 66
number of homozygous mutated plant lines = 39
number of heterozygous mutated plant lines = 16
number of biallelic mutated plant lines = 11
[1mMutation types[0m
number of insertions = 219
number of deletions = 44
deletions over 10bp = 35
number of substitutions = 0
mean reads at guide sites including wt = 632.2754770435797
mean reads at guide sites mutations only = 363.7644927536232
[1mOverlapping TFBSs[0m
number of insertions overlapping TFBS = 179
number of deletions overlapping TFBS = 30
number of substitutions overlapping TFBS = 0


In [171]:
DREB26_genotyped_df, DREB26_mutated_df = read_files(mutations_DREB26_genotyped,mutations_DREB26_genotyped_only_mutated)
DREB26_genotyped_plant_lines, DREB26_mutated_plant_lines,DREB26_mutations = count_mutations(DREB26_genotyped_df,DREB26_mutated_df)

total plant lines sequenced=366 
total mutated plant lines=43
total mutations=288
[1mGenotypes[0m
number of non-chimeric mutated plant lines = 42
number of homozygous mutated plant lines = 24
number of heterozygous mutated plant lines = 16
number of biallelic mutated plant lines = 2
[1mMutation types[0m
number of insertions = 137
number of deletions = 115
deletions over 10bp = 57
number of substitutions = 5
mean reads at guide sites including wt = 371.8438795745448
mean reads at guide sites mutations only = 180.64236111111111
[1mOverlapping TFBSs[0m
number of insertions overlapping TFBS = 114
number of deletions overlapping TFBS = 102
number of substitutions overlapping TFBS = 4


In [172]:
NLP7_genotyped_df, NLP7_mutated_df = read_files(mutations_NLP7_genotyped,mutations_NLP7_genotyped_only_mutated)
NLP7_genotyped_plant_lines, NLP7_mutated_plant_lines,NLP7_mutations = count_mutations(NLP7_genotyped_df,NLP7_mutated_df)

total plant lines sequenced=478 
total mutated plant lines=112
total mutations=375
[1mGenotypes[0m
number of non-chimeric mutated plant lines = 108
number of homozygous mutated plant lines = 59
number of heterozygous mutated plant lines = 34
number of biallelic mutated plant lines = 15
[1mMutation types[0m
number of insertions = 248
number of deletions = 89
deletions over 10bp = 38
number of substitutions = 2
mean reads at guide sites including wt = 692.6322322598442
mean reads at guide sites mutations only = 486.6426666666667
[1mOverlapping TFBSs[0m
number of insertions overlapping TFBS = 82
number of deletions overlapping TFBS = 16
number of substitutions overlapping TFBS = 0
