In [2]:
# mount google drive

from google.colab import drive
drive.mount('/content/drive')

%cd '/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from collections import Counter
import time

%matplotlib inline

In [None]:
pwd

'/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics'

In [None]:
out_file_name = "data/processed/arabidopsis_processed_half.csv"
downsize = True

Load Data

In [None]:
column_names = ["chromosome", 
                "start_coord", 
                "end_coord", 
                "sequence", 
                "raw_control_coverage", 
                "raw_treatment_coverage", 
                "norm_control_coverage", 
                "norm_treatment_coverage"]

In [None]:
df = pd.read_csv('data/raw/athal_istarr_hidra.tsv', sep="\t", header=None)
df.columns = column_names

Get rid of rows from the cellular organelles

In [None]:
df.chromosome.unique()

array(['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'chloroplast',
       'mitochondria'], dtype=object)

In [None]:
df = df[df.chromosome.isin(["Chr"+str(i) for i in range(1,6)])]  # keep rows with "Chr" in the chromosome column

In [None]:
df

Unnamed: 0,chromosome,start_coord,end_coord,sequence,raw_control_coverage,raw_treatment_coverage,norm_control_coverage,norm_treatment_coverage
0,Chr1,0,145,CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATCC...,18,6,0.37,0.30
1,Chr1,5,150,AACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATCCCTAAA...,18,6,0.37,0.30
2,Chr1,10,155,TAAACCCTAAACCCTAAACCTCTGAATCCTTAATCCCTAAATCCCT...,18,7,0.37,0.35
3,Chr1,15,160,CCTAAACCCTAAACCTCTGAATCCTTAATCCCTAAATCCCTAAATC...,19,7,0.39,0.35
4,Chr1,20,165,ACCCTAAACCTCTGAATCCTTAATCCCTAAATCCCTAAATCTTTAA...,19,8,0.39,0.40
...,...,...,...,...,...,...,...,...
23829123,Chr5,26975335,26975480,CATGTTAGAAACACTTATACAAGACACTTTATGCAAACTCATAAAC...,39,13,0.81,0.65
23829124,Chr5,26975340,26975485,TAGAAACACTTATACAAGACACTTTATGCAAACTCATAAACCCTAC...,36,13,0.74,0.65
23829125,Chr5,26975345,26975490,ACACTTATACAAGACACTTTATGCAAACTCATAAACCCTACGGTTT...,35,11,0.72,0.55
23829126,Chr5,26975350,26975495,TATACAAGACACTTTATGCAAACTCATAAACCCTACGGTTTAGAGT...,33,11,0.68,0.55


Keep only every nth row

In [None]:
if downsize:
  df = df.iloc[::2, :]  # select every nth row

Get rid of rows with nonstandard characters

In [None]:
odds = [s for s in list(set("".join(df.sequence))) if s not in ["A", "T", "C", "G"]]
odds

['D', 'M', 'Y', 'N', 'R', 'K', 'S', 'W']

In [None]:
mask = df.sequence.str.contains("|".join(odds))  # true if contains weird characters, false if contains only ATCG
df = df[np.logical_not(mask)]  # keep only rows without weird characters

In [None]:
df

Unnamed: 0,chromosome,start_coord,end_coord,sequence,raw_control_coverage,raw_treatment_coverage,norm_control_coverage,norm_treatment_coverage
0,Chr1,0,145,CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATCC...,18,6,0.37,0.30
16,Chr1,80,225,AATCCCTAAATACCTAATTCCCTAAACCCGAAACCGGTTTCTCTGG...,24,9,0.50,0.45
32,Chr1,160,305,CGTTTTTATGTAATTGCTTATTGTTGTGTGTAGATTTTTTAAAAAT...,26,12,0.54,0.60
48,Chr1,240,385,GGTTTTCTTTCCTTCACTTAGCTATGGATGGTTTATCTTCATTTGT...,20,12,0.41,0.60
64,Chr1,320,465,TGGGAATGTGAGTCTCTTATTGTAACCTTAGGGTTGGTTTATCTCA...,23,12,0.48,0.60
...,...,...,...,...,...,...,...,...
23829056,Chr5,26975000,26975145,AAATAAGGTAGAGTGACAATTCTTTCTAATGTTCATTTGAAATAAA...,79,16,1.63,0.80
23829072,Chr5,26975080,26975225,AAAATAAAAATAAAATAAAGTAGAGTGAACCTAACATTTTGGAAGA...,69,24,1.43,1.20
23829088,Chr5,26975160,26975305,CCGACCCGACCACTTAACCACAAAACAATTTCAAAATTATGAAGCA...,73,25,1.51,1.25
23829104,Chr5,26975240,26975385,ATGGGCTTGACCCGCTTGTGCTCACGGCCGGCTCTGGTCTAAACAC...,59,26,1.22,1.30


Get rid of rows with raw_control_coverage < 10 and raw_treatment_coverage < 1


In [None]:
df = df[(df.raw_control_coverage >= 30) & (df.raw_treatment_coverage >= 5)]

In [None]:
df

Unnamed: 0,chromosome,start_coord,end_coord,sequence,raw_control_coverage,raw_treatment_coverage,norm_control_coverage,norm_treatment_coverage
688,Chr1,3440,3585,ATAGTTAATACTACTCGGTTTACTACATGAAATTTCATACCATCAA...,32,5,0.66,0.25
704,Chr1,3520,3665,TAAAACCATACCAATTAAACCGGAGATCCATATTAATTTAATTAAG...,37,10,0.76,0.50
720,Chr1,3600,3745,AAACGCTGACTTCACTGTCTTCCTCCCTCCAAATTATTAGATATAC...,39,13,0.81,0.65
736,Chr1,3680,3825,TACAGATTACAGAGAGCGAGAGAGATCGACGGCGAAGCTCTTTACC...,52,19,1.07,0.95
752,Chr1,3760,3905,TGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCT...,36,11,0.74,0.55
...,...,...,...,...,...,...,...,...
23829056,Chr5,26975000,26975145,AAATAAGGTAGAGTGACAATTCTTTCTAATGTTCATTTGAAATAAA...,79,16,1.63,0.80
23829072,Chr5,26975080,26975225,AAAATAAAAATAAAATAAAGTAGAGTGAACCTAACATTTTGGAAGA...,69,24,1.43,1.20
23829088,Chr5,26975160,26975305,CCGACCCGACCACTTAACCACAAAACAATTTCAAAATTATGAAGCA...,73,25,1.51,1.25
23829104,Chr5,26975240,26975385,ATGGGCTTGACCCGCTTGTGCTCACGGCCGGCTCTGGTCTAAACAC...,59,26,1.22,1.30


Create target column

In [None]:
df["target"] = np.log2(df.norm_control_coverage/df.norm_treatment_coverage)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df

Unnamed: 0,chromosome,start_coord,end_coord,sequence,raw_control_coverage,raw_treatment_coverage,norm_control_coverage,norm_treatment_coverage,target
688,Chr1,3440,3585,ATAGTTAATACTACTCGGTTTACTACATGAAATTTCATACCATCAA...,32,5,0.66,0.25,1.400538
704,Chr1,3520,3665,TAAAACCATACCAATTAAACCGGAGATCCATATTAATTTAATTAAG...,37,10,0.76,0.50,0.604071
720,Chr1,3600,3745,AAACGCTGACTTCACTGTCTTCCTCCCTCCAAATTATTAGATATAC...,39,13,0.81,0.65,0.317482
736,Chr1,3680,3825,TACAGATTACAGAGAGCGAGAGAGATCGACGGCGAAGCTCTTTACC...,52,19,1.07,0.95,0.171611
752,Chr1,3760,3905,TGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCT...,36,11,0.74,0.55,0.428094
...,...,...,...,...,...,...,...,...,...
23829056,Chr5,26975000,26975145,AAATAAGGTAGAGTGACAATTCTTTCTAATGTTCATTTGAAATAAA...,79,16,1.63,0.80,1.026800
23829072,Chr5,26975080,26975225,AAAATAAAAATAAAATAAAGTAGAGTGAACCTAACATTTTGGAAGA...,69,24,1.43,1.20,0.252981
23829088,Chr5,26975160,26975305,CCGACCCGACCACTTAACCACAAAACAATTTCAAAATTATGAAGCA...,73,25,1.51,1.25,0.272620
23829104,Chr5,26975240,26975385,ATGGGCTTGACCCGCTTGTGCTCACGGCCGGCTCTGGTCTAAACAC...,59,26,1.22,1.30,-0.091630


Create set column

In [None]:
# based on the visualization of target values across chromosomes
# we can either pick the train/val chromosomes randomly
# or select the test to be chr2 and the val to be chr4

df["set"] = "train"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
import random

picked = ["Chr2","Chr4"]
random.Random(1202).shuffle(picked)

In [None]:
# set val and test by chromosome we want
df.loc[df.chromosome == picked[0], "set"] = "val"
df.loc[df.chromosome == picked[1], "set"] = "test"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [None]:
picked

['Chr2', 'Chr4']

In [None]:
df.groupby("set").count()

Unnamed: 0_level_0,chromosome,start_coord,end_coord,sequence,raw_control_coverage,raw_treatment_coverage,norm_control_coverage,norm_treatment_coverage,target
set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
test,164452,164452,164452,164452,164452,164452,164452,164452,164452
train,715902,715902,715902,715902,715902,715902,715902,715902,715902
val,170366,170366,170366,170366,170366,170366,170366,170366,170366


In [None]:
df

Unnamed: 0,chromosome,start_coord,end_coord,sequence,raw_control_coverage,raw_treatment_coverage,norm_control_coverage,norm_treatment_coverage,target,set
688,Chr1,3440,3585,ATAGTTAATACTACTCGGTTTACTACATGAAATTTCATACCATCAA...,32,5,0.66,0.25,1.400538,train
704,Chr1,3520,3665,TAAAACCATACCAATTAAACCGGAGATCCATATTAATTTAATTAAG...,37,10,0.76,0.50,0.604071,train
720,Chr1,3600,3745,AAACGCTGACTTCACTGTCTTCCTCCCTCCAAATTATTAGATATAC...,39,13,0.81,0.65,0.317482,train
736,Chr1,3680,3825,TACAGATTACAGAGAGCGAGAGAGATCGACGGCGAAGCTCTTTACC...,52,19,1.07,0.95,0.171611,train
752,Chr1,3760,3905,TGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCT...,36,11,0.74,0.55,0.428094,train
...,...,...,...,...,...,...,...,...,...,...
23829056,Chr5,26975000,26975145,AAATAAGGTAGAGTGACAATTCTTTCTAATGTTCATTTGAAATAAA...,79,16,1.63,0.80,1.026800,train
23829072,Chr5,26975080,26975225,AAAATAAAAATAAAATAAAGTAGAGTGAACCTAACATTTTGGAAGA...,69,24,1.43,1.20,0.252981,train
23829088,Chr5,26975160,26975305,CCGACCCGACCACTTAACCACAAAACAATTTCAAAATTATGAAGCA...,73,25,1.51,1.25,0.272620,train
23829104,Chr5,26975240,26975385,ATGGGCTTGACCCGCTTGTGCTCACGGCCGGCTCTGGTCTAAACAC...,59,26,1.22,1.30,-0.091630,train


In [None]:
# train_out_name = "data/processed/arabidopsis_train.csv"
# test_out_name = "data/processed/arabidopsis_test.csv"
# val_out_name = "data/processed/arabidopsis_val.csv"

# df_train = df[df.set == "train"]
# df_test = df[df.set == "test"]
# df_val = df[df.set == "val"]

# df_train.to_csv(train_out_name, index=False)  # write to file
# df_test.to_csv(test_out_name, index=False)  # write to file
# df_val.to_csv(val_out_name, index=False)  # write to file

In [None]:
df.to_csv(out_file_name, index=False)  # write to file