In [None]:
# Mount Google Drive at /content/drive; every input and output path in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
import os
import shutil

import pandas as pd
from tqdm import tqdm

In [None]:
# Load the tab-separated Hereford gene/protein annotation table from Drive.
f1 = pd.read_csv(
    "/content/drive/MyDrive/Hereford_VarAnalysis/Hereford_raw_data/Hereford_protein.tsv",
    sep="\t",
)

In [None]:
# Show the column names of the annotation table.
f1.columns

Index(['Accession', 'Begin', 'End', 'Chromosome', 'Orientation', 'Name',
       'Symbol', 'Gene ID', 'Gene Type', 'Transcripts accession',
       'Protein accession', 'Protein length', 'Locus tag'],
      dtype='object')

In [None]:
# Distinct values of the "Chromosome" column. The recorded output also lists
# NaN, i.e. some rows carry no chromosome label.
unique_chr=f1["Chromosome"].unique()
unique_chr

array(['MT', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', 'X', 'Y', nan],
      dtype=object)

In [None]:
# Split the annotation table into one tab-separated file per chromosome
# (HF<chromosome>.txt) and copy each file to Drive.
protein_dir = "/content/drive/MyDrive/Hereford_VarAnalysis/Hereford_Protein"

# Make sure the destination is a directory; if it were missing, shutil.copy
# would write every file to a single plain file with that name instead.
os.makedirs(protein_dir, exist_ok=True)

# groupby skips rows whose chromosome is NaN. The previous `== chr` comparison
# could never match NaN, so it only produced an empty "HFnan.txt".
# sort=False keeps the chromosomes in order of first appearance.
for chrom, chr_df in f1.groupby("Chromosome", sort=False):
  # name of the Hereford (HF) file for this chromosome
  out_file = f"HF{chrom}.txt"

  # save this chromosome's rows as a separate file
  chr_df.to_csv(out_file, sep='\t', index=False)

  # copy the file into Drive
  shutil.copy(out_file, protein_dir)

In [None]:
# Load the QTL table for chromosome 1 (tab-separated, Latin-1 encoded).
qtl_df = pd.read_csv(
    '/content/drive/MyDrive/QTL_analysis_051223/QTL_ARS_UCD1_2/qtl_data_Chr.1.txt',
    sep='\t',
    encoding='latin1',
)

In [None]:
# Number of rows and columns in the QTL table.
qtl_df.shape

(3579, 6)

In [None]:
# Preview the first QTL records.
# NOTE(review): the recorded output shows "Chr.2" rows although the cell above
# reads qtl_data_Chr.1.txt — the output looks stale; re-run to confirm.
qtl_df.head()

Unnamed: 0,Chromosome,Source,QTL Class,QTL Span1,QTL Span2,Description
0,Chr.2,Animal QTLdb,Meat_and_Carcass_Association,90562,90566,QTL_ID=225243;Name=Shear force;Abbrev=;PUBMED_...
1,Chr.2,Animal QTLdb,Reproduction_Association,292762,292766,QTL_ID=39029;Name=Pregnancy rate;Abbrev=;PUBME...
2,Chr.2,Animal QTLdb,Reproduction_Association,292762,292766,QTL_ID=39030;Name=Stillbirth;Abbrev=;PUBMED_ID...
3,Chr.2,Animal QTLdb,Production_Association,292762,292766,QTL_ID=39031;Name=PTA type;Abbrev=;PUBMED_ID=2...
4,Chr.2,Animal QTLdb,Exterior_Association,292762,292766,QTL_ID=39032;Name=Udder attachment;Abbrev=;PUB...


In [None]:
# Load the per-chromosome annotation file for chromosome 1 written earlier.
hf_df = pd.read_csv(
    "/content/drive/MyDrive/Hereford_VarAnalysis/Hereford_Protein/HF1.txt",
    sep="\t",
)

In [None]:
# Number of rows and columns in the chromosome 1 annotation table.
hf_df.shape

(1643, 13)

In [None]:
# Add column as start of next gene to find variation in intergenic region
hf_df["start_of_next_gene"] = hf_df["Begin"].shift(-1).fillna(0)

In [None]:
# Shape after adding the next-gene column.
# NOTE(review): the recorded row count differs from the count shown right
# after loading HF1.txt, so this output may come from another run — confirm.
hf_df.shape

(1566, 14)

In [None]:
# Preview the first annotation rows.
# NOTE(review): the recorded output does not show the next-gene column added
# above, so it was probably captured before that cell ran — confirm.
hf_df.head()

Unnamed: 0,Accession,Begin,End,Chromosome,Orientation,Name,Symbol,Gene ID,Gene Type,Transcripts accession,Protein accession,Protein length,Locus tag
0,NC_037328.1,207933,217580,1,minus,ankyrin repeat domain-containing protein 26-like,LOC112447009,112447009,pseudogene,,,,
1,NC_037328.1,228440,283123,1,minus,uncharacterized LOC101903639,LOC101903639,101903639,lncRNA,XR_003035135.2,,,
2,NC_037328.1,284711,290531,1,minus,uncharacterized LOC112447074,LOC112447074,112447074,lncRNA,XR_003035148.2,,,
3,NC_037328.1,328577,347809,1,minus,uncharacterized LOC112447010,LOC112447010,112447010,protein-coding,XM_059890683.1,XP_059746666.1,375.0,
4,NC_037328.1,419848,423189,1,minus,uncharacterized LOC100138661,LOC100138661,100138661,lncRNA,XR_003035143.2,,,


In [None]:
# Chromosomes to process: autosomes 1-29 followed by X.
Chr_list = list(range(1, 30)) + ["X"]

In [None]:
def _first_matching_genes(hf_df, span1, span2):
  """Find the genes related to one QTL interval and name the relationship.

  The relationships are tried in a fixed priority order; only the first one
  that matches at least one gene is reported.

  Args:
    hf_df: gene table with "Begin", "End" and "hfstart_of_next_gene" columns.
    span1: value of the QTL's "QTL Span1" field.
    span2: value of the QTL's "QTL Span2" field.

  Returns:
    (label, boolean mask over the rows of hf_df), or (None, None) when no
    relationship matches any gene.
  """
  begin = hf_df["Begin"]
  end = hf_df["End"]

  # (label, lazy mask builder). The builders are only called until one of them
  # matches, so later (lower-priority) comparisons are skipped.
  checks = (
    # QTL starts exactly at the gene start
    ("at_start", lambda: (begin == span1) & (end > span2)),
    # QTL ends exactly at the gene end
    ("at_end", lambda: (begin < span1) & (end == span2)),
    # QTL lies strictly inside the gene
    ("within_gene", lambda: (begin < span1) & (end > span2)),
    # gene start falls strictly inside the QTL interval
    ("near_start_gene with intergenic", lambda: (begin > span1) & (begin < span2)),
    # gene end falls strictly inside the QTL interval
    ("near_stop_gene with intergenic", lambda: (end > span1) & (end < span2)),
    # QTL ends exactly at the gene start
    ("at_start with intergenic_gene", lambda: (begin > span1) & (begin == span2)),
    # QTL starts exactly at the gene end
    ("at_end with intergenic_gene", lambda: (end == span1) & (end < span2)),
    # QTL lies between this gene's end and the next gene's start
    ("intergenic_gene", lambda: (end < span1) & (hf_df["hfstart_of_next_gene"] > span2)),
  )
  for label, build_mask in checks:
    mask = build_mask()
    if mask.any():
      return label, mask
  return None, None


def hf_qtl(qtl_df, hf_df, chr, out_loc):
  """Annotate genes with overlapping/neighbouring QTLs and write the results.

  For every QTL row, the genes related to its [QTL Span1, QTL Span2] interval
  are looked up (see _first_matching_genes) and tagged with the relationship
  ("qtlgene_type") plus the QTL's class, span and description.

  Four CSV files are written to the working directory and copied to out_loc:
    hf_qtl{chr}.csv     all gene/QTL hits
    hf_qtl_wd{chr}.csv  hits de-duplicated on Symbol, QTL Class and both spans
    hfnon_qtl{chr}.csv  genes whose "Begin" appears in no hit, "QTL Class"="Nil"
    HF_QTL{chr}.csv     de-duplicated hits followed by the non-QTL genes

  Args:
    qtl_df: QTL table with "QTL Class", "QTL Span1", "QTL Span2" and
      "Description" columns.
    hf_df: gene table with "Begin", "End", "Symbol" and
      "hfstart_of_next_gene" columns.
    chr: chromosome label; only used to build the output file names.
    out_loc: destination directory for the copies (created if missing).

  Returns:
    None. The shapes of the four result tables are printed.
  """
  qtl_cols = ["QTL Class", "QTL Span1", "QTL Span2", "Description"]

  # Collect the per-QTL hits and concatenate once at the end; growing a
  # DataFrame with pd.concat inside the loop is quadratic.
  hits = []
  for _, row in tqdm(qtl_df.iterrows()):
    label, mask = _first_matching_genes(hf_df, row["QTL Span1"], row["QTL Span2"])
    if label is None:
      continue
    tmp_df = hf_df[mask].copy(deep=True)
    tmp_df["qtlgene_type"] = label
    for col in qtl_cols:
      tmp_df[col] = row[col]
    hits.append(tmp_df)

  if hits:
    final_df = pd.concat(hits, ignore_index=True)
  else:
    # No QTL matched any gene: keep an empty table with the expected columns.
    final_df = pd.DataFrame(columns=list(hf_df) + ["qtlgene_type"] + qtl_cols)

  # shutil.copy writes to a plain file when the target directory is missing,
  # which would make the four outputs overwrite each other.
  os.makedirs(out_loc, exist_ok=True)

  # Generate output file
  print("hfqtl", final_df.shape)
  out_file=f"hf_qtl{chr}.csv"
  final_df.to_csv(out_file)
  shutil.copy(out_file, out_loc)

  # remove duplicates
  hfqtl_wd = final_df.drop_duplicates(subset=['Symbol', 'QTL Class', 'QTL Span1', "QTL Span2"])
  print("hfqtl_wd", hfqtl_wd.shape)
  out_file1=f"hf_qtl_wd{chr}.csv"
  hfqtl_wd.to_csv(out_file1)
  shutil.copy(out_file1, out_loc)

  # Genes without any QTL hit: their "Begin" never occurs among the hits.
  # A vectorised mask replaces the row-by-row DataFrame.append, which was
  # removed in pandas 2.0.
  has_hit = hf_df["Begin"].isin(final_df["Begin"])
  final_df1 = hf_df[~has_hit].reset_index(drop=True)
  final_df1["QTL Class"]="Nil"

  out_file2=f"hfnon_qtl{chr}.csv"
  final_df1.to_csv(out_file2, index=False)
  shutil.copy(out_file2, out_loc)
  print("hf_nonqtl", final_df1.shape)

  # Merge the QTL and non-QTL genes; columns missing from the non-QTL table
  # are left empty.
  HF_QTL=pd.concat([hfqtl_wd, final_df1])
  print("HF_QTL", HF_QTL.shape)
  out_file3=f"HF_QTL{chr}.csv"
  HF_QTL.to_csv(out_file3, index=False)
  shutil.copy(out_file3, out_loc)


In [None]:
# Chromosome labels driving the batch run below: 1 through 29, then "X".
Chr_list = [*range(1, 30), "X"]

In [None]:
# Run the gene/QTL annotation for every chromosome in Chr_list.
# The loop variable is called `chrom` so the builtin `chr` is not clobbered in
# the notebook namespace.
for chrom in Chr_list:
  in_file1=f"/content/drive/MyDrive/QTL_analysis_051223/QTL_ARS_UCD1_2/qtl_data_Chr.{chrom}.txt"
  in_file2=f"/content/drive/MyDrive/Hereford_VarAnalysis/Hereford_Protein/HF{chrom}.txt"
  out_loc=f"/content/drive/MyDrive/Hereford_VarAnalysis/Hereford_qtl/{chrom}"
  qtl_df=pd.read_csv(in_file1, delimiter="\t", encoding='latin1')
  hf_df=pd.read_csv(in_file2, delimiter="\t")
  # Start of the following gene, used by hf_qtl for the intergenic test.
  # NOTE(review): the last gene gets 0, so a QTL located after the last gene
  # can never satisfy the intergenic condition — confirm this is intended.
  hf_df["hfstart_of_next_gene"] = hf_df["Begin"].shift(-1).fillna(0)
  print("hf_df", hf_df.shape)
  hf_qtl(qtl_df, hf_df, chrom, out_loc)

hf_df (1643, 14)


3579it [00:39, 90.26it/s]


hfqtl (3657, 19)
hfqtl_wd (2441, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1643it [00:03, 487.40it/s]


hf_nonqtl (994, 15)
HF_QTL (3435, 19)
hf_df (1566, 14)


3895it [00:43, 89.48it/s]


hfqtl (3954, 19)
hfqtl_wd (3010, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1566it [00:03, 497.54it/s]


hf_nonqtl (888, 15)
HF_QTL (3898, 19)
hf_df (2303, 14)


3933it [00:43, 90.63it/s]


hfqtl (4094, 19)
hfqtl_wd (2759, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
2303it [00:05, 398.88it/s]


hf_nonqtl (1540, 15)
HF_QTL (4299, 19)
hf_df (1366, 14)


1819it [00:16, 107.36it/s]


hfqtl (1863, 19)
hfqtl_wd (1357, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1366it [00:02, 520.59it/s]


hf_nonqtl (862, 15)
HF_QTL (2219, 19)
hf_df (2116, 14)


8906it [02:00, 74.15it/s]


hfqtl (9041, 19)
hfqtl_wd (6273, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
2116it [00:05, 384.27it/s]


hf_nonqtl (1241, 15)
HF_QTL (7514, 19)
hf_df (1162, 14)


22533it [07:52, 47.74it/s]


hfqtl (22792, 19)
hfqtl_wd (11782, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1162it [00:02, 510.61it/s]


hf_nonqtl (467, 15)
HF_QTL (12249, 19)
hf_df (2175, 14)


3407it [00:36, 94.61it/s]


hfqtl (3495, 19)
hfqtl_wd (2262, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
2175it [00:06, 335.99it/s]


hf_nonqtl (1500, 15)
HF_QTL (3762, 19)
hf_df (1320, 14)


1822it [00:17, 101.75it/s]


hfqtl (1851, 19)
hfqtl_wd (1392, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1320it [00:02, 534.82it/s]


hf_nonqtl (841, 15)
HF_QTL (2233, 19)
hf_df (996, 14)


1996it [00:19, 101.10it/s]


hfqtl (2015, 19)
hfqtl_wd (1387, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
996it [00:01, 543.06it/s]


hf_nonqtl (573, 15)
HF_QTL (1960, 19)
hf_df (1644, 14)


2830it [00:29, 95.32it/s] 


hfqtl (2888, 19)
hfqtl_wd (1955, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1644it [00:03, 488.34it/s]


hf_nonqtl (1050, 15)
HF_QTL (3005, 19)
hf_df (1611, 14)


5506it [01:03, 86.80it/s]


hfqtl (5606, 19)
hfqtl_wd (3769, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1611it [00:02, 552.11it/s]


hf_nonqtl (852, 15)
HF_QTL (4621, 19)
hf_df (821, 14)


1558it [00:15, 102.24it/s]


hfqtl (1579, 19)
hfqtl_wd (1156, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
821it [00:01, 485.38it/s]


hf_nonqtl (495, 15)
HF_QTL (1651, 19)
hf_df (1383, 14)


3341it [00:33, 100.28it/s]


hfqtl (3465, 19)
hfqtl_wd (2065, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1383it [00:03, 347.69it/s]


hf_nonqtl (846, 15)
HF_QTL (2911, 19)
hf_df (909, 14)


20606it [06:31, 52.66it/s]


hfqtl (21177, 19)
hfqtl_wd (8216, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
909it [00:01, 535.37it/s]


hf_nonqtl (274, 15)
HF_QTL (8490, 19)
hf_df (1780, 14)


2419it [00:22, 105.84it/s]


hfqtl (2469, 19)
hfqtl_wd (1806, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1780it [00:04, 379.52it/s]


hf_nonqtl (1220, 15)
HF_QTL (3026, 19)
hf_df (1177, 14)


2712it [00:26, 100.70it/s]


hfqtl (2754, 19)
hfqtl_wd (1832, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1177it [00:02, 518.29it/s]


hf_nonqtl (748, 15)
HF_QTL (2580, 19)
hf_df (1098, 14)


5511it [01:02, 88.72it/s]


hfqtl (5590, 19)
hfqtl_wd (3809, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1098it [00:01, 591.20it/s]


hf_nonqtl (587, 15)
HF_QTL (4396, 19)
hf_df (1909, 14)


2580it [00:26, 98.60it/s] 


hfqtl (2667, 19)
hfqtl_wd (1662, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1909it [00:04, 426.40it/s]


hf_nonqtl (1361, 15)
HF_QTL (3023, 19)
hf_df (1951, 14)


3643it [00:36, 100.19it/s]


hfqtl (3720, 19)
hfqtl_wd (2240, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1951it [00:06, 321.09it/s]


hf_nonqtl (1295, 15)
HF_QTL (3535, 19)
hf_df (710, 14)


6998it [01:23, 83.56it/s]


hfqtl (7174, 19)
hfqtl_wd (3884, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
710it [00:01, 607.35it/s]


hf_nonqtl (314, 15)
HF_QTL (4198, 19)
hf_df (1170, 14)


3482it [00:36, 95.01it/s]


hfqtl (3526, 19)
hfqtl_wd (2669, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1170it [00:02, 443.41it/s]


hf_nonqtl (829, 15)
HF_QTL (3498, 19)
hf_df (894, 14)


1401it [00:13, 107.64it/s]


hfqtl (1450, 19)
hfqtl_wd (1060, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
894it [00:01, 535.40it/s]


hf_nonqtl (577, 15)
HF_QTL (1637, 19)
hf_df (1501, 14)


1977it [00:17, 110.45it/s]


hfqtl (2030, 19)
hfqtl_wd (1493, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1501it [00:04, 372.19it/s]


hf_nonqtl (1047, 15)
HF_QTL (2540, 19)
hf_df (606, 14)


1045it [00:10, 103.75it/s]


hfqtl (1050, 19)
hfqtl_wd (787, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
606it [00:01, 575.99it/s]


hf_nonqtl (362, 15)
HF_QTL (1149, 19)
hf_df (1139, 14)


3298it [00:33, 99.84it/s]


hfqtl (3387, 19)
hfqtl_wd (2645, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1139it [00:02, 511.68it/s]


hf_nonqtl (746, 15)
HF_QTL (3391, 19)
hf_df (711, 14)


13437it [03:28, 64.59it/s]


hfqtl (13733, 19)
hfqtl_wd (4975, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
711it [00:01, 549.74it/s]


hf_nonqtl (299, 15)
HF_QTL (5274, 19)
hf_df (529, 14)


1550it [00:14, 106.56it/s]


hfqtl (1555, 19)
hfqtl_wd (1190, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
529it [00:00, 583.81it/s]


hf_nonqtl (325, 15)
HF_QTL (1515, 19)
hf_df (570, 14)


926it [00:08, 110.95it/s]


hfqtl (932, 19)
hfqtl_wd (675, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
570it [00:01, 359.56it/s]


hf_nonqtl (348, 15)
HF_QTL (1023, 19)
hf_df (1025, 14)


4956it [00:56, 88.27it/s]


hfqtl (5157, 19)
hfqtl_wd (3291, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1025it [00:02, 483.68it/s]


hf_nonqtl (617, 15)
HF_QTL (3908, 19)
hf_df (1783, 14)


19550it [06:23, 51.00it/s]


hfqtl (19877, 19)
hfqtl_wd (8981, 19)


  final_df1 = final_df1.append(row, ignore_index=True)
1783it [00:04, 421.28it/s]


hf_nonqtl (899, 15)
HF_QTL (9880, 19)


In [None]:
# Re-load two chromosome 1 outputs from Drive: the de-duplicated gene/QTL hits
# (f1) and the genes without a QTL hit (f2).
# NOTE(review): `f1` held the raw annotation table earlier in the notebook and
# is overwritten here.
# NOTE(review): these files are read from Hereford_qtl/ directly, whereas the
# batch loop copies its outputs to Hereford_qtl/<chromosome> — confirm the
# actual location.
f1=pd.read_csv("/content/drive/MyDrive/Hereford_VarAnalysis/Hereford_qtl/hf_qtl_wd1.csv")
f2=pd.read_csv("/content/drive/MyDrive/Hereford_VarAnalysis/Hereford_qtl/hfnon_qtl1.csv")

In [None]:
# Shape of the de-duplicated hit table. hf_qtl writes this file with its index,
# which presumably accounts for one extra (unnamed) column on reload.
f1.shape

(2441, 20)

In [None]:
# Mark every gene in the non-QTL table with the placeholder class "Nil".
# hf_qtl sets the same column before saving, so this only re-asserts the value.
f2["QTL Class"]= "Nil"

In [None]:
# Expected number of rows after stacking both tables; computed from the frames
# instead of hard-coding the counts of one particular run.
len(f1) + len(f2)

3435

In [None]:
# Shape of the non-QTL gene table.
f2.shape

(994, 15)

In [None]:
# Empty frame with f1's columns, used as the first element of the concat below.
hfqtl=pd.DataFrame(columns=list(f1))

In [None]:
# Stack the QTL hits (f1) and the non-QTL genes (f2) under the template's
# columns; columns that f2 lacks are filled with NaN for its rows.
hfqtl=pd.concat([hfqtl, f1, f2])

In [None]:
# Shape of the combined table; the row count should equal the sum computed above.
hfqtl.shape

(3435, 20)

In [None]:
# Write the combined chromosome 1 table locally (index included), then copy it
# to the Hereford_qtl folder on Drive.
hfqtl.to_csv("HFQTL1.csv")
shutil.copy("HFQTL1.csv", "/content/drive/MyDrive/Hereford_VarAnalysis/Hereford_qtl")

'/content/drive/MyDrive/Hereford_VarAnalysis/Hereford_qtl/HFQTL1.csv'