In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [7]:

# Sample data for dataset1 as a dataframe
data1 = {
    'Column1': ['A', 'B', 'C', 'D', 'E'],
    'Column2': ['F', 'G', 'H', 'I', 'J'],
    'Column3': ['K', 'L', 'M', 'N', 'O'],
    'Column4': ['P', 'Q', 'R', 'S', 'T'],
    'Column5': [None, 'U', 'V', 'W', 'X'],
    'Column6': ['Y', 'Z', None, 'A', 'B']
}

# Sample data for dataset2 as a dataframe
data2 = {
    'Selected_Genes': ['A,B,C', 'X,Y,Z', 'M,N,O', 'P,Q,R', 'G,K']
}

# Create dataframes for dataset1 and dataset2
dataset1 = pd.DataFrame(data1)
dataset2 = pd.DataFrame(data2)
dataset2

Unnamed: 0,Selected_Genes
0,"A,B,C"
1,"X,Y,Z"
2,"M,N,O"
3,"P,Q,R"
4,"G,K"


In [5]:
import pandas as pd

# Sample data for dataset1 as a dataframe
data1 = {
    'Column1': ['A', 'B', 'C', 'D', 'E'],
    'Column2': ['F', 'G', 'H', 'I', 'J'],
    'Column3': ['K', 'L', 'M', 'N', 'O'],
    'Column4': ['P', 'Q', 'R', 'S', 'T'],
    'Column5': [None, 'U', 'V', 'W', 'X'],
    'Column6': ['Y', 'Z', None, 'A', 'B']
}

# Sample data for dataset2 as a dataframe
data2 = {
    'Selected_Genes': ['A,B,C', 'X,Y,Z', 'M,N,O', 'P,Q,R', 'G,K']
}

# Create dataframes for dataset1 and dataset2
dataset1 = pd.DataFrame(data1)
dataset2 = pd.DataFrame(data2)

# Split the selected genes in dataset2 into a list
dataset2['Selected_Genes'] = dataset2['Selected_Genes'].str.split(',')

# Function to check if any gene in dataset2 matches with dataset1
def match_genes(row):
    for gene in row['Selected_Genes']:
        for col in dataset1.columns:
            if pd.notna(row[col]) and gene == row[col]:
                return True
    return False

# Apply the function to filter the rows
filtered_rows = dataset2[dataset2.apply(match_genes, axis=1)]
filtered_rows

KeyError: 'Column1'

In [None]:
import pandas as pd

# Function to read BED file and store genomic regions
def read_bed_file(bed_file):
    regions = {}
    with open(bed_file, 'r') as bed:
        for line in bed:
            chrom, start, end = line.strip().split('\t')
            start, end = int(start), int(end)
            if chrom not in regions:
                regions[chrom] = set()
            for pos in range(start, end + 1):
                regions[chrom].add(pos)
    return regions

# Read the BED file and store genomic regions
bed_file = 'KAPA_HyperExome_hg38_primary_targets_extended.bed'
regions = read_bed_file(bed_file)

# List of VCF files for father, mother, and son
vcf_files = ['KHAIGPRX5_final_father.vcf', 'KHAIGPRX4_RR_final_mother.vcf', 'KHAIGPRX6_final_son.vcf']

for vcf_file in vcf_files:
    # Create an empty list to store the matching variants for each family member
    matching_variants = []
    header_lines = []  # Store header lines from the VCF file

    with open(vcf_file, 'r') as vcf:
        for line in vcf:
            if line.startswith('#'):
                header_lines.append(line)  # Capture header lines
                continue
            columns = line.strip().split('\t')
            chrom, position = columns[0], int(columns[1])
            if chrom in regions and position in regions[chrom]:
                matching_variants.append(columns)

    # Write the matching variants to an output VCF file
    output_filename = f'matching_positions_{vcf_file.split("_")[2]}.vcf'
    
    # Write the header lines first
    with open(output_filename, 'w') as output_vcf:
        for header_line in header_lines:
            output_vcf.write(header_line)
    
    # Write the matching variants
    with open(output_filename, 'a') as output_vcf:
        for variant in matching_variants:
            output_vcf.write('\t'.join(variant) + '\n')

In [44]:
def read_bed_file(bed_file):
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                chrom = fields[0]
                try:
                    start = int(fields[1])
                    end = int(fields[2])
                except ValueError:
                    continue  # Skip this line if start or end position is not an integer
                for pos in range(start, end + 1):
                    bed_positions.add((chrom, pos))
    return bed_positions

def normalize_chrom_name(chrom):
    return chrom.split('_')[0]

def filter_vcf_file(vcf_file, bed_positions):
    filtered_vcf_records = []
    with open(vcf_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Preserve header lines in the output
                filtered_vcf_records.append(line)
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 2:
                raw_chrom = fields[0]
                chrom = normalize_chrom_name(raw_chrom)
                try:
                    pos = int(fields[1])
                except ValueError:
                    continue  # Skip this line if 'POS' is not an integer
                if (chrom, pos) in bed_positions:
                    filtered_vcf_records.append(line)
    return filtered_vcf_records

def write_filtered_vcf(filtered_vcf_records, output_file):
    with open(output_file, 'w') as f:
        for record in filtered_vcf_records:
            f.write(record)

def main():
    bed_file = r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/srinivas_sir_covered.bed'
    vcf_file = r'C:/Users/GenepoweRx_Madhu/Downloads/sen_spe_files_07_09_2023/VCC_input_files/new_input_sample/new_bwa_files/12652712_HAPLOTYPECALLER.vcf'
    output_file = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/VCC_NEW/12652712_HAPLOTYPECALLER.vcf'

    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()

In [17]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/sen_spe_files_07_09_2023/VCC_input_files/new_input_sample/new_bwa_files/12652712_BCFTOOL.vcf', comment= '#', sep = '\t', header=None, low_memory=False, encoding='latin-1')
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,69511,rs2691305,A,G,225.4170,.,"DP=249;ADF=0,168;ADR=0,53;AD=0,221;DPR=0,221;S...",GT:PL:DP:DV:SP:DP4:ADF:ADR:AD:DPR:SCR,"1/1:255,255,0:221:221:0:0,0,168,53:0,168:0,53:..."
1,chr1,785910,rs12565286,G,C,115.8900,.,"DP=83;ADF=51,27;ADR=2,0;AD=53,27;DPR=53,27;SCR...",GT:PL:DP:DV:SP:DP4:ADF:ADR:AD:DPR:SCR,"0/1:149,0,244:80:27:3:51,2,27,0:51,27:2,0:53,2..."
2,chr1,786070,rs2977675,G,A,172.8870,.,"DP=226;ADF=87,25;ADR=51,37;AD=138,62;DPR=138,6...",GT:PL:DP:DV:SP:DP4:ADF:ADR:AD:DPR:SCR,"0/1:207,0,255:200:62:25:87,51,25,37:87,25:51,3..."
3,chr1,786377,rs189147642,T,A,35.6919,.,"DP=119;ADF=43,11;ADR=49,10;AD=92,21;DPR=92,21;...",GT:PL:DP:DV:SP:DP4:ADF:ADR:AD:DPR:SCR,"0/1:71,0,255:113:21:1:43,49,11,10:43,11:49,10:..."
4,chr1,817416,rs148649543,C,T,212.0630,.,"DP=235;ADF=133,55;ADR=8,9;AD=141,64;DPR=141,64...",GT:PL:DP:DV:SP:DP4:ADF:ADR:AD:DPR:SCR,"0/1:245,0,255:205:64:13:133,8,55,9:133,55:8,9:..."
...,...,...,...,...,...,...,...,...,...,...
82677,chrY,11986256,rs9650860,T,C,65.4148,.,"DP=10;ADF=0,9;ADR=0,0;AD=0,9;DPR=0,9;SCR=0;VDB...",GT:PL:DP:DV:SP:DP4:ADF:ADR:AD:DPR:SCR,"1/1:95,27,0:9:9:0:0,0,9,0:0,9:0,0:0,9:0,9:0"
82678,chrY,11986362,rs9650861,A,C,107.4150,.,"DP=38;ADF=0,35;ADR=0,0;AD=0,35;DPR=0,35;SCR=0;...",GT:PL:DP:DV:SP:DP4:ADF:ADR:AD:DPR:SCR,"1/1:137,105,0:35:35:0:0,0,35,0:0,35:0,0:0,35:0..."
82679,chrY,11986608,rs879016544,T,C,110.4150,.,"DP=33;ADF=0,0;ADR=0,30;AD=0,30;DPR=0,30;SCR=1;...",GT:PL:DP:DV:SP:DP4:ADF:ADR:AD:DPR:SCR,"1/1:140,90,0:30:30:0:0,0,0,30:0,0:0,30:0,30:0,..."
82680,chrY,11986732,rs867343413,C,T,168.4160,.,"DP=19;ADF=0,3;ADR=0,14;AD=0,17;DPR=0,17;SCR=0;...",GT:PL:DP:DV:SP:DP4:ADF:ADR:AD:DPR:SCR,"1/1:198,51,0:17:17:0:0,0,3,14:0,3:0,14:0,17:0,..."


In [18]:
vcf.ALT.value_counts()

G      21020
C      20713
A      20548
T      20379
C,G        3
C,T        3
G,A        3
A,C        2
T,C        2
T,A        2
G,T        2
G,C        2
A,G        1
A,T        1
C,A        1
Name: ALT, dtype: int64

In [8]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Covered_father_new.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']
vcf = pd.concat([vcf, sample_cols], axis=1)
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')
vcf['gnomADg_AF'] = vcf['CSQ'].str.split('|').str[57].replace('', 0)
vcf['gnomADg_SAS_AF'] = vcf['CSQ'].str.split('|').str[67].replace('', 0)
vcf = vcf[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR', 'DP', 'gnomADg_AF', 'gnomADg_SAS_AF']]
vcf.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Father_data_columns.xlsx', index = False)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,DP,gnomADg_AF,gnomADg_SAS_AF
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=140;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,181,140,88,52,37.14%,6.3743E-19,59,49,77,11,43,9,140,0.6291,0.8855
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=147;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,147,1,146,99.32%,1.0001E-85,37,55,1,0,115,31,147,0.846,0.9772
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=137;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0...",0/1,127,137,99,38,27.74%,1.8385E-13,56,50,63,36,27,11,137,0.4864,0.622
3,chr1,924533,rs112703963,A,G,.,PASS,"ADP=93;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,255,93,0,93,100%,1.7451E-55,0,54,0,0,70,23,93,0.8215,0.9098
4,chr1,942451,rs6672356,T,C,.,PASS,"ADP=80;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",1/1,255,80,0,80,100%,1.0864E-47,0,51,0,0,54,26,80,0.9998,0.9996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31681,chrX,155383098,.,T,C,.,PASS,ADP=87;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,95,87,59,28,32.18%,2.7793E-10,52,61,46,13,24,4,87,0,0
31682,chrX,156010159,rs200413398,A,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.858,0.1...",0/1,85,33,11,21,65.62%,3.1387E-9,56,52,8,3,17,4,32,0.1656,0.2871
31683,chrX,156010162,rs150178903,A,G,.,PASS,"ADP=33;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8956,0....",0/1,78,34,13,20,60.61%,1.4096E-8,53,54,9,4,16,4,33,0.1527,0.2672
31684,chrY,3019783,rs9786184,A,C,.,PASS,"ADP=39;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1744,0....",1/1,224,39,0,39,100%,3.6742E-23,0,63,0,0,33,6,39,0.658,0.9843


In [9]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Covered_mother_new.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']
vcf = pd.concat([vcf, sample_cols], axis=1)
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')
vcf['gnomADg_AF'] = vcf['CSQ'].str.split('|').str[57].replace('', 0)
vcf['gnomADg_SAS_AF'] = vcf['CSQ'].str.split('|').str[67].replace('', 0)
vcf = vcf[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR', 'DP', 'gnomADg_AF', 'gnomADg_SAS_AF']]
vcf.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Mother_data_columns.xlsx', index = False)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,DP,gnomADg_AF,gnomADg_SAS_AF
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=44;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEIN...,0/1,112,44,16,28,63.64%,5.6997E-12,66,47,15,1,22,6,44,0.6291,0.8855
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=133;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,133,0,133,100%,1.7256E-79,0,55,0,0,108,25,133,0.846,0.9772
2,chr1,69761,rs200505207,A,T,.,PASS,"ADP=66;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9633,0....",0/1,60,66,48,18,27.27%,9.974E-7,57,61,35,13,14,4,66,0.06088,0.04502
3,chr1,69897,rs200676709,T,C,.,PASS,"ADP=46;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,59,46,29,17,36.96%,1.2413E-6,62,59,22,7,15,2,46,0.4864,0.622
4,chr1,924533,rs112703963,A,G,.,PASS,"ADP=40;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,214,40,1,39,97.5%,3.8137E-22,62,56,1,0,30,9,40,0.8215,0.9098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29803,chrX,154653251,rs17328091,C,G,.,PASS,"ADP=41;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.4906,0....",0/1,51,41,26,15,36.59%,6.364E-6,44,52,26,0,15,0,41,0.4953,0.4208
29804,chrX,154653499,rs17855367,C,T,.,PASS,"ADP=72;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5428,.,...",0/1,78,72,49,23,31.94%,1.4681E-8,61,46,37,12,8,15,72,0.4428,0.3835
29805,chrX,154766321,rs2728532,G,T,.,PASS,"ADP=44;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.007417,...",1/1,248,44,0,43,97.73%,1.5066E-25,0,55,0,0,33,10,44,0.9924,1
29806,chrX,154792236,rs782318569,C,T,.,PASS,"ADP=47;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9995,0....",0/1,120,47,17,30,63.83%,8.4848E-13,47,48,13,4,19,11,47,5.344e-05,0.001475


In [10]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Covered_son_new.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']
vcf = pd.concat([vcf, sample_cols], axis=1)
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')
vcf['gnomADg_AF'] = vcf['CSQ'].str.split('|').str[57].replace('', 0)
vcf['gnomADg_SAS_AF'] = vcf['CSQ'].str.split('|').str[67].replace('', 0)
vcf = vcf[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR', 'DP', 'gnomADg_AF', 'gnomADg_SAS_AF']]
vcf.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Son_data_columns.xlsx', index = False)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,DP,gnomADg_AF,gnomADg_SAS_AF
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=104;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,255,104,30,74,71.15%,3.2093E-32,58,49,21,9,59,15,104,0.6291,0.8855
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=205;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,205,0,205,100%,9.6033E-123,0,54,0,0,145,60,205,0.846,0.9772
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=93;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,255,93,28,65,69.89%,4.0051E-28,53,51,15,13,49,16,93,0.4864,0.622
3,chr1,924533,rs112703963,A,G,.,PASS,"ADP=81;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,255,81,0,80,98.77%,1.0864E-47,0,52,0,0,48,32,81,0.8215,0.9098
4,chr1,942451,rs6672356,T,C,.,PASS,"ADP=98;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",1/1,255,98,0,98,100%,1.7493E-58,0,48,0,0,66,32,98,0.9998,0.9996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32709,chrX,155277884,rs559165,G,T,.,PASS,"ADP=28;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6458,0....",1/1,158,28,0,28,100%,1.3074E-16,0,54,0,0,25,3,28,0.6616,0.7176
32710,chrX,156003433,rs2037999,T,C,.,PASS,"ADP=9;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.4079,0.5...",0/1,23,9,3,6,66.67%,4.5249E-3,61,67,3,0,6,0,9,0.5073,0.5465
32711,chrY,3019783,rs9786184,A,C,.,PASS,"ADP=42;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1744,0....",1/1,242,42,0,42,100%,5.9562E-25,0,54,0,0,31,11,42,0.658,0.9843
32712,chrY,12914512,rs2032624,C,A,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.7575,0....",1/1,99,18,0,18,100%,1.1019E-10,0,56,0,0,15,3,18,0.6,0.6547


In [8]:
import pandas as pd

# Creating a sample DataFrame with duplicate column names
data = {'Header1': ['kavya', 'rishika', 'shyam'],
        'Header1': ['A', 'B', 'C'],
        'Header2': ['bob', 'dc', 'ad'],
        'Header2': ['X', 'Y', 'Z']}
df = pd.DataFrame(data)

# Displaying the original DataFrame
print("Original DataFrame:")
print(df)

# Concatenating the header names with their respective first row values and updating the DataFrame
new_columns = []
for col in df.columns:
    header_name = col
    first_row_value = df.loc[0, col]
    new_name = f"{header_name}_{first_row_value}"
    
    # Handle duplicates by adding an index
    index = 1
    while new_name in new_columns:
        index += 1
        new_name = f"{header_name}_{first_row_value}_{index}"

    new_columns.append(new_name)
    df.rename(columns={col: new_name}, inplace=True)

# Deleting the first row
df = df.drop(0)

# Displaying the DataFrame in the desired format
print("\nDataFrame in the desired format:")
print(df.to_string(index=False))
df

Original DataFrame:
  Header1 Header2
0       A       X
1       B       Y
2       C       Z

DataFrame in the desired format:
Header1_A Header2_X
        B         Y
        C         Z


Unnamed: 0,Header1_A,Header2_X
1,B,Y
2,C,Z


In [10]:
import pandas as pd

# Creating a sample DataFrame with duplicate column names
data = [
    {'Header1': 'kavya', 'Header2': 'bob', 'Header3': 'X'},
    {'Header1': 'A', 'Header2': 'Y', 'Header3': 'Z'},
    {'Header1': 'rishika', 'Header2': 'dc', 'Header3': 'Y'},
    {'Header1': 'B', 'Header2': 'Z', 'Header3': 'X'},
    {'Header1': 'shyam', 'Header2': 'ad', 'Header3': 'Z'},
    {'Header1': 'C', 'Header2': 'X', 'Header3': 'Y'}
]

df = pd.DataFrame(data)

# Displaying the original DataFrame
print("Original DataFrame:")
# Concatenating the header names with their respective first row values and updating the DataFrame
new_columns = []
for col in df.columns:
    header_name = col
    first_row_value = df.loc[0, col]
    new_name = f"{header_name}_{first_row_value}"
    
    # Handle duplicates by adding an index
    index = 1
    while new_name in new_columns:
        index += 1
        new_name = f"{header_name}_{first_row_value}_{index}"

    new_columns.append(new_name)
    df.rename(columns={col: new_name}, inplace=True)

# Deleting the first row
df = df.drop(0)

# Displaying the DataFrame in the desired format
print("\nDataFrame in the desired format:")
print(df.to_string(index=False))
df

Original DataFrame:

DataFrame in the desired format:
Header1_kavya Header2_bob Header3_X
            A           Y         Z
      rishika          dc         Y
            B           Z         X
        shyam          ad         Z
            C           X         Y


Unnamed: 0,Header1_kavya,Header2_bob,Header3_X
1,A,Y,Z
2,rishika,dc,Y
3,B,Z,X
4,shyam,ad,Z
5,C,X,Y
