In [None]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [6]:
################################# Import the required Libraries ############################################################

import numpy as np
import pandas as pd
import sys
import re
import os

################################### Import the VCF data and assign the column names ########################################

vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/KHHSPTGPONC63_Mutect2_noindels.vcf', comment= '#', sep = '\t', header=None, low_memory=False, encoding='latin-1')
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']

################################### Extract the Zygosity of the variants from the Sample column of the vcf file ############

vcf['Zygosity'] = vcf['SAMPLE'].str.split(':').str[0]
vcf['DP'] = vcf['SAMPLE'].str.split(':').str[3].astype('int')
#vcf = vcf[vcf['DP'] >= 5]

# Iterate through each row in the DataFrame
for index, row in vcf.iterrows():
    # Check if the value is '1/1', if yes, set it to 'Homozygous', else set it to 'Heterozygous'
    vcf.at[index, 'Zygosity'] = 'Homozygous' if row['Zygosity'] == '1/1' else 'Heterozygous'
    
###################### Extract the Gene Name from the INFO column (CSQ) ###################################################

# Define a regular expression to extract the gene name from the CSQ field
gene_regex = r'([\w.-]+)\|(ENSG\d+|ENSG-\d+[A-Z]?\.\d+)\|'

# Define a function to extract the gene name(s) from the INFO column
def extract_gene_name(info):
    csq_field = [field for field in info.split(';') if field.startswith('CSQ=')]
    if csq_field:
        csq_field = csq_field[0]
        matches = re.findall(gene_regex, csq_field)
        if len(set(matches)) > 1:
            return ','.join(set([match[0] for match in matches]))
        elif matches:
            return matches[0][0]
    return 'NA'

vcf['Gene Name'] = vcf['INFO'].apply(extract_gene_name)

################################## Create the DateFrame with KDR gene in the Gene Name column #############################

data = {'Gene Name': ['CIDEB']}
df_gene = pd.DataFrame(data)

################################# Map the KDR gene with the extracted gene column in the DataFrame ########################

vcf['Gene_Match'] = 'No'
# Iterate through each gene in vcf
for genes in vcf['Gene Name']:
    if isinstance(genes, str):  # Check if the gene value is a non-null string
        gene_list = genes.split(',')  # Split the genes by comma to create a list
        match = any(gene in df_gene['Gene Name'].values for gene in gene_list) # Check if any gene in the list exists in df2
        if match:
            vcf.loc[vcf['Gene Name'] == genes, 'Gene_Match'] = 'Yes'
            
vcf = vcf[vcf['Gene_Match'] == 'Yes']

###################### Finally pool the required columns from the vcf file and explort the dataframe as excel file #######

vcf = vcf[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'Gene Name', 'Zygosity', 'DP']]
#vcf.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/KHAIGHGP0NC8_Haplotype_noindels_updated.xlsx', index=False)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene Name,Zygosity,DP
326917,chr14,24301931,.,G,A,"DHRS1,CIDEB,LTB4R2,NOP9",Heterozygous,2
326918,chr14,24302473,.,T,A,"DHRS1,CIDEB,LTB4R2,NOP9",Heterozygous,1
326919,chr14,24302637,.,A,G,"DHRS1,CIDEB,LTB4R2,NOP9",Heterozygous,2
326920,chr14,24302733,.,T,C,"DHRS1,CIDEB,LTB4R2,NOP9",Heterozygous,1
326921,chr14,24306640,.,C,T,"LTB4R2,CIDEB,LTB4R,NOP9",Heterozygous,3
326922,chr14,24308205,.,A,C,"LTB4R2,CIDEB,LTB4R,NOP9",Heterozygous,5
326923,chr14,24308220,.,T,A,"LTB4R2,CIDEB,LTB4R,NOP9",Heterozygous,4
326924,chr14,24312106,.,C,T,"LTB4R2,CIDEB,LTB4R,NOP9",Heterozygous,2
326925,chr14,24312124,.,G,A,"LTB4R2,CIDEB,LTB4R,NOP9",Heterozygous,2
326926,chr14,24314911,.,C,T,"LTB4R,CIDEB,ADCY4,LTB4R2",Heterozygous,1


In [7]:
vcf.shape

(10, 8)

In [15]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/MAF_KDR Known variants_287.xlsx')
df

Unnamed: 0,VARIANT,HOM,HET,COUNT,HOM WILD,Ref_Allele_count,Alt_Allele_count,Totaol_Allele_count,Ref_Allele_Freq,KH_MAF,Prevalence,CHROM,POS,REF,ALT,Ncid
0,chr4_55076834_T_C,1.0,0,1,113,226,2,228,0.991228,0.008772,0.877193,chr4,55076834,T,C,NC_000004.12:g.55076834T>C
1,chr4_55077777_T_G,1.0,0,1,113,226,2,228,0.991228,0.008772,0.877193,chr4,55077777,T,G,NC_000004.12:g.55077777T>G
2,chr4_55077783_G_A,1.0,0,1,113,226,2,228,0.991228,0.008772,0.877193,chr4,55077783,G,A,NC_000004.12:g.55077783G>A
3,chr4_55077786_T_G,1.0,0,1,113,226,2,228,0.991228,0.008772,0.877193,chr4,55077786,T,G,NC_000004.12:g.55077786T>G
4,chr4_55077790_T_G,1.0,0,1,113,226,2,228,0.991228,0.008772,0.877193,chr4,55077790,T,G,NC_000004.12:g.55077790T>G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,chr4_55127139_G_A,23.0,0,23,91,182,46,228,0.798246,0.201754,20.175439,chr4,55127139,G,A,NC_000004.12:g.55127139G>A
283,chr4_55127146_T_A,1.0,0,1,113,226,2,228,0.991228,0.008772,0.877193,chr4,55127146,T,A,NC_000004.12:g.55127146T>A
284,chr4_55127162_CC_GG,1.0,0,1,113,226,2,228,0.991228,0.008772,0.877193,chr4,55127162,CC,GG,NC_000004.12:g.55127162CC>GG
285,chr4_55127210_T_A,1.0,0,1,113,226,2,228,0.991228,0.008772,0.877193,chr4,55127210,T,A,NC_000004.12:g.55127210T>A


In [14]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/263_KDR.vcf', comment= '#', sep = '\t', header=None, low_memory=False, encoding='latin-1')
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
vcf['CHROM'] = 'chr' + vcf['CHROM'].astype(str)
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')
vcf['Consequence'] = vcf['CSQ'].str.split('|').str[1]
vcf['IMPACT'] = vcf['CSQ'].str.split('|').str[2]
vcf['EXON'] = vcf['CSQ'].str.split('|').str[8]
vcf['INTRON'] = vcf['CSQ'].str.split('|').str[9]
vcf['PolyPhen'] = vcf['CSQ'].str.split('|').str[30]
vcf['SIFT'] = vcf['CSQ'].str.split('|').str[29]
vcf = vcf[['CHROM', 'POS', 'REF', 'ALT', 'Consequence', 'IMPACT', 'EXON', 'INTRON', 'PolyPhen', 'SIFT']]
vcf

Unnamed: 0,CHROM,POS,REF,ALT,Consequence,IMPACT,EXON,INTRON,PolyPhen,SIFT
0,chr4,55076834,T,C,intron_variant&non_coding_transcript_variant,MODIFIER,,3/3,,
1,chr4,55077777,T,G,intron_variant&non_coding_transcript_variant,MODIFIER,,3/3,,
2,chr4,55077783,G,A,intron_variant&non_coding_transcript_variant,MODIFIER,,3/3,,
3,chr4,55077786,T,G,intron_variant&non_coding_transcript_variant,MODIFIER,,3/3,,
4,chr4,55077790,T,G,intron_variant&non_coding_transcript_variant,MODIFIER,,3/3,,
...,...,...,...,...,...,...,...,...,...,...
258,chr4,55127121,G,T,intergenic_variant,MODIFIER,,,,
259,chr4,55127139,G,A,intergenic_variant,MODIFIER,,,,
260,chr4,55127146,T,A,intergenic_variant,MODIFIER,,,,
261,chr4,55127210,T,A,intergenic_variant,MODIFIER,,,,


In [16]:
merged = pd.merge(df, vcf, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'left', sort=False)
merged

Unnamed: 0,VARIANT,HOM,HET,COUNT,HOM WILD,Ref_Allele_count,Alt_Allele_count,Totaol_Allele_count,Ref_Allele_Freq,KH_MAF,...,POS,REF,ALT,Ncid,Consequence,IMPACT,EXON,INTRON,PolyPhen,SIFT
0,chr4_55076834_T_C,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55076834,T,C,NC_000004.12:g.55076834T>C,intron_variant&non_coding_transcript_variant,MODIFIER,,3/3,,
1,chr4_55077777_T_G,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55077777,T,G,NC_000004.12:g.55077777T>G,intron_variant&non_coding_transcript_variant,MODIFIER,,3/3,,
2,chr4_55077783_G_A,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55077783,G,A,NC_000004.12:g.55077783G>A,intron_variant&non_coding_transcript_variant,MODIFIER,,3/3,,
3,chr4_55077786_T_G,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55077786,T,G,NC_000004.12:g.55077786T>G,intron_variant&non_coding_transcript_variant,MODIFIER,,3/3,,
4,chr4_55077790_T_G,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55077790,T,G,NC_000004.12:g.55077790T>G,intron_variant&non_coding_transcript_variant,MODIFIER,,3/3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,chr4_55127139_G_A,23.0,0,23,91,182,46,228,0.798246,0.201754,...,55127139,G,A,NC_000004.12:g.55127139G>A,intergenic_variant,MODIFIER,,,,
283,chr4_55127146_T_A,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55127146,T,A,NC_000004.12:g.55127146T>A,intergenic_variant,MODIFIER,,,,
284,chr4_55127162_CC_GG,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55127162,CC,GG,NC_000004.12:g.55127162CC>GG,,,,,,
285,chr4_55127210_T_A,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55127210,T,A,NC_000004.12:g.55127210T>A,intergenic_variant,MODIFIER,,,,


In [19]:
merged.fillna('-', inplace=True)
merged.replace('', '-', inplace=True)
merged

Unnamed: 0,VARIANT,HOM,HET,COUNT,HOM WILD,Ref_Allele_count,Alt_Allele_count,Totaol_Allele_count,Ref_Allele_Freq,KH_MAF,...,POS,REF,ALT,Ncid,Consequence,IMPACT,EXON,INTRON,PolyPhen,SIFT
0,chr4_55076834_T_C,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55076834,T,C,NC_000004.12:g.55076834T>C,intron_variant&non_coding_transcript_variant,MODIFIER,-,3/3,-,-
1,chr4_55077777_T_G,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55077777,T,G,NC_000004.12:g.55077777T>G,intron_variant&non_coding_transcript_variant,MODIFIER,-,3/3,-,-
2,chr4_55077783_G_A,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55077783,G,A,NC_000004.12:g.55077783G>A,intron_variant&non_coding_transcript_variant,MODIFIER,-,3/3,-,-
3,chr4_55077786_T_G,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55077786,T,G,NC_000004.12:g.55077786T>G,intron_variant&non_coding_transcript_variant,MODIFIER,-,3/3,-,-
4,chr4_55077790_T_G,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55077790,T,G,NC_000004.12:g.55077790T>G,intron_variant&non_coding_transcript_variant,MODIFIER,-,3/3,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,chr4_55127139_G_A,23.0,0,23,91,182,46,228,0.798246,0.201754,...,55127139,G,A,NC_000004.12:g.55127139G>A,intergenic_variant,MODIFIER,-,-,-,-
283,chr4_55127146_T_A,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55127146,T,A,NC_000004.12:g.55127146T>A,intergenic_variant,MODIFIER,-,-,-,-
284,chr4_55127162_CC_GG,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55127162,CC,GG,NC_000004.12:g.55127162CC>GG,-,-,-,-,-,-
285,chr4_55127210_T_A,1.0,0,1,113,226,2,228,0.991228,0.008772,...,55127210,T,A,NC_000004.12:g.55127210T>A,intergenic_variant,MODIFIER,-,-,-,-


In [20]:
merged.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/287_rsID_CON_IMP.xlsx', index=False)

In [41]:
import os
import pandas as pd

# Set the path to the folder containing your Excel files
folder_path = r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/KDR_112_samples_Mutect2_outputs/Mutect2_outputs/'

# Initialize an empty list to store individual DataFrames
dfs = []

# Iterate over each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".xlsx"):
        file_path = os.path.join(folder_path, filename)
        
        # Read the Excel file into a DataFrame
        df = pd.read_excel(file_path)
        
        # Extract the "Sample" value from the base name of the file
        sample_name = os.path.splitext(filename)[0]
        
        # Add a new column "Sample" with the extracted value
        df['Sample'] = sample_name
                
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames in the list
concatenated_data = pd.concat(dfs, ignore_index=True)

# Save the final DataFrame to a new Excel file
output_file_path = r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/KDR_112_samples_Mutect2_outputs/Mutect2_outputs/All_MUTECT2_concatenated_data.xlsx'
concatenated_data.to_excel(output_file_path, index=False)

# Print the concatenated DataFrame
concatenated_data

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene Name,Zygosity,DP,Sample
0,chr4,55079914,.,A,G,KDR,Heterozygous,53,KHAIGHGP0NC8_Mutect2_output
1,chr4,55081941,.,A,G,KDR,Heterozygous,53,KHAIGHGP0NC8_Mutect2_output
2,chr4,55089065,.,T,C,KDR,Heterozygous,24,KHAIGHGP0NC8_Mutect2_output
3,chr4,55101886,.,A,C,KDR,Heterozygous,160,KHAIGHGP0NC8_Mutect2_output
4,chr4,55105679,.,G,T,KDR,Heterozygous,6,KHAIGHGP0NC8_Mutect2_output
...,...,...,...,...,...,...,...,...,...
1533,chr4,55118982,.,A,G,KDR,Heterozygous,32,KHULTRGPONC1_Mutect2_output
1534,chr4,55119042,.,C,G,KDR,Heterozygous,17,KHULTRGPONC1_Mutect2_output
1535,chr4,55120835,.,G,T,KDR,Heterozygous,14,KHULTRGPONC1_Mutect2_output
1536,chr4,55121546,.,T,C,KDR,Heterozygous,6,KHULTRGPONC1_Mutect2_output


In [44]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/KDR_112_samples_Mutect2_outputs/Mutect2_outputs/All_MUTECT2_concatenated_data.xlsx')
df

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene Name,Zygosity,DP,Sample,Variant
0,chr4,55079914,.,A,G,KDR,Heterozygous,53,KHAIGHGP0NC8,chr4_55079914_A_G
1,chr4,55081941,.,A,G,KDR,Heterozygous,53,KHAIGHGP0NC8,chr4_55081941_A_G
2,chr4,55089065,.,T,C,KDR,Heterozygous,24,KHAIGHGP0NC8,chr4_55089065_T_C
3,chr4,55101886,.,A,C,KDR,Heterozygous,160,KHAIGHGP0NC8,chr4_55101886_A_C
4,chr4,55105679,.,G,T,KDR,Heterozygous,6,KHAIGHGP0NC8,chr4_55105679_G_T
...,...,...,...,...,...,...,...,...,...,...
1533,chr4,55118982,.,A,G,KDR,Heterozygous,32,KHULTRGPONC1,chr4_55118982_A_G
1534,chr4,55119042,.,C,G,KDR,Heterozygous,17,KHULTRGPONC1,chr4_55119042_C_G
1535,chr4,55120835,.,G,T,KDR,Heterozygous,14,KHULTRGPONC1,chr4_55120835_G_T
1536,chr4,55121546,.,T,C,KDR,Heterozygous,6,KHULTRGPONC1,chr4_55121546_T_C


In [45]:
# Group by specified columns and aggregate using a dictionary
agg_dict = {'Sample': lambda x: ','.join(x.unique())}

result_df = df.groupby(['Variant', 'Zygosity']).agg(agg_dict).reset_index()
result_df['Sample_count'] = result_df['Sample'].apply(lambda x: len(x.split(',')))

# Print the result
result_df

Unnamed: 0,Variant,Zygosity,Sample,Sample_count
0,chr4_55076834_T_C,Heterozygous,KHHSPTGPONC33,1
1,chr4_55077777_T_G,Heterozygous,KHHSPTGPONC89,1
2,chr4_55077783_G_A,Heterozygous,KHHSPTGPONC91,1
3,chr4_55077786_T_G,Heterozygous,KHHSPTGPONC89,1
4,chr4_55077790_T_G,Heterozygous,KHHSPTGPONC89,1
...,...,...,...,...
257,chr4_55127121_G_T,Heterozygous,KHAIGHGPONC5,1
258,chr4_55127139_G_A,Heterozygous,"KHAIGHGPONC12,KHAIGHGPONC16,KHAIGHGPONC17,KHAI...",23
259,chr4_55127146_T_A,Heterozygous,KHAIGHGPONC18,1
260,chr4_55127162_CC_GG,Heterozygous,KHHSPTGPONC37,1


In [46]:
result_df.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/KDR_112_samples_Mutect2_outputs/Mutect2_outputs/Grouped_MUTECT2_variant_counts.xlsx', index=False)

In [59]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/Variant_Zygosity_sample_data.xlsx')
df

Unnamed: 0,Concatenate,Zygosity
0,chr4_55079914_A_G_KHAIGHGPONC1,Heterozygous
1,chr4_55081901_G_A_KHAIGHGPONC1,Heterozygous
2,chr4_55081941_A_G_KHAIGHGPONC1,Heterozygous
3,chr4_55089065_T_C_KHAIGHGPONC1,Heterozygous
4,chr4_55101639_A_G_KHAIGHGPONC1,Heterozygous
...,...,...
2946,chr4_55118982_A_G_KHULTRGPONC1,Homozygous
2947,chr4_55119042_C_G_KHULTRGPONC1,Heterozygous
2948,chr4_55120835_G_T_KHULTRGPONC1,Heterozygous
2949,chr4_55121546_T_C_KHULTRGPONC1,Heterozygous


In [60]:
df = df.drop_duplicates(subset=['Concatenate'], keep='first')
df['CHROM'] = df['Concatenate'].str.split('_').str[0]
df['POS'] = df['Concatenate'].str.split('_').str[1]
df['REF'] = df['Concatenate'].str.split('_').str[2]
df['ALT'] = df['Concatenate'].str.split('_').str[3]
df['Sample'] = df['Concatenate'].str.split('_').str[4]
df['Variant'] = df['CHROM'] + "_" + df['POS'] + "_" + df['REF'] + "_" + df['ALT']
df = df[['Variant', 'Zygosity', 'Sample']]
df

Unnamed: 0,Variant,Zygosity,Sample
0,chr4_55079914_A_G,Heterozygous,KHAIGHGPONC1
1,chr4_55081901_G_A,Heterozygous,KHAIGHGPONC1
2,chr4_55081941_A_G,Heterozygous,KHAIGHGPONC1
3,chr4_55089065_T_C,Heterozygous,KHAIGHGPONC1
4,chr4_55101639_A_G,Heterozygous,KHAIGHGPONC1
...,...,...,...
2932,chr4_55118982_A_G,Heterozygous,KHULTRGPONC1
2933,chr4_55119042_C_G,Heterozygous,KHULTRGPONC1
2934,chr4_55120835_G_T,Heterozygous,KHULTRGPONC1
2935,chr4_55121546_T_C,Heterozygous,KHULTRGPONC1


In [61]:
# Group by specified columns and aggregate using a dictionary
agg_dict = {'Sample': lambda x: ','.join(x.unique())}

result_df = df.groupby(['Variant', 'Zygosity']).agg(agg_dict).reset_index()
result_df['Sample_count'] = result_df['Sample'].apply(lambda x: len(x.split(',')))

# Print the result
result_df

Unnamed: 0,Variant,Zygosity,Sample,Sample_count
0,chr4_55076834_T_C,Heterozygous,KHHSPTGPONC33,1
1,chr4_55077777_T_G,Heterozygous,KHHSPTGPONC89,1
2,chr4_55077783_G_A,Heterozygous,KHHSPTGPONC91,1
3,chr4_55077786_T_G,Heterozygous,KHHSPTGPONC89,1
4,chr4_55077790_T_G,Heterozygous,KHHSPTGPONC89,1
...,...,...,...,...
294,chr4_55127139_G_A,Heterozygous,"KHAIGHGPONC12,KHAIGHGPONC16,KHAIGHGPONC17,KHAI...",23
295,chr4_55127146_T_A,Heterozygous,KHAIGHGPONC18,1
296,chr4_55127162_CC_GG,Heterozygous,KHHSPTGPONC37,1
297,chr4_55127210_T_A,Heterozygous,KHHSPTGPONC89,1


In [62]:
result_df.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/Final_counts_updated.xlsx', index=False)

# Concatenate the 1,2,4,5 columns in my excel files.

In [34]:
import os
import pandas as pd

def concatenate_columns(file_path):
    # Read the Excel file into a DataFrame
    df = pd.read_excel(file_path)

    # Concatenate the specified columns with "_"
    concatenated_column = df.iloc[:, [0, 1, 3, 4]].astype(str).apply('_'.join, axis=1)

    # Add the new concatenated column to the DataFrame
    df['Variant'] = concatenated_column

    # Save the updated DataFrame back to the Excel file
    df.to_excel(file_path, index=False)

if __name__ == "__main__":
    # Specify the directory containing your Excel files
    directory_path = r'C:/Users/GenepoweRx_Madhu/Downloads/KDR_MAF/KDR_113_samples_Haplo_outputs/Haplo_outputs/'

    # Loop through each file in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith(".xlsx"):
            file_path = os.path.join(directory_path, filename)
            concatenate_columns(file_path)
print('Data files created with the new variant concatenated column')

ValueError: Cannot set a DataFrame with multiple columns to the single column Variant

In [44]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/ONCO_samples/KHAIGHGP0NC7_MINI.Mutect2_filtered_annotated.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
#vcf['VC'] = vcf['INFO'].str.extract('VC=([^;]+)')
# Filter rows where 'VC' is equal to 'SNV'
#vcf = vcf[vcf['VC'] == 'SNV']
vcf["Gene Name"] = vcf["INFO"].str.extract('GENEINFO=(?P<GENEINFO>.+?);')
vcf['Gene Name'] = vcf['Gene Name'].apply(lambda x: ','.join(set([segment.split(':')[0] for segment in x.split('|')])) if pd.notnull(x) else '')
vcf['Zygosity'] = vcf['SAMPLE'].str.split(':').str[0]
data = {'Gene Name': ['KDR']}
df_gene = pd.DataFrame(data)
vcf['Gene_Match'] = 'No'
# Iterate through each gene in vcf
for genes in vcf['Gene Name']:
    if isinstance(genes, str):  # Check if the gene value is a non-null string
        gene_list = genes.split(',')  # Split the genes by comma to create a list
        match = any(gene in df_gene['Gene Name'].values for gene in gene_list)  # Check if any gene in the list exists in df2
        if match:
            vcf.loc[vcf['Gene Name'] == genes, 'Gene_Match'] = 'Yes'
            
vcf = vcf[vcf['Gene_Match'] == 'Yes']
vcf = vcf[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'Gene Name', 'Zygosity']]
vcf['Zygosity'] = vcf['Zygosity'].replace({'1/1': 'Homozygous', '0/1': 'Heterozygous', '0|1': 'Heterozygous', '1|1': 'Homozygous'})
#vcf.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/ONCO_samples/KHAIGHGP0NC7_MINI.Mutect2_filtered_annotated.xlsx', index=False)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene Name,Zygosity
613196,chr4,55079914,rs4421048,A,G,KDR,Heterozygous
613197,chr4,55081941,rs2412617,A,G,KDR,Heterozygous
613202,chr4,55085619,rs555692700,G,A,KDR,Heterozygous
613203,chr4,55089065,rs1531289,T,C,KDR,Heterozygous
613206,chr4,55097934,rs6838752,T,C,KDR,Heterozygous
613208,chr4,55098398,rs3816584,A,G,KDR,Heterozygous
613209,chr4,55098438,rs2305946,C,T,KDR,Heterozygous
613211,chr4,55099267,rs58415820,G,A,KDR,Heterozygous
613212,chr4,55102839,rs13136007,C,A,KDR,Heterozygous
613214,chr4,55106807,rs1870377,T,A,KDR,Heterozygous


In [45]:
vcf['Zygosity'].value_counts()

Heterozygous    19
Name: Zygosity, dtype: int64