# Feature selection based on genotype quality

In [1]:
import numpy as np
import pandas as pd
from typing import List, Tuple, Union, Callable

In [2]:
def dataset_reader(path: str, cls_ids: List):
    """Read a VCF-style CSV and split it into genotype codes and quality values.

    Every sample cell is expected to hold colon-separated fields: the text
    before the first ':' is treated as the genotype call and the second field
    as its quality value.

    Args:
        path: Location of the CSV file (anything ``pd.read_csv`` accepts).
        cls_ids: Three replacement values, applied in order to the genotype
            strings '0/0', '1/1' and './.'.

    Returns:
        A ``(genotypes, qualities)`` tuple of DataFrames sharing the same rows
        and columns. Both start with '#CHROM', 'StartPos' and 'EndPos';
        ``genotypes`` holds the recoded calls and ``qualities`` the per-sample
        quality values as ``int64`` (a '.' quality becomes 0).
    """
    df = pd.read_csv(path)
    df.drop(['ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'FORMAT'], axis=1, inplace=True)

    # Keep only the first ';'-separated INFO entry and strip its 'END=' tag.
    df['INFO'] = df['INFO'].str.split(';').str[0]
    df['INFO'] = df['INFO'].str.replace('END=', '', regex=False)

    # Rename by exact column name so that sample columns which merely contain
    # 'POS' or 'INFO' as a substring are left alone.
    df = df.rename(columns={'POS': 'StartPos', 'INFO': 'EndPos'})

    # Columns whose name contains a dash are discarded.
    dashed_cols = [name for name in df.columns if '-' in name]
    df = df.drop(columns=dashed_cols)

    # Everything after the three leading position columns is a sample column.
    # Column-wise Series.map is used instead of the deprecated applymap.
    sample_fields = df.iloc[:, 3:]
    qualities = sample_fields.apply(
        lambda col: col.map(lambda cell: cell.split(':')[1])
    )
    # A '.' quality is counted as 0; replacing with the string '0' keeps the
    # column purely textual until the integer cast.
    qualities = qualities.replace('.', '0').astype(np.int64)
    qualities = pd.concat(
        [df.loc[:, ['#CHROM', 'StartPos', 'EndPos']], qualities],
        axis=1,
        sort=False,
    )

    # Strip everything from the first ':' onwards, then recode the calls.
    df = df.replace(r':.*$', '', regex=True)
    df = df.replace(['0/0', '1/1', './.'], cls_ids)
    return df, qualities

In [3]:
def feature_selection(genotypes, qualities, threshold=100):
    """Remove, in place, the genotype rows whose mean quality is too low.

    Args:
        genotypes: Frame to filter; it is modified in place.
        qualities: Frame whose columns from the fourth onward hold the
            per-sample quality values, row-aligned with ``genotypes``.
        threshold: Rows whose mean quality is strictly below this value are
            dropped.

    Returns:
        The same ``genotypes`` object, after the low-quality rows are removed.
    """
    row_means = qualities.iloc[:, 3:].mean(axis=1)
    below_threshold = row_means < threshold
    rejected = genotypes.loc[below_threshold, :].index
    genotypes.drop(index=rejected, inplace=True)
    return genotypes

In [4]:
def dataset_concatenator(df1, df2, df3, fill_value=0):
    """Stack three genotype tables and pivot them to one row per sample.

    The frames are concatenated row-wise, ordered by '#CHROM' and then
    'StartPos', stripped of the three position columns, and transposed.

    Args:
        df1, df2, df3: Frames that each contain '#CHROM', 'StartPos' and
            'EndPos' plus any number of further columns.
        fill_value: Value written into cells that are missing after the
            concatenation (for example a column present in only some of the
            frames). Defaults to 0, the previously hard-coded value.
            NOTE(review): 0 is also passed as a genotype class id by callers
            in this file, so the default cannot be told apart from that class
            afterwards; confirm this is intended.

    Returns:
        A DataFrame whose first column, 'index', holds the former column
        names. The remaining columns are labelled with the row labels of the
        input frames in sorted order. Those labels are not reset, so they may
        repeat when the inputs share row labels.
    """
    stacked = pd.concat([df1, df2, df3], axis=0, sort=False)
    stacked = stacked.sort_values(['#CHROM', 'StartPos'], ascending=[True, True])
    features = stacked.drop(columns=['#CHROM', 'StartPos', 'EndPos'])
    features = features.fillna(fill_value)
    return features.T.reset_index(drop=False)

# Reading Data

In [5]:
# Load the three variant tables. Each one gets its own codes for the
# '0/0' and '1/1' calls, while './.' is always coded as -1.
inv_df, inv_qual = dataset_reader(
    '/birl2/users/shs772/g2p/Shivani/Inversions.csv',
    [0, 1, -1],
)
dup_df, dup_qual = dataset_reader(
    '/birl2/users/shs772/g2p/Shivani/Duplications.csv',
    [2, 3, -1],
)
del_df, del_qual = dataset_reader(
    '/birl2/users/shs772/g2p/Shivani/Deletions.csv',
    [4, 5, -1],
)
# Label table, later joined to the genotypes on its 'ID' column.
original_labels = pd.read_csv(
    '/birl2/users/shs772/g2p/Shivani/Arabi_Pheno/FT10_arabi.csv'
)

# Feature Selection

In [6]:
# Apply the same quality threshold (80) to each variant table in turn.
inv_df, dup_df, del_df = (
    feature_selection(frame, quality, threshold=80)
    for frame, quality in (
        (inv_df, inv_qual),
        (dup_df, dup_qual),
        (del_df, del_qual),
    )
)

# Data Concatenation

In [7]:
# Combine the three filtered tables into a single frame.
genotypes = dataset_concatenator(df1=inv_df, df2=dup_df, df3=del_df)

In [8]:
# The column produced by reset_index is named 'index'; call it 'ID' instead.
genotypes = genotypes.rename({'index': 'ID'}, axis='columns')

In [9]:
# Store the IDs as 64-bit integers.
genotypes = genotypes.astype({'ID': np.int64})

In [10]:
# Preview the first five rows of the combined table.
genotypes.head(5)

Unnamed: 0,ID,0,1,2,3,8,7,14,24,28,...,25058,25060,124889,124890,124891,124896,124898,124899,124900,124902
0,1002,1.0,1.0,1.0,2.0,4.0,2.0,4.0,4.0,4.0,...,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
1,1006,1.0,1.0,1.0,2.0,4.0,2.0,4.0,4.0,4.0,...,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
2,1061,0.0,0.0,0.0,2.0,4.0,2.0,4.0,4.0,4.0,...,2.0,2.0,4.0,4.0,4.0,4.0,-1.0,-1.0,4.0,-1.0
3,1062,0.0,0.0,0.0,2.0,4.0,2.0,4.0,4.0,4.0,...,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
4,1063,0.0,0.0,0.0,2.0,4.0,2.0,4.0,4.0,4.0,...,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0


In [11]:
# Write the combined table to disk without the row index.
genotypes.to_csv(path_or_buf='SVs_good_quality.csv', index=False)

In [12]:
# Read back the saved genotype table and load the label table.
SV_GQ = pd.read_csv(
    'SVs_good_quality.csv'
)
original_labels = pd.read_csv(
    '/birl2/users/shs772/g2p/Shivani/Arabi_Pheno/FT10_arabi.csv'
)

In [13]:
# Give both 'ID' columns the same 64-bit integer dtype before they are joined.
SV_GQ = SV_GQ.astype({'ID': np.int64})
original_labels = original_labels.astype({'ID': np.int64})

In [14]:
# Join the genotype rows to their labels on the shared 'ID' column.
data = pd.merge(SV_GQ, original_labels, on='ID')

In [15]:
# The join key is no longer needed once the tables are merged.
data.drop(columns='ID', inplace=True)

In [16]:
# Display the merged table (the 'ID' column has already been dropped).
data

Unnamed: 0,0,1,2,3,8,7,14,24,28,29,...,25060,124889,124890,124891,124896,124898,124899,124900,124902,Label
0,1.0,1.0,1.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,...,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,102.750000
1,0.0,0.0,0.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,...,2.0,4.0,4.0,4.0,4.0,-1.0,-1.0,-1.0,-1.0,128.000000
2,0.0,0.0,0.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,...,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,70.000000
3,1.0,1.0,1.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,...,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,98.750000
4,1.0,1.0,1.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,...,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,92.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909,0.0,0.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,97.750000
910,0.0,0.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,64.250000
911,0.0,0.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,61.333333
912,0.0,0.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,70.250000


In [17]:
# Replace the 'Label' column name with the integer 10000000, then display
# the table.
data.rename({'Label': 10000000}, axis='columns', inplace=True)
data

Unnamed: 0,0,1,2,3,8,7,14,24,28,29,...,25060,124889,124890,124891,124896,124898,124899,124900,124902,10000000
0,1.0,1.0,1.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,...,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,102.750000
1,0.0,0.0,0.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,...,2.0,4.0,4.0,4.0,4.0,-1.0,-1.0,-1.0,-1.0,128.000000
2,0.0,0.0,0.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,...,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,70.000000
3,1.0,1.0,1.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,...,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,98.750000
4,1.0,1.0,1.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,...,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,92.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909,0.0,0.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,97.750000
910,0.0,0.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,64.250000
911,0.0,0.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,61.333333
912,0.0,0.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,70.250000


In [18]:
# Write the final table to disk without the row index.
data.to_csv(path_or_buf='SV_GQ_80.csv', index=False)