In [None]:
import pandas as pd
import re

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Configure style and font scale in ONE call. A bare sns.set(font_scale=...)
# resets every theme parameter it is not given to its default (style='darkgrid'),
# so the previous separate set_style('whitegrid') call was silently overridden.
sns.set(style='whitegrid', font_scale=1.5);
import missingno as msno

In [None]:
# Load the ClinVar table. CHROM is forced to text so it is not parsed as a
# number; the integer keys 38 and 40 presumably select columns by position
# to pin their dtypes as well -- TODO confirm which columns these are.
df = pd.read_csv(
    '../input/clinvar-conflicting/clinvar_conflicting.csv',
    dtype={'CHROM': str, 38: str, 40: object},
)

In [None]:
# Summary statistics for the frame's columns (pandas default column selection).
df.describe()

In [None]:
# Display the CLASS column (compared against 0 and 1 further down).
df.CLASS

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Preview the last rows of the loaded table.
df.tail()

In [None]:
# Check the dtype CHROM ended up with (it was requested as str at load time).
df.CHROM.dtype

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# Number of rows per CHROM value, most frequent first.
df.CHROM.value_counts()

In [None]:
from plotnine import *

# Only the first 1000 rows are plotted.
GD = df.head(1000)

# Scatter of AF_EXAC (x) against AF_TGP (y) with a smoothed trend on top.
# The parenthesised expression is the cell's last value, so it is rendered.
(
    ggplot(GD)
    + aes(x='AF_EXAC', y='AF_TGP')
    + geom_point()
    + stat_smooth()
)

In [None]:
# Bar chart of how many rows fall in each CLASS value.
ax = sns.countplot(data=df, x="CLASS", palette="Set1")
ax.set_xlabel('CLASS')
ax.set_ylabel('Number of Variants');

In [None]:
# missingno dendrogram of the frame's missing-value patterns.
msno.dendrogram(df);

In [None]:
# missingno bar chart for every column, sorted in descending order.
msno.bar(df, sort='descending', color='#79ccb3')
plt.show()

In [None]:
# missingno heatmap of the frame's missing values across columns.
msno.heatmap(df)
plt.show()

In [None]:
# SYMBOL x CLASS contingency table with 'All' margin totals.
# NOTE(review): the following cell recomputes this identical table on its
# first line, so this cell appears to be redundant.
gene_ct = pd.crosstab(df.SYMBOL, df.CLASS, margins=True)

In [None]:
# SYMBOL x CLASS counts. The 'All' margin column is used only to rank the
# genes; both the margin row and the margin column are removed before plotting.
gene_ct = (
    pd.crosstab(df['SYMBOL'], df['CLASS'], margins=True)
    .drop(index='All')
    # keep the 50 genes with the most rows so the chart stays readable
    .sort_values(by='All', ascending=False)
    .head(50)
    .drop(columns='All')
)

gene_ct.plot.bar(stacked=True, figsize=(12, 4));

In [None]:
# CLNVC x CLASS counts. The 'All' margin column is used only to order the
# rows; both margins are removed before plotting.
vt_ct = (
    pd.crosstab(df['CLNVC'], df['CLASS'], margins=True)
    .drop(index='All')
    # order CLNVC values by total count (no truncation is applied here)
    .sort_values(by='All', ascending=False)
    .drop(columns='All')
)

vt_ct.plot.bar(stacked=True, figsize=(12, 4));

In [None]:
from plotnine import *

# 2-D binned counts of POS (x) against Amino_acids (y).
# Only the first 50 rows of the table are used.
(
    ggplot(df.head(50))
    + aes(x='POS', y='Amino_acids')
    + geom_bin2d(bins=20)
    + ggtitle("Most Common amino acids")
)

In [None]:
# Replace missing EXON values with the string '0'. The result is assigned back
# to the column rather than using `df.EXON.fillna(..., inplace=True)`: that
# chained in-place form acts on an intermediate Series, emits a FutureWarning
# on pandas 2.x and does not modify `df` under Copy-on-Write, which would
# leave NaNs behind and make the regex below fail on a float.
df['EXON'] = df['EXON'].fillna('0')

# variant_exon = the first standalone integer found in the EXON string.
df['variant_exon'] = df['EXON'].apply(lambda x: int(re.findall(r'\b\d+\b', x)[0]))

In [None]:
# Count of rows for every (variant_exon, CLASS) pair, drawn as stacked bars.
exondf = pd.crosstab(index=df['variant_exon'], columns=df['CLASS'])
exondf.plot.bar(stacked=True, figsize=(20, 5));
# Restrict the visible x-range to bar positions 0 through 20.
plt.xlim(left=-0.5, right=20.5);

In [None]:
# Each non-null MC value is a comma-separated list of 'a|b' entries; keep the
# second '|'-delimited field of every entry, giving one list per row.
MC_list = df.MC.dropna().str.split(',').apply(lambda row: [c.split('|')[1] for c in row])

# One indicator column per distinct value, summed back to one row per original
# row. `groupby(level=0).sum()` replaces `.sum(level=0)`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
MC_encoded = pd.get_dummies(MC_list.apply(pd.Series).stack()).groupby(level=0).sum()
MC_encoded = MC_encoded.reindex(index=MC_list.index)

# Incorporate the transformed MC feature into the existing DataFrame.
# NOTE: this drops the MC column, so the cell cannot be re-run without
# reloading df first.
df = df.join(MC_encoded).drop(columns=['MC'])

# Transformed MC feature
MC_encoded.head()

In [None]:
# For every MC indicator column, total it within CLASS 0, within CLASS 1 and
# over all rows. Outer keys become the columns of mc_ct, inner keys its index.
mccounts = {0: {},
            1: {},
            'All': {}
           }

for col in MC_encoded.columns:
    for class_ in [0, 1]:
        mccounts[class_][col] = df.loc[df['CLASS'] == class_, col].sum()

    mccounts['All'][col] = df[col].sum()

mc_ct = pd.DataFrame.from_dict(mccounts)

# Add an 'All' row holding the column totals. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
mc_ct_all = mc_ct.sum(axis=0)
mc_ct_all.name = 'All'
mc_ct = pd.concat([mc_ct, mc_ct_all.to_frame().T])

In [None]:
# Remove the 'All' totals row, order the remaining rows by their overall
# count, then remove the 'All' column so only the per-class counts are stacked.
mc_ct = (
    mc_ct
    .drop(index='All')
    .sort_values(by='All', ascending=False)
    .drop(columns='All')
)

mc_ct.plot.bar(stacked=True, figsize=(12, 4));

In [None]:
# Histogram of the CLASS column's values.
df['CLASS'].plot.hist()

In [None]:
# SIFT x CLASS counts. The 'All' margin column is used only to order the
# rows; both margins are removed before plotting.
sift_ct = (
    pd.crosstab(df['SIFT'], df['CLASS'], margins=True)
    .drop(index='All')
    # order SIFT values by total count (no truncation is applied here)
    .sort_values(by='All', ascending=False)
    .drop(columns='All')
)

sift_ct.plot.bar(stacked=True, figsize=(12, 4));

In [None]:
# Replace the SIFT and PolyPhen columns with one indicator column per value.
# After this cell the original SIFT / PolyPhen columns no longer exist in df.
df = pd.get_dummies(df, columns=['SIFT', 'PolyPhen'])

In [None]:
from itertools import combinations
from scipy.stats import chi2_contingency

In [None]:
# Every unordered pair of the listed categorical columns, as a two-level index.
categoricals_index = pd.MultiIndex.from_tuples(
    combinations(
        ['CHROM', 'REF', 'ALT', 'IMPACT', 'Consequence', 'SYMBOL', 'CLASS'],
        2,
    )
)
# One row per pair; the single 'cols' column is built from that index.
categoricals_corr = pd.DataFrame(data=categoricals_index, columns=['cols'])

In [None]:
def chisq_of_df_cols(row):
    """Return the chi-square test p-value for two columns of the global ``df``.

    Parameters
    ----------
    row : indexable
        ``row[0]`` and ``row[1]`` are the names of the two columns to compare.

    Returns
    -------
    The second element of ``scipy.stats.chi2_contingency``'s result (the
    p-value) for the contingency table of the two columns.
    """
    first_col, second_col = row[0], row[1]
    # Count rows for every observed (first_col, second_col) combination.
    pair_counts = df.groupby([first_col, second_col]).size()
    # Pivot first_col into the columns; combinations never observed become 0.
    contingency = pair_counts.unstack(first_col).fillna(0)
    result = chi2_contingency(contingency)
    return result[1]

In [None]:
# Chi-square p-value for every column pair stored in 'cols'.
categoricals_corr['chi2_p'] = categoricals_corr['cols'].apply(chisq_of_df_cols)

In [None]:
# Show the per-pair p-values (one row per column pair).
categoricals_corr

In [None]:
# Label each row with its column pair, then pivot the second level of that
# index into columns so the p-values form a pair-by-pair table.
categoricals_corr.index = categoricals_index
categoricals_corr = categoricals_corr['chi2_p'].unstack()

In [None]:
# Show the p-values in their pivoted (pair-by-pair) layout.
categoricals_corr

In [None]:
# Pairwise correlations between all non-object columns of df.
corr = df.select_dtypes(exclude='object').corr()

import numpy as np
# Generate a mask for the upper triangle (diagonal included) so each pair is
# drawn once. The builtin `bool` is used because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 12));

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True);

# Draw the heatmap with the mask and correct aspect ratio
g = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5});


from matplotlib.patches import Rectangle

# Outline a 3x1 run of cells starting at heatmap coordinates (1, 6) in red.
# NOTE(review): the position is hard-coded, so the cells it highlights depend
# on the current column order of `corr` -- confirm after any schema change.
g.add_patch(Rectangle((1, 6), 3, 1, fill=False, edgecolor='red', lw=4));

In [None]:
# Rows where REF and ALT are both exactly one character long.
snvs = df.loc[df['REF'].str.len().eq(1) & df['ALT'].str.len().eq(1)]
# Rows where REF or ALT is longer than one character.
indels = df.loc[df['REF'].str.len().gt(1) | df['ALT'].str.len().gt(1)]

In [None]:
# Sanity check: True when the two subsets together account for every row of df.
len(df) == (len(snvs) + len(indels))

In [None]:
# CLASS proportions within each subset, side by side, then transposed so each
# subset becomes one row (one stacked bar when plotted).
snp_indel = pd.concat(
    [
        snvs['CLASS'].value_counts(normalize=True).rename('snv_class'),
        indels['CLASS'].value_counts(normalize=True).rename('indel_class'),
    ],
    axis=1,
).transpose()

In [None]:
# Stacked bars: one bar per subset, segments are the CLASS proportions.
snp_indel.plot.bar(stacked=True, figsize=(12, 4));

In [None]:
# Rows whose CLNDN is one of the three "no disease given" placeholders.
disease_unspecified = (
    (df.CLNDN == 'not_specified')
    | (df.CLNDN == 'not_provided')
    | (df.CLNDN == 'not_specified|not_provided')
)

# CLASS proportions for the placeholder rows versus all remaining rows.
# The complement is taken with `~`: the previous mask OR-ed three `!=` tests,
# which is True for every row (no value equals all three strings at once), so
# 'some_disease_specified' was computed over the entire table.
clndn = pd.concat([df.CLASS.loc[disease_unspecified].value_counts(normalize=True).rename('disease_not_specified'),
                   df.CLASS.loc[~disease_unspecified].value_counts(normalize=True).rename('some_disease_specified')],
                  axis=1).T

In [None]:
# Stacked bars: one bar per CLNDN group, segments are the CLASS proportions.
clndn.plot.bar(stacked=True, figsize=(12, 4));

In [None]:
# Overlay the distributions of the three AF_* columns on one set of axes,
# each labelled with its column name.
for af_column in ("AF_ESP", "AF_EXAC", "AF_TGP"):
    sns.distplot(df[af_column], label=af_column)
plt.legend();