In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import LocalOutlierFactor
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

from scipy.stats import norm
import scipy.stats as st

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="background-color:#DC143C; font-family:'Brush Script MT',cursive;color:white;font-size:200%; text-align:center;border-radius: 50% 20% / 10% 40%">Variation Dataset Affecting Protein Tolerance</h1>

Reference: Thusberg J, Olatubosun A, Vihinen M. Performance of mutation pathogenicity prediction methods on missense variants. Hum Mutat. 2011, 32(4):358-68.   PUBMED  

**<span style="color:#DC143C;">Dataset of neutral single nucleotide polymorphisms</span>**


This is the neutral dataset or non synonymous coding SNV dataset comprising 23,683 human non synonymous coding SNVs with allele frequency >0.01 and chromosome sample count >49 from the dbSNP database build 131. This dataset was filtered for the disease-associated SNVs. The variant position mapping for this dataset was extracted from dbSNP database.

http://structure.bmc.lu.se/VariBench/tolerance_dataset1.php

In [None]:
# Number of rows to load from the CSV; specify 'None' if want to read whole file.
nRowsRead = 1000

# The file is semicolon-separated and is decoded as ISO-8859-2.
df = pd.read_csv(
    '../input/cusersmarildownloadsneutralcsv/neutral.csv',
    sep=';',
    encoding="ISO-8859-2",
    nrows=nRowsRead,
)
df.dataframeName = 'neutral.csv'  # plain attribute used as a label for the frame

(nRow, nCol) = df.shape
print(f'There are {nRow} rows and {nCol} columns')

# Last expression of the cell: show the first rows.
df.head()

# **<span style="color:#DC143C;">VariBench and ClinVar Datasets</span>**

Citation: Alfredo Iacoangeli, Ahmad Al Khleifat, William Sproviero, Aleksey Shatunov, Ashley R. Jones, Sarah Opie-Martin, Ersilia Naselli, Simon D. Topp, Isabella Fogh, Angela Hodges, Richard J. Dobson, Stephen J. Newhouse & Ammar Al-Chalabi (2019) ALSgeneScanner: a pipeline for the analysis and interpretation of DNA sequencing data of ALS patients, Amyotrophic Lateral Sclerosis and Frontotemporal Degeneration, 20:3-4, 207-215, DOI: 10.1080/21678421.2018.1562553


"To assess their variant prioritization approach, we used a set of non-synonymous variants from the VariBench dataset for which the effect is known and all ALS-associated non-synonymous variants stored in ClinVar (71 benign and 121 pathogenic)."

"The VariBench variants are not ALS genes specifically, but because they are all annotated depending on whether or not they are deleterious, the general principles of the method could be tested. The dataset includes VariBench protein tolerance dataset (http://structure.bmc.lu.se/VariBench/tolerance_dataset1.php) comprising 23,683 human non-synonymous coding neutral SNPs and 19,335 pathogenic missense mutations."

The CSV file loaded above is the corresponding VariBench "Variation Affecting Protein Tolerance" neutral dataset, i.e. the non-synonymous coding SNV dataset.

"None of the tools used in their pathogenicity score were trained on the VariBench dataset. However, it is possible that some VariBench variants were present in the training datasets. In order to minimize the overlap between training and evaluation sets, the authors derived a subset of variants (VariBenchFiltered) from the VariBench dataset by filtering out its overlap with HumVar, the CADD training dataset and ExoVar, which are commonly used to train the tools. The resulting dataset comprising 5051 pathogenic and 14,077 neutral variants, was balanced by randomly subsampling 5051 neutral variants."

https://www.tandfonline.com/action/showCitFormats?doi=10.1080%2F21678421.2018.1562553

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Continuous (numeric position) columns that the plotting cells iterate over.
cont_FEATURES = [
    'ContigPositionStart_0_based',
    'ContigPositionEnd_0_based',
    'mRNA_start_position_0_based',
    'mRNA_end_position_0_based',
    'ReadingFrame_base_position_in_codon',
    'AminoAcidPosition_0_based',
]

# Categorical columns that the histogram cell iterates over.
cat_FEATURES = [
    'Contig_Acc_version',
    'GenomeBuild',
    'mRNA_acc_version',
]

# Only a subset of the categorical and continuous features is listed, to plot fewer charts.
# Besides, it was not known whether the code would run without errors on the other columns.

# **<span style="color:#DC143C;">(Some of the) Outliers</span>**

In [None]:
#Code by Tom C  https://www.kaggle.com/tjcdev/tps-outliers-hidden-features-baseline/notebook

def plot_outliers(df, feature, threshold=5):
    """Flag z-score outliers of one numeric feature and draw them as a jittered scatter.

    Parameters
    ----------
    df : 1-D numeric data
        Values of a single feature (the notebook passes a pandas Series).
    feature : str
        Used as the plot title.
    threshold : float, default 5
        A point is rejected when its absolute z-score is >= this value.

    Returns
    -------
    Boolean mask with the same shape as ``df``: True for points that are kept
    ("Good"), False for rejected points ("Bad").

    Notes
    -----
    * A constant feature has a standard deviation of 0; dividing by it would
      turn every z-score into NaN and mark every point as an outlier. In that
      case nothing is rejected.
    * Missing values have no z-score and are therefore not counted as outliers.
    """
    mean, std = np.mean(df), np.std(df)
    deviation = np.abs(df - mean)
    # Only normalise when the spread is strictly positive; with std == 0 every
    # deviation is exactly 0, so using it directly keeps all points.
    z_score = deviation / std if std > 0 else deviation
    # NaN compares False with >=, so missing values stay in the "good" set.
    good = ~(z_score >= threshold)

    print(f"Rejection {(~good).sum()} points")

    # The y coordinate is random jitter, used only to spread the points vertically.
    visual_scatter = np.random.normal(size=df.size)
    plt.scatter(df[good], visual_scatter[good], s=2, label="Good", color="#4CAF50")
    plt.scatter(df[~good], visual_scatter[~good], s=8, label="Bad", color="#F44336")
    plt.legend(loc='upper right')
    plt.title(feature)
    plt.show()

    return good

def plot_lof_outliers(df, feature, n_neighbors=20, contamination=0.001):
    """Flag outliers of one feature with Local Outlier Factor and plot them.

    Parameters
    ----------
    df : 2-D array
        Feature values as a single column (the notebook passes an
        ``(n_samples, 1)`` array with missing values already removed).
    feature : str
        Used as the plot title.
    n_neighbors : int, default 20
        Forwarded to ``LocalOutlierFactor``.
    contamination : float, default 0.001
        Forwarded to ``LocalOutlierFactor``; this is the knob that controls
        how many points are labelled as outliers.

    Returns
    -------
    Boolean array, True for inliers ("Good") and False for outliers ("Bad").
    """
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination, p=1)
    # fit_predict returns labels only (1 = inlier, -1 = outlier), so there is no
    # score threshold to tune here; adjust `contamination` instead.
    good = lof.fit_predict(df) == 1
    print(f"Rejection {(~good).sum()} points")

    # The y coordinate is random jitter, used only to spread the points vertically.
    visual_scatter = np.random.normal(size=df.size)
    plt.scatter(df[good], visual_scatter[good], s=2, label="Good", color="#4CAF50")
    plt.scatter(df[~good], visual_scatter[~good], s=8, label="Bad", color="#F44336")
    plt.legend(loc='upper right')
    plt.title(feature)
    plt.show()

    return good

 **<span style="color:#DC143C;">Feature Outliers</span>**

In [None]:
#Code by Tom C  https://www.kaggle.com/tjcdev/tps-outliers-hidden-features-baseline/notebook

# Z-score outlier plot for every continuous feature; the name is echoed before each plot.
for column in cont_FEATURES:
    print(column)
    column_values = df[column]
    plot_outliers(column_values, column)

In [None]:
#Code by Tom C  https://www.kaggle.com/tjcdev/tps-outliers-hidden-features-baseline/notebook

for column in cont_FEATURES:
    # Keep only the rows where this feature is present.
    present = df.loc[~df[column].isna(), column]
    # Reshape the 1-D values into one row per sample before handing them to LOF.
    as_column = present.values.reshape(present.shape[0], -1)
    plot_lof_outliers(as_column, column)

In [None]:
#Code by Tom C  https://www.kaggle.com/tjcdev/tps-outliers-hidden-features-baseline/notebook

# One violin per continuous feature.
# No `x` grouping is passed: the previous x='ContigPositionStart_0_based' is itself a
# continuous column, so it split the data into groups of roughly one value each and the
# violins degenerated into thin lines instead of showing the distribution.
for feature in cont_FEATURES:
    sns.violinplot(y=feature, data=df, inner='quartile')
    plt.title(feature)
    plt.show()

In [None]:
#Code by Tom C  https://www.kaggle.com/tjcdev/tps-outliers-hidden-features-baseline/notebook

# Histogram of the raw values of each categorical feature, one figure per feature.
for column in cat_FEATURES:
    raw_values = df[column].values
    sns.histplot(raw_values)
    plt.show()

Messy charts.

In [None]:
#Code by Tom C  https://www.kaggle.com/tjcdev/tps-outliers-hidden-features-baseline/notebook

def plot_cdf(df, feature):
    """Plot the empirical CDF of one numeric feature with normal-spaced percentile markers.

    Parameters
    ----------
    df : 1-D numeric data
        Values of a single feature (the notebook passes a pandas Series).
    feature : str
        Used as the plot title.

    Notes
    -----
    Missing values are dropped first. Otherwise ``np.percentile`` returns NaN
    for every marker and the ECDF is padded with NaN entries. If nothing is
    left after dropping them, a message is printed and no figure is drawn.
    """
    values = np.asarray(df, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        print(f"{feature}: no non-missing values to plot")
        return

    # Percentile levels (0-100) spaced like the normal CDF between -4 and +4 sigma;
    # the last argument of linspace is the number of percentile markers.
    ps = 100 * st.norm.cdf(np.linspace(-4, 4, 10))
    x_p = np.percentile(values, ps)

    xs = np.sort(values)
    ys = np.linspace(0, 1, len(values))

    plt.plot(xs, ys * 100, label="ECDF")
    plt.plot(x_p, ps, label="Percentiles", marker=".", ms=10)
    plt.legend()
    plt.ylabel("Percentile")
    plt.title(feature)
    plt.show()

# Draw the ECDF / percentile plot for every continuous feature.
for column in cont_FEATURES:
    column_values = df[column]
    plot_cdf(column_values, column)

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
##Code by Tom C  https://www.kaggle.com/tjcdev/tps-outliers-hidden-features-baseline/notebook
# Scatter-plot matrix of the columns of df, drawn by pandas on a 10x10 inch figure.
# Note: this cell is sometimes commented out because it takes a few minutes to run
# and doesn't show any useful information.

pd.plotting.scatter_matrix(frame=df, figsize=(10, 10));

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
# Correlation is only defined for numeric data. Recent pandas versions raise in corr()
# when the frame still holds non-numeric columns (such as the categorical ones listed in
# cat_FEATURES), so restrict the frame to numeric/bool columns explicitly.
numeric_columns = (
    df.drop(columns=['AminoAcidPosition_0_based'])
    .select_dtypes(include=['number', 'bool'])
)
sns.heatmap(numeric_columns.corr(), annot=True, cmap='viridis', fmt='0.2f', ax=ax);

In [None]:
#Code by Puru Behl https://www.kaggle.com/accountstatus/mt-cars-data-analysis

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is the
# replacement (this notebook already uses sns.histplot elsewhere).
amino_acid_position = df['AminoAcidPosition_0_based']
sns.histplot(amino_acid_position, kde=True, stat='density')
# Series.mean() skips missing values, so the marker line is still drawn if some are NaN.
plt.axvline(amino_acid_position.mean(), color='red', linestyle='dashed', linewidth=1)
plt.title('AminoAcid Position 0 based Distribution');

In [None]:
#Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display,HTML
# Colours, Google font names and font sizes available to dhtml().
c1 = '#eb3434'
c2 = '#eb3446'
f1 = 'Akronim'
f2 = 'Smokum'
fs1 = 30
fs2 = 15


def dhtml(string, fontcolor=c1, font=f1, fontsize=fs1):
    """Display ``string`` as an <h1> heading styled with a Google web font.

    Parameters
    ----------
    string : text placed inside the <h1> element (inserted with ``%s``).
    fontcolor : CSS colour of the heading.
    font : Google Fonts family name; imported with the ``3d-float`` effect.
    fontsize : font size in pixels.
    """
    # <style> element that imports the font from Google Fonts.
    stylesheet = (
        "<style>\n"
        "    @import 'https://fonts.googleapis.com/css?family="
        + font
        + "&effect=3d-float';</style>\n"
    )
    # Opening <h1> tag carrying the inline style.
    heading_open = (
        "    <h1 class='font-effect-3d-float' style='font-family:"
        + font
        + "; color:"
        + fontcolor
        + "; font-size:"
        + str(fontsize)
        + "px;'>"
    )
    heading_body = "%s</h1>" % string
    display(HTML(stylesheet + heading_open + heading_body))
    
    
# Closing credit, rendered with the default colour, font and size of dhtml().
dhtml(string='Thank you Tom C @tjcdev  for all the script')