In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the PheGenI table from the CSV file under ../input.
df = pd.read_csv('../input/phenotype-genotype-integrator/PheGenI.csv')

In [None]:
# Preview five randomly sampled rows.
df.sample(5)

In [None]:
# Print the column dtypes and non-null counts.
df.info()

In [None]:
# Look at the raw P-Value column before any conversion.
df['P-Value']

# Converting P-Value column

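The P-Value column mixes numeric (float) values with values stored as strings (object dtype), so the string entries are converted to floats below.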

In [None]:
# Keep only the P-Value entries that are Python strings (index labels are preserved).
pv = df['P-Value'][df['P-Value'].apply(lambda x: isinstance(x, str))]

In [None]:
# Split each string on '-' into a list of pieces; the following cell reads piece 0 and piece 1.
pv = pv.str.split('-')

In [None]:
# Rebuild each value as mantissa * 10 ** -exponent.
# Piece 0 is the mantissa followed by one trailing character, which is dropped;
# piece 1 is the magnitude of the negative exponent.
mantissa = pd.to_numeric(pv.apply(lambda x: x[0][:-1]))
# Keep the exponent in float64. Down-casting it to float32 makes the power run
# in single precision, which loses digits and underflows to 0 for very small
# P-values (exponents beyond roughly 45).
exponent = pd.to_numeric(pv.apply(lambda x: x[1])).astype('float64')
pv = mantissa * 10.0 ** (-exponent)

In [None]:
# Display the converted values.
pv

In [None]:
# Write the converted floats back at their original row labels.
# A single .loc call is used because chained indexing (df[col][rows] = ...) may
# assign into a temporary copy and leaves df untouched under pandas Copy-on-Write.
df.loc[pv.index, 'P-Value'] = pv.values

In [None]:
# Show the column after the converted values have been written back.
df['P-Value']

In [None]:
# Store the numeric conversion on the frame. Calling pd.to_numeric without
# assigning the result leaves the column with object dtype.
df['P-Value'] = pd.to_numeric(df['P-Value'])
df['P-Value']

In [None]:
# List any entries that are still strings; an empty result means none remain.
df['P-Value'][df['P-Value'].apply(lambda x: isinstance(x, str))]

In [None]:
# Show the P-values in ascending order.
df['P-Value'].sort_values()

In [None]:
# Line plot of every P-value in ascending order, with the rank on the x-axis.
(
    df['P-Value']
    .sort_values()
    .reset_index(drop=True)
    .plot(figsize=(15, 7))
)

In [None]:
# Same ascending plot, limited to the first 134000 sorted values.
(
    df['P-Value']
    .sort_values()
    .reset_index(drop=True)
    .iloc[:134000]
    .plot(figsize=(15, 7))
)

# Analysis

In [None]:
# Number of distinct values in the Trait column.
df['Trait'].unique().size

In [None]:
# Per-Context non-null counts of every column, ordered by the Gene count (largest first).
df.groupby('Context').count().sort_values('Gene', ascending=False)

Below is the diagram describing various SNP types.

<img src="SNP types.jpg"/>

1. The “Near Gene” region includes the mRNA region of the gene as well as arbitrary regions of 2K nucleotides upstream and 0.5K nucleotides downstream, to allow for potential regulatory regions. (https://www.ncbi.nlm.nih.gov/books/NBK44455/)
2. UTR-5 is the region directly upstream of the initiation codon; it is transcribed to mRNA but not translated to protein. UTR-3 is the analogous untranslated region on the downstream side.
3. A SNP is classified as “splice-site” if its position is one or two bases before the start of an exon, or one or two bases after the end of an exon.


In [None]:
# Bar chart of the number of rows per Context, most frequent category first.
plt.figure(figsize=(15, 5))
sns.countplot(
    data=df,
    x='Context',
    order=df['Context'].value_counts().index,
)

In [None]:
# Per-Context counts restricted to rows whose Gene and Gene 2 values are equal.
df[(df['Gene'] == df['Gene 2'])].groupby('Context').count().sort_values('Gene', ascending=False)

If the Gene and Gene 2 names are the same, the SNP is not located between two genes.

In [None]:
# Number of rows where Gene and Gene 2 are equal.
(df['Gene'] == df['Gene 2']).sum()

In [None]:
# Count plot of Context for the rows where Gene equals Gene 2; the bar order is
# taken from the Context frequencies of the full table.
plt.figure(figsize=(15,5))
sns.countplot(x='Context', data=df[(df['Gene'] == df['Gene 2'])], order=df['Context'].value_counts().index)

In [None]:
# Per-Context counts for the rows where Gene and Gene 2 differ.
df[(df['Gene'] != df['Gene 2'])].groupby('Context').count().sort_values('Gene', ascending=False)

In [None]:
# Rows with a strictly positive P-value below 1e-300, smallest first.
df[(df['P-Value']>0) & (df['P-Value']<10**-300)].sort_values(by='P-Value')

A P-value below 5×10^-8 is the commonly accepted threshold for genome-wide significance. (https://en.wikipedia.org/wiki/Genome-wide_significance)

In [None]:
# Keep only the rows whose P-value is below the 5e-8 threshold.
df_p = df[df['P-Value'] < 5 * 10 ** -8]

In [None]:
# Ten traits with the highest non-null P-Value count in df_p.
df_p.groupby('Trait').count().sort_values('P-Value', ascending=False).head(10)

In [None]:
# Plot the per-trait Gene counts, traits ordered by descending P-Value count.
df_p.groupby('Trait').count().sort_values('P-Value', ascending=False)['Gene'].plot(figsize=(18,7))

In [None]:
# Number of distinct traits remaining after the threshold filter.
df_p['Trait'].unique().size

In [None]:
# Trait names ordered by descending P-Value count in df_p.
traits = df_p.groupby('Trait').count().sort_values('P-Value', ascending=False).index

In [None]:
# First fifty trait names in that ordering.
traits[:50]

# Genes by Trait

Function to search traits

In [None]:
import difflib

In [None]:
# Fuzzy-search the trait names: up to 15 candidates with a similarity score of at least 0.4.
matches = difflib.get_close_matches('atherosclerosis', traits, n=15, cutoff=.4)
matches

In [None]:
def genes_by_trait(trait, data=None):
    """Return the set of gene names recorded for ``trait``.

    Parameters
    ----------
    trait : str
        Value matched exactly against the ``Trait`` column.
    data : pandas.DataFrame, optional
        Table with ``Trait``, ``Gene`` and ``Gene 2`` columns. Defaults to the
        notebook-level ``df_p`` frame.

    Returns
    -------
    set
        Distinct non-missing values found in the ``Gene`` and ``Gene 2``
        columns of the rows whose ``Trait`` equals ``trait``. Empty when no
        row matches.
    """
    if data is None:
        data = df_p
    rows = data[data['Trait'] == trait]
    # Missing gene fields are dropped: NaN is not a gene name, and keeping it
    # would make any two traits with a blank field appear to share a gene.
    return set(rows['Gene'].dropna()) | set(rows['Gene 2'].dropna())

In [None]:
# Size of the gene set returned for the 'Body Mass Index' trait.
len(genes_by_trait('Body Mass Index'))

In [None]:
# Traits to compare against one another.
list_1 = ['Blood Pressure', 'Stroke', 'Diabetes Mellitus','Diabetes Mellitus, Type 2','Diabetes Mellitus, Type 1', 'Myocardial Infarction', 'Atherosclerosis', 'Plaque, Atherosclerotic']

In [None]:
# Every ordered pair of traits from list_1, including each trait paired with itself.
factors_paired = [(i,j) for i in list_1 for j in list_1]

In [None]:
# Look up each trait's gene set once instead of re-filtering the table twice
# for every pair.
gene_sets = {}
for pair in factors_paired:
    for trait in pair:
        if trait not in gene_sets:
            gene_sets[trait] = genes_by_trait(trait)

# Number of shared genes for every (i, j) pair, in the order of factors_paired.
common_genes = [len(gene_sets[i] & gene_sets[j]) for i, j in factors_paired]

In [None]:
# Reshape the flat list of counts into a square array with one row and one column per trait.
common_genes = np.array(common_genes).reshape(len(list_1),len(list_1))

In [None]:
# Label the rows and columns with the trait names.
common_genes = pd.DataFrame(common_genes, index=list_1, columns=list_1)

In [None]:
# Display the labelled matrix.
common_genes

In [None]:
# Render the matrix as a table shaded column by column (axis=0).
# The Styler output is HTML, so no matplotlib figure is created here; a
# plt.figure call would only emit an empty figure.
common_genes.style.background_gradient(cmap='YlOrRd', axis=0)

In [None]:
# Genes returned for all three traits: Stroke, Diabetes Mellitus and Blood Pressure.
# NOTE: this rebinds common_genes to a set.
common_genes = genes_by_trait('Stroke').intersection(genes_by_trait('Diabetes Mellitus')).intersection(genes_by_trait('Blood Pressure'))
print(len(common_genes))
common_genes

In [None]:
# Fuzzy-search the trait names: up to 15 candidates with a similarity score of at least 0.4.
matches = difflib.get_close_matches('inflammatory bowel', traits, n=15, cutoff=.4)
matches

In [None]:
# Second group of traits to compare against one another.
list_2 = ['Multiple Sclerosis', 'Psoriasis', 'Lupus Erythematosus, Systemic', 'Crohn Disease', 'Inflammatory Bowel Diseases', 'Diabetes Mellitus, Type 1']

In [None]:
# Every ordered pair of traits from list_2, including each trait paired with itself.
factors_paired = [(i,j) for i in list_2 for j in list_2]

In [None]:
# Look up each trait's gene set once instead of re-filtering the table twice
# for every pair.
gene_sets = {}
for pair in factors_paired:
    for trait in pair:
        if trait not in gene_sets:
            gene_sets[trait] = genes_by_trait(trait)

# Number of shared genes for every (i, j) pair, in the order of factors_paired.
common_genes = [len(gene_sets[i] & gene_sets[j]) for i, j in factors_paired]

In [None]:
# Reshape the flat list of counts into a square array with one row and one column per trait.
common_genes = np.array(common_genes).reshape(len(list_2),len(list_2))

In [None]:
# Label the rows and columns with the trait names.
common_genes = pd.DataFrame(common_genes, index=list_2, columns=list_2)

In [None]:
# Display the labelled matrix.
common_genes

In [None]:
# Render the matrix as a table shaded column by column (axis=0).
# The Styler output is HTML, so no matplotlib figure is created here; a
# plt.figure call would only emit an empty figure.
common_genes.style.background_gradient(cmap='YlOrRd', axis=0)

Gene information from https://www.ensembl.org/index.html

In [None]:
import requests, sys
import pprint

In [None]:

# Query the Ensembl REST phenotype endpoint for the homo_sapiens gene GCKR.
server = "https://rest.ensembl.org"
ext = "/phenotype/gene/homo_sapiens/GCKR?include_associated=0"

# The timeout keeps the cell from hanging indefinitely if the server never answers.
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}, timeout=30)

# Raises requests.HTTPError on a 4xx/5xx reply and does nothing otherwise, so
# no separate r.ok check or sys.exit() is needed.
r.raise_for_status()

decoded = r.json()
pprint.pprint(decoded)