# Exploratory Data Analysis of Kaggle Breast Cancer Proteomes

## Data Preparation and Cleaning

#### This is a Cancer Proteome Dataset from Kaggle

#### Load Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
import seaborn as sns

# Global plot styling: seaborn dark grid, large bold Tahoma text and a
# fully transparent figure background (alpha channel 00).
sns.set_style('darkgrid')
plt.rcParams.update({
    'font.size': 22,
    'font.family': 'Tahoma',
    'font.weight': 'bold',
    'figure.figsize': (9, 5),
    'figure.facecolor': '#00000000',
})

#### EDA

In [None]:
# Load the raw iTRAQ proteome table; the first line of the file is the header row.
proteome = pd.read_csv(
    '../input/breastcancerproteomes/77_cancer_proteomes_CPTAC_itraq.csv',
    header=0,
)

In [None]:
# Peek at the first three rows of the raw table
proteome.head(3)

In [None]:
# (number of rows, number of columns) of the raw table
proteome.shape

In [None]:
# Extract the gene description columns for downstream analysis,
# indexed by RefSeq accession number.
gene_name_df = (
    proteome[['RefSeq_accession_number', 'gene_symbol', 'gene_name']]
    .set_index('RefSeq_accession_number')
)
gene_name_df.head()

In [None]:
# Count how many columns there are of each dtype.
# After reset_index() the first column holds the column names and the second
# their dtypes; they are relabelled so the group-by yields a "count" column.
data_type = proteome.dtypes.reset_index()
data_type.columns = ["count", "column_type"]
data_type.groupby("column_type").agg("count").reset_index()

### OBSERVATION
#### The dataset has 12553 rows and 86 columns. The rows are the 12553 proteins identified by iTRAQ (annotated here with their NCBI RefSeq database accession number and gene name). The columns are the RefSeq ID, gene symbol and gene name, followed by 83 samples with their identifiers.
#### Thus the format is rows as features and columns as samples
#### To make it biologically meaningful, we have to reshape the data to the conventional rows as samples and columns as features

In [None]:
# Keep the RefSeq accession numbers as a plain list; they are used as the
# column labels once the table is transposed.
proteins = proteome['RefSeq_accession_number'].tolist()

In [None]:
# dataset with numerical features only
# Everything from the 4th column onwards is kept; the first three columns are
# presumably the RefSeq ID, gene symbol and gene name — confirm column order.
proteome_samples=proteome.iloc[:,3:] 
proteome_samples.head()

In [None]:
# Transpose so that samples become rows and proteins become columns,
# then label the columns with the accession numbers saved earlier.
reshaped_proteome = proteome_samples.T
reshaped_proteome.columns = proteins
reshaped_proteome.head()

##### get NA values

In [None]:
# Histogram of the number of NA values per feature (protein).
# A histogram does not depend on the order of the values, so no sort is needed.
na_val = reshaped_proteome.isna().sum()
na_val.plot.hist(bins=20)
plt.xlabel('NA values', fontsize=22, weight='bold')
plt.ylabel('frequency', fontsize=22, weight='bold')
plt.show()

In [None]:
# get features with NA values greater than 8
# Displays two shapes: features with at least one NA value, and features with
# NA values in more than 8 samples.
na_val[na_val>0].shape,na_val[na_val>8].shape

### 4559 features (proteins) have missing data in some samples, causing NA values; of these, 3219 features have NA values in more than 8 samples. Such features, with missing values in more than 10% of the samples, will cause errors and have to be removed

In [None]:
# Names of the features that have NA values in more than 8 samples
na_val_index = na_val.loc[na_val > 8].index.tolist()

In [None]:
# Keep the columns (features) that are NOT in the list of features with NA
# values in more than 8 samples.
# A set makes each membership test O(1); scanning the list for every column
# was quadratic in the number of features.
_high_na_features = set(na_val_index)
comp_cases = [n for n in reshaped_proteome.columns if n not in _high_na_features]

In [None]:
# number of features (proteins) kept, i.e. with NA values in at most 8 samples
len(comp_cases)

In [None]:
# Restrict the dataset to the retained features (NA values in at most 8 samples)
fl_proteome = reshaped_proteome.loc[:, comp_cases]
fl_proteome.shape

In [None]:
# Impute every remaining NA with the mean of its own column (feature).
fl_proteome=fl_proteome.fillna(fl_proteome.mean()) # NA values filled with mean

In [None]:
# Expected to show False once the imputation above has run.
np.any(fl_proteome.isna()) # check if NA values present

## **no NA values, all features are filled with values**

# Scaling the data and removing outliers

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature with StandardScaler and wrap the resulting array
# back into a DataFrame that carries the original protein column labels.
scaler = StandardScaler()
scaled_proteome = pd.DataFrame(
    scaler.fit_transform(fl_proteome),
    columns=fl_proteome.columns,
)

In [None]:
# visualizing data before and after scaling
# Draw a reproducible sample of distinct feature positions. The upper bound is
# read from the scaled table instead of a hard-coded 9000, so the cell cannot
# index past the last column if the NA filter keeps fewer features, and it
# never picks a non-numeric column appended to fl_proteome later on.
rng=np.random.default_rng(42)
n_features=scaled_proteome.shape[1]
x=rng.choice(n_features,size=min(50,n_features),replace=False).tolist()
data1=fl_proteome.iloc[:,x]
data2=scaled_proteome.iloc[:,x]
fig,axs=plt.subplots(2,sharex=True,figsize=(25,8))
# top panel: raw values, bottom panel: standardised values of the same features
ag=sns.boxplot(data=data1,ax=axs[0])
ag.legend(['before scaling'],loc='lower right')
bg=sns.boxplot(data=data2,ax=axs[1])
bg.legend(['after scaling'],loc='lower right')
# tick labels are the column positions of the sampled features
bg.set_xticklabels(x)
plt.xticks(rotation=90)
ag.set_title('features chosen randomly')
plt.show()

In [None]:
# Flag features that contain outliers.
# z holds the absolute z-score of every value; z_bool is True for a column
# (feature) only when all of its samples lie below the threshold.
from scipy import stats

threshold = 3
z = np.abs(stats.zscore(scaled_proteome))
z_bool = (z < threshold).all(axis=0)

In [None]:
# Keep only the columns (features) for which z_bool is True, i.e. features
# with no value at or beyond the z-score threshold.
zscaled_proteome=scaled_proteome.loc[:,z_bool]

zscaled_proteome.shape # dataset after scaling and removing outliers


In [None]:
# Names of the features that survived the outlier filter
zscaled_proteome_lst = list(zscaled_proteome.columns)

The last 3 samples are healthy tissue. We have created a column 'Type' and labelled cancer tissue
as Cancer and healthy tissue as Healthy

In [None]:
# Label every sample as Cancer, then overwrite the last three rows as Healthy.
# The positional index -1 relies on 'Type' being the last column.
# NOTE(review): this mutates fl_proteome in place; re-running the earlier
# imputation/scaling cells afterwards will see the non-numeric 'Type' column.
fl_proteome['Type']="Cancer"
fl_proteome.iloc[-3:,-1]="Healthy"

### Cancer samples type

In [None]:
# 'Type' label (last column) of the first three samples
fl_proteome.iloc[:3,-1]

### Healthy samples type

In [None]:
# 'Type' label (last column) of the last three samples
fl_proteome.iloc[-3:,-1]

### Distribution of samples in the dataset

In [None]:
# Bar chart of the number of samples per class
type_counts = fl_proteome['Type'].value_counts()
type_counts.plot.bar(figsize=(4, 3), rot=0)
plt.show()

### There is an imbalance in the 2 classes in this dataset

### Only 3 of the 83 samples are healthy tissue, so comparisons between the two classes should be interpreted with caution

### The distribution of fold change in proteins in the cancer samples

In [None]:
# we are choosing only cancer samples
# All rows except the last three, and all columns except the last one ('Type').
cancer_proteome=fl_proteome.iloc[:-3,:-1]
cancer_proteome.shape

In [None]:
# Per-protein mean over the cancer samples, as a plain Python list
cancer_mean_list = cancer_proteome.mean().tolist()

In [None]:
# Per-protein mean and standard deviation over the cancer samples
cmean, cstd = cancer_proteome.mean(), cancer_proteome.std()

In [None]:
# Name the Series so they become labelled columns when concatenated later
cmean = cmean.rename('mean_cancer')
cstd = cstd.rename('std_cancer')

In [None]:
# Mean and median of the per-protein means across the cancer samples.
# (The printed label previously misspelled "median" as "medain".)
print(f'mean cancer dataset {cmean.mean()}, median cancer dataset {cmean.median()}')

In [None]:
# Distribution of the per-protein means in the cancer samples
fig = plt.figure(figsize=(8, 3))
sns.distplot(cancer_mean_list, hist=True, axlabel="Fold Change")
plt.ylabel('Density', fontsize=22, weight='bold')
plt.xlabel('Fold Change', fontsize=22, weight='bold')
plt.legend(['cancer'])
plt.show()

### Observation
### The distribution appears normal with negative skew with mean less than median

### The distribution of fold change in proteins in the healthy samples

In [None]:
# we are choosing healthy samples in the dataset
# The last three rows, with the last column ('Type') dropped.
Healthy_proteome=fl_proteome.iloc[-3:,:-1]
Healthy_proteome.shape

In [None]:
# Per-protein mean over the healthy samples, as a plain Python list
healthy_mean_list = Healthy_proteome.mean().tolist()

In [None]:
# Per-protein mean and standard deviation over the healthy samples
hmean, hstd = Healthy_proteome.mean(), Healthy_proteome.std()

In [None]:
# Name the Series so they become labelled columns when concatenated later
hmean = hmean.rename('mean_healthy')
hstd = hstd.rename('std_healthy')

In [None]:
# mean and median of the healthy samples
# (both statistics are taken over the per-protein means held in hmean)
print(f'mean healthy dataset {hmean.mean()}, median healthy dataset {hmean.median()}')

In [None]:
# Distribution of the per-protein means in the healthy samples
fig = plt.figure(figsize=(8, 3))
sns.distplot(healthy_mean_list, hist=True, axlabel="Fold Change")
plt.ylabel('Density', fontsize=22, weight='bold')
plt.xlabel('Fold Change', fontsize=22, weight='bold')
plt.legend(['healthy'])
plt.show()

### Observation
### The distribution appears as Normal distribution with mean greater than median

### superimposing both the cancer and healthy sample expression levels

In [None]:
# Overlay the cancer and healthy distributions on one set of axes
fig = plt.figure(figsize=(10, 4))
g = sns.distplot(cancer_mean_list, hist=True, axlabel="Fold Change")
h = sns.distplot(healthy_mean_list, hist=True, axlabel="Fold Change")
plt.ylabel('Density', fontsize=22, weight='bold')
plt.xlabel('Fold Change', fontsize=22, weight='bold')
plt.legend(['cancer', 'healthy'])
plt.show()

### Observation
### There is a shift in the abundance of proteins in cancer samples and a remarkable uniformity in their abundance, as compared to normal samples. It would be worthwhile to see proteins overexpressed in cancer as compared to healthy samples. iTRAQ measures protein abundance. The levels of protein can fall by proteolytic  processing and/or lower expression. The levels will rise due to increased expression

In [None]:
# Join the gene annotations with the per-protein cancer/healthy mean and std.
# join='inner' keeps only the accession numbers present in every input.
proteome_foldchange_df=pd.concat([gene_name_df,cmean,cstd,hmean,hstd],axis=1,join='inner')

In [None]:
# log10 of the absolute coefficient of variation (std / mean) per protein,
# computed separately for the cancer and the healthy samples
proteome_foldchange_df['log_coeffvar_cncr'] = np.log10(
    (proteome_foldchange_df['std_cancer'] / proteome_foldchange_df['mean_cancer']).abs()
)
proteome_foldchange_df['log_coeffvar_hlth'] = np.log10(
    (proteome_foldchange_df['std_healthy'] / proteome_foldchange_df['mean_healthy']).abs()
)

In [None]:
# Display the combined table.
# NOTE(review): this renders the whole frame; .head() would keep the output short.
proteome_foldchange_df

### The changes in cancer would be due to overexpression or repression of genes, and this will be reflected in the levels of the protein. Since absolute levels are not given, we cannot compare the levels of protein in cancer and healthy samples directly. To compare cancer and healthy samples, we shall use the concept of housekeeping genes. The expression of these genes is fairly constant and the proteins are present at constant levels. Prominently, GAPDH, beta actin and beta tubulin are also used as loading controls in western blots. A paper also cites that ribosome protein levels are fairly constant (Geiger T, Wehner A, Schaab C, Cox J, Mann M.  Mol Cell Proteomics. 2012 Mar;11(3))

## Housekeeping Genes

In [None]:
# RefSeq accession numbers of the housekeeping proteins examined below
housekeeping = [
    'NP_000968',
    'NP_000966',
    'NP_001013',
    'NP_001017963',
    'NP_057004',
    'NP_821133',
    'NP_001092',
    'NP_002037',
    'NP_000960',
]

### List of housekeeping proteins

In [None]:
# Rows of the combined table for the housekeeping proteins
proteome_foldchange_df.loc[housekeeping]

In [None]:
# Display names used as tick labels for the housekeeping proteins,
# listed in the same order as `housekeeping`
house_names = [
    'RPL13',
    'RPL11',
    'RPS19',
    'HSP90',
    'RPS27',
    'TUBB',
    'ACTB',
    'GAPDH',
    'RPL5',
]

In [None]:
# Housekeeping protein columns for the cancer samples
housekeeping_cancer = cancer_proteome.loc[:, housekeeping]

In [None]:
# Housekeeping protein columns for the healthy samples
housekeeping_healthy = Healthy_proteome.loc[:, housekeeping]

In [None]:
# Stacked boxplots of the housekeeping proteins: cancer on top, healthy below
fig, axs = plt.subplots(2, sharex=True, figsize=(16, 8))

ghk = sns.boxplot(data=housekeeping_cancer, ax=axs[0])
ghk.legend(["cancer"], loc='lower right')
ghk.set_title('Fold Change of Housekeeping Genes')

hhk = sns.boxplot(data=housekeeping_healthy, ax=axs[1])
hhk.legend(["healthy"], loc='lower right')
hhk.set_ylim(-3, 3)
hhk.set_xticklabels(house_names, rotation=90)

# applied last, so the tick labels end up rotated by 60 degrees
plt.xticks(rotation=60)
plt.show()

## Observation
### Ribosomal protein L11 (RPL11) and beta tubulin (TUBB) are suitable references for studying cancer-related protein changes. We shall use beta tubulin, as this lets us correlate our study with previous experimental studies

### proteins with fold change greater than beta-tubulin 

In [None]:
# Express every protein relative to beta-tubulin (TUBB, accession NP_821133 in
# the housekeeping list), the reference protein chosen above.
# The reference means are read from the table rather than typed in by hand
# (previously 0.257342 for cancer and -0.463977 for healthy), so they stay
# correct if the filtering or imputation upstream changes.
# NOTE(review): the old constants are assumed to be these same table values
# rounded to 6 decimals — confirm against the housekeeping table.
# float() fails loudly if the accession is not unique in the index.
tubb_mean_cancer=float(proteome_foldchange_df.loc['NP_821133','mean_cancer'])
tubb_mean_healthy=float(proteome_foldchange_df.loc['NP_821133','mean_healthy'])
proteome_foldchange_df['corrected_cancer']=proteome_foldchange_df.mean_cancer-tubb_mean_cancer
proteome_foldchange_df['corrected_health']=proteome_foldchange_df.mean_healthy-tubb_mean_healthy

In [None]:
# Difference between the corrected cancer and corrected healthy means
proteome_foldchange_df['foldchange_cancer'] = proteome_foldchange_df['corrected_cancer'].sub(
    proteome_foldchange_df['corrected_health']
)

In [None]:
# Sum of the two log coefficients of variation (cancer + healthy)
proteome_foldchange_df['composite_coeff'] = proteome_foldchange_df['log_coeffvar_cncr'].add(
    proteome_foldchange_df['log_coeffvar_hlth']
)

In [None]:
# Working table for the fold-change analysis.
# An explicit copy is taken because new columns are assigned to this frame
# later; that avoids pandas' SettingWithCopyWarning and makes it clear that
# proteome_foldchange_df is not modified.
cancer_altered_df=proteome_foldchange_df.loc[:,['gene_name','corrected_cancer','corrected_health','foldchange_cancer','composite_coeff']].copy()

In [None]:
# Scatter of fold change (x) against composite variation (y) for every protein.
# Note: `fig` is first a Figure and is then rebound to the Axes returned by
# seaborn, so the set_* calls below act on the Axes.
fig=plt.figure()
fig=sns.scatterplot(data=cancer_altered_df, x='foldchange_cancer',y='composite_coeff')
fig.set_ylim(-2,5)
fig.set_xlabel('Fold change cancer',fontsize=22,weight='bold')
fig.set_ylabel('composite variation',fontsize=22,weight='bold')
plt.show()

In [None]:
# test for normality of 'foldchange' in cancer_altered_df
from scipy import stats
# kstest(..., 'norm') compares against the *standard* normal N(0, 1) unless
# loc/scale are supplied, so the sample mean and standard deviation are passed
# via `args`. The default two-sided alternative is used so that any departure
# from normality is detected, not just one direction.
# NOTE(review): estimating loc/scale from the same sample makes the KS p-value
# conservative (Lilliefors effect) — treat the result as indicative only.
_fc_values=cancer_altered_df.foldchange_cancer
stats.kstest(_fc_values,'norm',args=(_fc_values.mean(),_fc_values.std()))

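### If the test does not reject the null hypothesis, the foldchange values can be treated as normally distributed; the z-score and p-value calculations below rely on this assumption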

In [None]:
# calculate z-score  for foldchange 
# The absolute value is stored, so the direction of the change is not kept in
# this column.
cancer_altered_df['zscore_foldchange']=np.abs(stats.zscore(cancer_altered_df.foldchange_cancer))

In [None]:
# calculate p-value for foldchange
from scipy.special import ndtr as ndtr
# Upper-tail probability P(Z > |z|) of the standard normal.
# ndtr(-z) is mathematically equal to 1 - ndtr(z) but avoids the loss of
# precision from subtracting a number close to 1 for large |z|.
cancer_altered_df['pvalue_foldchange']=ndtr(-cancer_altered_df.zscore_foldchange)

In [None]:
# First rows of the working table, including the new z-score and p-value columns
cancer_altered_df.head()

In [None]:
# Proteins with statistically significant fold change.
# The p-value is a one-sided tail of |z|, so the 0.025 cut-off corresponds to
# a two-sided 5% level.
significant_fold_change = cancer_altered_df.loc[cancer_altered_df['pvalue_foldchange'] < 0.025]

In [None]:
# (number of significant proteins, number of columns)
significant_fold_change.shape

In [None]:
# Ten significant proteins with the largest fold change among those higher in
# cancer than in healthy samples (upregulated in cancer)
(
    significant_fold_change
    .loc[lambda df: df['corrected_cancer'] > df['corrected_health']]
    .sort_values(by='foldchange_cancer', ascending=False)
    .head(10)
)

In [None]:
# Ten significant proteins with the smallest fold change among those lower in
# cancer than in healthy samples (downregulated in cancer)
(
    significant_fold_change
    .loc[lambda df: df['corrected_cancer'] < df['corrected_health']]
    .sort_values(by='foldchange_cancer')
    .head(10)
)

In [None]:
# Install plotly from the plotly conda channel without prompting; it is
# imported for the interactive figure in the following cell.
# NOTE(review): the install is unpinned and runs mid-notebook — consider
# pinning a version and moving it to the top of the notebook.
!conda install -c plotly plotly --yes

In [None]:
import plotly.graph_objects as go

# Interactive scatter of the significantly altered proteins: fold change on x,
# composite variation on y, points coloured by the corrected cancer mean,
# gene name shown on hover.
fig = go.Figure(
    data=go.Scatter(
        x=significant_fold_change['foldchange_cancer'],
        y=significant_fold_change['composite_coeff'],
        mode='markers',
        marker=dict(color=significant_fold_change['corrected_cancer']),
        text=significant_fold_change['gene_name'],
    )
)
fig.update_layout(title='Altered proteins in Breast Cancer')
fig.update_yaxes(range=[-5, 5])
fig.show()

In [None]:
import re
# All gene names in the working table, as a plain list
names = cancer_altered_df['gene_name'].tolist()

In [None]:
def rep_gene(name, pattern='^brc+'):
    """Return ``name`` if it matches ``pattern`` (case-insensitive), else ``None``.

    The return value is truthy only for matching names, so the function can be
    used directly as a ``filter`` predicate.

    Parameters
    ----------
    name : str
        Gene name to test. Non-string values (for example a NaN from a missing
        gene name) are reported with ``'wrong input'`` and treated as no match.
    pattern : str, optional
        Regular expression searched for in ``name``. The default ``'^brc+'``
        matches names that start with "br" followed by one or more "c".

    Returns
    -------
    str or None
        ``name`` when the pattern is found, otherwise ``None``.
    """
    # Only a non-string input is an expected failure; checking for it
    # explicitly replaces the bare ``except`` that hid every other error.
    if not isinstance(name, str):
        print('wrong input')
        return None
    if re.search(pattern, name, re.IGNORECASE):
        return name
    return None

In [None]:
# Gene names for which rep_gene returns a truthy value
x = [gene for gene in names if rep_gene(gene)]

In [None]:
# Rows of the working table whose gene name is in the filtered list x
cancer_altered_df[cancer_altered_df.gene_name.isin(x)]