In [1]:
# Move into the local Proteome-tool checkout so that relative paths such as
# './raw/...' resolve against the repository root.
import os

# Look up the login name a single time and reuse it when building the path.
login_name = os.getlogin()

# NOTE(review): assumes a Windows profile under c:/Users/<login> with the
# repository cloned into Documents/GitHub - adjust if it lives elsewhere.
homedir = 'c:/Users/' + login_name + '/Documents/GitHub/Proteome-tool/'
os.chdir(homedir)

In [2]:
# import library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#from scipy.stats import poisson

# Load the protein-group table from the repository's raw/ folder.
df1 = pd.read_csv('./raw/PG_lncap-rwpe1_v1.csv', encoding='utf-8')

# Annotation columns removed before the numeric transforms below
# (np.log2 is applied to the whole remaining frame).
filt = ['Protein IDs', 'Protein names', 'Gene Names']

# Short labels for the three 'LNCaP/RWPE-1 normalized' columns.
cnames = {
    'LNCaP/RWPE-1 normalized 1st': '1st',
    'LNCaP/RWPE-1 normalized 2nd': '2nd',
    'LNCaP/RWPE-1 normalized 3rd': '3rd',
}

# Drop and rename in one chain; both calls return new frames, so df1 is
# left exactly as loaded.
new_df = df1.drop(columns=filt).rename(columns=cnames)

# log2-transformed copy of the working frame.
new_df_log2 = np.log2(new_df)

In [None]:
# Boxplot of new_df (values before any normalization in this notebook),
# drawn on an explicitly created 4 x 12 inch figure.
fig, ax = plt.subplots(figsize=(4, 12))
sns.boxplot(data=new_df, ax=ax)

# Axis labels and title
ax.set_xlabel("Batch", size=14)
ax.set_ylabel("DEP ratio", size=14)
ax.set_title("Boxplot of rawdata before Normalization", size=18)

plt.show()

In [None]:
# Kernel density estimate of every column in new_df (before normalization),
# drawn on an explicitly created 15 x 5 inch figure.
fig, ax = plt.subplots(figsize=(15, 5))
sns.kdeplot(data=new_df, ax=ax)

# Axis labels and title
ax.set_xlabel("Difference", size=14)
ax.set_ylabel("Density", size=14)
ax.set_title("density plot of rawdata before Normalization", size=18)

plt.show()

In [None]:
# Median normalization
# Reference given by the author: Huan et al. (2013) Nature Methods, 10(5), 390-393

# Divide every column by its own median.
column_medians = new_df.median(axis=0)
new_df_median = new_df.div(column_medians, axis=1)

# log2-transform the median-scaled values.
new_df_median_log2 = np.log2(new_df_median)

# Kernel density estimate of the normalized, log2-transformed columns on a
# 15 x 5 inch figure.
fig, ax = plt.subplots(figsize=(15, 5))
sns.kdeplot(data=new_df_median_log2, ax=ax)

# Axis labels and title
ax.set_xlabel("Difference", size=14)
ax.set_ylabel("Density", size=14)
ax.set_title("Density plot of Median-normalized", size=18)

plt.show()

In [None]:
# Shapiro-Wilk test, run separately on each column of the median-normalized
# log2 frame; one summary line is printed per column.
from scipy.stats import shapiro

tmp_df = new_df_median_log2

for col in tmp_df.columns:
    stat, p = shapiro(tmp_df[col].to_numpy())
    print(f"Column '{col}': statistic={stat:.4f}, p-value={p:.4f}")

In [3]:
# One-way ANOVA (n=3)
import scipy.stats as stats

# Annotation columns that are not part of the test.
filt = ['Protein names', 'Gene Names']

# Short labels for the three 'LNCaP/RWPE-1 normalized' columns.
cnames = {'LNCaP/RWPE-1 normalized 1st':'1st',
          'LNCaP/RWPE-1 normalized 2nd':'2nd',
          'LNCaP/RWPE-1 normalized 3rd':'3rd'}

# Build the test frame without modifying df1: set_index / drop / rename each
# return a new frame. df1 therefore keeps its 'Protein IDs' column, so this
# cell (and any cell that drops or indexes that column) can be re-run safely.
data = (
    df1.set_index('Protein IDs')
       .drop(columns=filt)
       .rename(columns=cnames)
)

# log2-transformed copy of the values.
# NOTE(review): data_log2 is not used by the test below, which runs on the
# untransformed values in `data` - confirm which scale is intended.
data_log2 = np.log2(data)

# Perform one-way ANOVA test across the three columns
# P-value > 0.05 : fail to reject H0 ; no evidence of a difference between groups
# P-value < 0.05 : reject H0 ; at least one group differs --> post hoc Tukey test
f_stat, p_value = stats.f_oneway(data['1st'], data['2nd'], data['3rd'])

# Display results
print("F-statistic: ", f_stat)
print("P-value: ", p_value)

F-statistic:  0.25722067001141075
P-value:  0.7732073561018324


In [None]:
# Quantile Normalization (Often used in microarray analysis)

# Sort each column independently, then average across columns at every
# sorted position.
sorted_values = np.sort(new_df.values, axis=0)
df_sorted = pd.DataFrame(sorted_values, index=new_df.index, columns=new_df.columns)
df_mean = df_sorted.mean(axis=1)

# Re-key the positional means by 1-based rank so they can be looked up by rank.
df_mean.index = np.arange(1, len(df_mean) + 1)

# In parallel, rank every value within its own column of the original frame
# (method='min': tied values share the lowest rank of the tie).
ranks = new_df.rank(method='min').astype(int)

# Critical step: go to long format, replace each rank with the mean stored
# for that rank, and pivot back to the original wide layout.
df_qn = ranks.stack().map(df_mean).unstack()
df_qn.head()

def quantile_normalize(df):
    """
    Quantile-normalize the columns of a numeric dataframe.

    Each column is sorted independently, the values at every sorted position
    are averaged across columns, and every original value is then replaced by
    the mean that belongs to its within-column rank.

    input: dataframe with numerical columns
    output: dataframe with quantile normalized values (same index and columns)

    Ties are ranked with method='min', so tied values within a column all
    receive the mean of the lowest tied rank.
    NOTE(review): missing values are not handled explicitly - confirm the
    input is NaN-free before relying on the result.
    """
    # Sort every column on its own so that row k holds the k-th smallest value
    # of each column; without this step the row means would be taken over the
    # raw rows instead of over rank positions.
    df_sorted = pd.DataFrame(np.sort(df.values, axis=0),
                             index=df.index, columns=df.columns)
    df_mean = df_sorted.mean(axis=1)

    # Key the rank means by 1-based rank to match DataFrame.rank output.
    df_mean.index = np.arange(1, len(df_mean) + 1)

    # Long format -> replace rank by its mean -> back to wide format.
    df_qn = df.rank(method='min').stack().astype(int).map(df_mean).unstack()
    return df_qn

In [None]:
# z-score normalize each column and run a Shapiro-Wilk normality test on '1st'
from scipy.stats import zscore, shapiro

# Compute the z-scores on the underlying array and re-wrap them with the
# original labels, so df_zscore is a DataFrame (and df_zscore['1st'] works)
# whether or not this SciPy version preserves the pandas container.
df_zscore = pd.DataFrame(
    zscore(a=new_df.to_numpy(), axis=0),
    index=new_df.index,
    columns=new_df.columns,
)
z1 = df_zscore['1st'].tolist()
shapiro(z1)

In [None]:
# Values of the '1st' column on the log2 scale, as a plain list.
# NOTE(review): this cell previously read from `df_log`, a name that is never
# defined in this notebook; new_df_log2 is assumed to be the intended frame -
# confirm.
l1 = new_df_log2['1st'].tolist()

In [None]:
# Shapiro-Wilk normality test on the values held in l1; the result is shown
# as the cell output.
from scipy.stats import shapiro
shapiro(l1)

In [None]:
# Histogram of z1 (the z-scored '1st' column)
import numpy as np
import matplotlib.pyplot as plt
import math

plt.style.use('_mpl-gallery')
px = 1/plt.rcParams['figure.dpi'] # size of one pixel in inches

# data
x = z1

# plot on a 600 x 300 pixel figure
fig, ax = plt.subplots(figsize=(600*px, 300*px)) # subplots(n, n, figsize)

ax.hist(x, bins=30, linewidth=0.3, edgecolor="black")
# np.arange excludes its stop value, so stop at 6 to get a tick at +5 and
# label both ends of the (-5, 5) axis range.
ax.set(xlim=(-5,5), xticks=np.arange(-5,6,step=1))
plt.show()

In [None]:
# Shapiro-Wilk normality test on x; the result is kept in `s` and shown as
# the cell output.
from scipy.stats import shapiro
s = shapiro(x)
s

In [None]:
# Shapiro-Wilk normality test on the log2-transformed '1st' column
import math
from scipy.stats import shapiro
import pandas as pd

# log2 of every value in the '1st' column, as a plain list
g1 = [math.log2(ratio) for ratio in new_df['1st'].tolist()]

s = shapiro(g1)
s

# Recorded result:
#ShapiroResult(statistic=0.9992809295654297, pvalue=0.9748974442481995)
# H0: the sample was drawn from a normal distribution.
# p-value < 0.05 -> reject H0 = not normal
# p-value > 0.05 -> fail to reject H0 = no evidence against normality