<a href="https://colab.research.google.com/github/sajadamouei/classification-metabolomics/blob/main/correlation_metabolomics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# datasets information
import pandas as pd
import requests
from io import BytesIO
import os

# Download the metabolomics workbook from GitHub (URL pinned to a specific commit),
# save a local copy, and print a per-sheet summary.
url = "https://github.com/sajadamouei/classification-metabolomics/blob/febca0b56691c042aa438a2cd6cd3bba12b1d41b/data/metabol_data.xlsx?raw=true"

# Without a timeout, requests waits forever on a stalled connection and the
# cell never finishes; fail fast with an exception instead.
REQUEST_TIMEOUT_SECONDS = 30

# send a request to the URL
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# check if the request was successful
if response.status_code == 200:

    # parse the workbook straight from memory so every sheet can be inspected
    excel_file = pd.ExcelFile(BytesIO(response.content))

    # write the raw bytes to the current working directory
    file_name = "metabol_data.xlsx"
    with open(file_name, "wb") as file:
        file.write(response.content)

    # get the absolute path of the saved dataset (reported at the end)
    file_path = os.path.abspath(file_name)

    # iterate through each sheet
    for sheet_name in excel_file.sheet_names:

        df = pd.read_excel(excel_file, sheet_name=sheet_name)

        # print sheet name
        print(f"Sheet Name: {sheet_name}")

        # one row per sample
        num_samples = df.shape[0]
        print(f"Number of Samples: {num_samples}")

        # Every column except two is counted as a metabolite.
        # NOTE(review): presumably the two excluded columns are 'nu' and 'class'
        # (the columns dropped before the correlation analysis) - confirm this
        # holds for every sheet.
        num_metabolites = df.shape[1] - 2
        print(f"Number of Metabolites: {num_metabolites}")

        # print how many samples fall into each value of the 'class' column
        print("Summary of 'classes' column:")
        print(df['class'].value_counts())

        print("\n" + "="*40 + "\n")

    # print the dataset's address
    print("The metabolomics data set has been downloaded from the Github repository to the address:", file_path)

else:
    print("Failed to download the file. Status code:", response.status_code)


Sheet Name: M1 Compounds ESI+
Number of Samples: 81
Number of Metabolites: 1922
Summary of 'classes' column:
class
1    54
0    27
Name: count, dtype: int64


Sheet Name: M1 Compounds ESI-
Number of Samples: 81
Number of Metabolites: 939
Summary of 'classes' column:
class
1    54
0    27
Name: count, dtype: int64


Sheet Name: M1 Compounds ESI+ and ESI-
Number of Samples: 81
Number of Metabolites: 2861
Summary of 'classes' column:
class
1    54
0    27
Name: count, dtype: int64


The metabolomics data set has been downloaded from the Github repository to the address: /content/metabol_data.xlsx


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data set. The download cell saves the workbook under this file name
# in the current working directory (which is /content on Colab), so a relative
# path keeps this cell runnable outside Colab as well.
file_path = 'metabol_data.xlsx'
xls = pd.ExcelFile(file_path)

# Read the sheet at index 2 and drop the 'nu' and 'class' columns so that only
# the remaining (feature) columns enter the correlation analysis.
df = pd.read_excel(xls, sheet_name=2).drop(columns=['nu', 'class'])

# Remove columns whose values exactly duplicate an earlier column
df = df.loc[:, ~df.T.duplicated()]

# Calculate the correlation matrix
corr_matrix = df.corr()

# Extract the pairwise correlations, excluding the diagonal (self-correlation).
# The diagonal is excluded by POSITION (strict upper triangle, k=1) rather than
# by comparing values with 1: floating-point rounding can make a diagonal entry
# differ slightly from 1.0 (so a value filter lets it through), and a value
# filter would also discard genuine off-diagonal pairs that equal exactly 1.0.
# The matrix is symmetric, so the upper triangle holds each pair exactly once.
upper_rows, upper_cols = np.triu_indices(corr_matrix.shape[0], k=1)
corr_values = corr_matrix.to_numpy()[upper_rows, upper_cols]

# Undefined correlations (NaN, e.g. from a constant column) would turn the
# NumPy statistics below into NaN while the pandas median silently skips them;
# remove them so every statistic is computed over the same set of values.
corr_values = corr_values[~np.isnan(corr_values)]

# Calculate statistics
mean_corr = corr_values.mean()
median_corr = pd.Series(corr_values).median()
max_corr = corr_values.max()
min_corr = corr_values.min()
std_corr = corr_values.std()  # population standard deviation (NumPy default ddof=0)

# Print summary statistics
print(f"Correlation Statistics:\n"
      f"Mean: {mean_corr:.4f}\n"
      f"Median: {median_corr:.4f}\n"
      f"Standard Deviation: {std_corr:.4f}\n"
      f"Minimum: {min_corr:.4f}\n"
      f"Maximum: {max_corr:.4f}\n")

Correlation Statistics:
Mean: 0.0937
Median: 0.0758
Standard Deviation: 0.2214
Minimum: -0.8700
Maximum: 1.0000

