In [None]:
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, chisquare, fisher_exact
from sklearn import preprocessing

In [None]:
# Load the tab-separated input table into `df`.
df = pd.read_csv(
    "/kaggle/input/opentargets-drug-subcellularlocations/all_combined_data.tsv",
    delimiter="\t",
)

In [None]:
# Show the loaded table as the cell output.
df

In [None]:
# One scatter panel per subcellular location label: drug type on x, action type on y.
(
    sns.FacetGrid(data=df, col="subcellular_location_label")
    .map(sns.scatterplot, "drugType", "actionType", alpha=0.7)
)

plt.show()


In [None]:
# Number of rows in `df` for every observed (drug type, location label) pair, as a long table.
drug_location_counts = (
    df.groupby(["drugType", "subcellular_location_label"])
    .size()
    .reset_index(name="count")
)

In [None]:
# Log2-transform the pair counts into a new "log_count" column.
drug_location_counts["log_count"] = np.log2(drug_location_counts["count"].to_numpy())

In [None]:

# Distribution of Antibody counts across subcellular locations, with Cytoskeleton marked.
# The count table is built here from `df`, so this cell does not depend on a crosstab
# that is only created further down the notebook.
antibody_data = pd.crosstab(df["subcellular_location_label"], df["drugType"])["Antibody"]
antibody_counts = antibody_data.to_numpy()  # one count per location

fig, ax = plt.subplots()

# A single violin over the per-location counts. seaborn's violinplot has no `showmeans`
# option (that belongs to matplotlib's Axes.violinplot); the mean is drawn as a line instead.
sns.violinplot(y=antibody_counts, color=sns.color_palette("Set2")[0], inner="box", ax=ax)

# Mean of the per-location counts
ax.axhline(
    y=antibody_counts.mean(), color="red", linestyle="--", label="Mean Antibody Count"
)

# Mark the Cytoskeleton count; x=0 is the position of the single violin.
ax.scatter(
    [0],
    [antibody_data["Cytoskeleton"]],
    color="blue",
    s=64,
    zorder=3,  # draw on top of the violin body
    label="Cytoskeleton",
)

ax.set_xlabel("Subcellular Location")
ax.set_ylabel("Antibody Count")
ax.set_title("Distribution of Antibody Counts Across Locations")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Kernel density estimate of the log2 pair counts, one curve per drug type.
sns.displot(
    data=drug_location_counts,
    x="log_count",
    hue="drugType",
    kind="kde",
)
plt.show()


In [None]:
# Count table: rows are `subcellular_location` values, columns are drug types.
contingency_table_locations = pd.crosstab(
    index=df["subcellular_location"],
    columns=df["drugType"],
)

In [None]:
# Count table: rows are `subcellular_location_label` values, columns are drug types.
contingency_table_labels = pd.crosstab(
    index=df["subcellular_location_label"],
    columns=df["drugType"],
)

In [None]:
# Show the location-label x drug-type count table.
contingency_table_labels

In [None]:
# Per-drug-type percentages: each column of the count table rescaled by its own total.
# Computed inline because `df_percentages_drugtypes` is only assigned in a later cell;
# displaying that bare name here raises NameError on a fresh kernel.
contingency_table_labels.mul(100).div(contingency_table_labels.sum(axis=0), axis="columns")

In [None]:
## Does the Antibody drug type bind significantly to the Cytoskeleton?
# All tests below run on raw counts. Chi-square statistics scale with the sample size, so
# feeding them percentages (which always total 100) does not give usable p-values.
cyto_counts = contingency_table_labels.loc["Cytoskeleton"]  # counts per drug type
antibody_location_counts = contingency_table_labels["Antibody"]  # counts per location

# 2x2 table: (Cytoskeleton vs. other locations) x (Antibody vs. other drug types).
# Passing only the single Cytoskeleton row to chi2_contingency leaves 0 degrees of freedom,
# so that call cannot test anything.
cyto_antibody = contingency_table_labels.loc["Cytoskeleton", "Antibody"]
table_total = contingency_table_labels.to_numpy().sum()
antibody_vs_cyto = np.array([
    [cyto_antibody, cyto_counts.sum() - cyto_antibody],
    [
        antibody_location_counts.sum() - cyto_antibody,
        table_total - cyto_counts.sum() - antibody_location_counts.sum() + cyto_antibody,
    ],
])
print(chi2_contingency(antibody_vs_cyto))

## H_null: Cytoskeleton counts are spread evenly over all drug types.
## chisquare's default expectation is uniform, so no category count is hard-coded.
statistic, pvalue = chisquare(cyto_counts)

## H_null: Antibody counts are spread evenly over all locations.
## NOTE(review): results noted earlier for these two tests were computed on percentages
## and are not comparable with the count-based statistics.
chisquare(antibody_location_counts)



In [None]:
# Count for the (Cytoskeleton, Antibody) cell.
contingency_table_labels.loc["Cytoskeleton", "Antibody"]

In [None]:
# Antibody count at the Cytoskeleton.
antibody_cyto = contingency_table_labels.loc["Cytoskeleton", "Antibody"]

In [None]:
# Antibody count over every location except the Cytoskeleton.
antibody_not_cyto = contingency_table_labels.loc[:, "Antibody"].sum() - antibody_cyto

In [None]:
# Cytoskeleton count over all non-Antibody drug types. Written as an expression because
# `not_antibody_cyto` is first assigned in the following cell; showing the bare name here
# raises NameError on a fresh kernel.
contingency_table_labels.loc["Cytoskeleton"].sum() - antibody_cyto

In [None]:
# Cytoskeleton count over all drug types other than Antibody.
not_antibody_cyto = contingency_table_labels.loc["Cytoskeleton", :].sum() - antibody_cyto

In [None]:
# Everything that is neither Antibody nor Cytoskeleton: grand total minus the other three cells.
not_antibody_not_cyto = contingency_table_labels.to_numpy().sum() - (
    not_antibody_cyto + antibody_cyto + antibody_not_cyto
)

In [None]:
# Grand total of the count table minus the (Cytoskeleton, Antibody) cell.
contingency_table_labels.to_numpy().sum() - antibody_cyto

In [None]:
# Function to calculate significance for a specific drug-location pair
def test_significance(data, column_name, row_name):
    """
    This function tests the significance of the association between a specific drug type and a subcellular location using Fisher's exact test.

    Args:
      data: cross tab with rows indexed 
      column_name: column_name to analyze.
      row_name: row_name location to analyze.

    Returns:
      A tuple containing the p-value and odds ratio from the Fisher's exact test.
    """
    from scipy.stats import fisher_exact

    # Get contingency table for drug type vs location
    contingency_table = [[0, 0], [0, 0]]
    row_column = data[column_name][row_name]
    not_row_all_column = data[column_name].sum() - row_column
    all_row_not_column = data.loc[row_name].sum() - row_column
    not_row_not_column = data.sum().sum() - not_row_all_column - all_row_not_column + row_column
    contingency_table[0][0] = row_column ## column+row sum
    contingency_table[1][0] = all_row_not_column ## row - column
    contingency_table[0][1] = not_row_all_column
    contingency_table[1][1] = not_row_not_column

    # Perform Fisher's exact test
    odds_ratio, p_value = fisher_exact(contingency_table)

    return p_value, odds_ratio

In [None]:
# Fisher's exact test for the Antibody / Cytoskeleton pair, followed by a short report.
drug_type, location = "Antibody", "Cytoskeleton"
p_value, odds_ratio = test_significance(contingency_table_labels.copy(), drug_type, location)

# Report the raw numbers
print(
    f"Significance of {drug_type} binding to {location}:",
    f"- p-value: {p_value:.4f}",
    f"- Odds ratio: {odds_ratio:.4f}",
    sep="\n",
)

# Verdict at the 0.05 level (adjust to your own significance level)
if not p_value < 0.05:
  print(f"There is not enough evidence to conclude that {drug_type} binding to {location} is statistically significant.")
else:
  print(f"{drug_type} binding to {location} is statistically significant.")

In [None]:
# Remove the "Unknown" drug-type column. errors="ignore" keeps the cell re-runnable:
# a second plain drop of the same column raises KeyError.
contingency_table_labels = contingency_table_labels.drop(columns="Unknown", errors="ignore")

In [None]:
## Normalize the counts
import pandas as pd
from sklearn import preprocessing

# Min-max scale the count table column by column and restore the row/column labels.
x = contingency_table_labels.to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit(x).transform(x)
df_normalized = pd.DataFrame(
    data=x_scaled,
    columns=contingency_table_labels.columns,
    index=contingency_table_labels.index,
)

In [None]:
# Z-score every column: subtract the column mean, divide by the column standard deviation.
df_normalized = contingency_table_labels.transform(
    lambda col: (col - col.mean()) / col.std()
)

In [None]:
# Rescale every column to the [0, 1] range using its own minimum and maximum.
df_normalized = (contingency_table_labels - contingency_table_labels.min()) / (
    contingency_table_labels.max() - contingency_table_labels.min()
)

In [None]:
# Show the normalized table produced by the most recently run normalization cell.
df_normalized

In [None]:
# Column-wise percentages: each drug-type column is rescaled by its own total.
df_percentages_drugtypes = contingency_table_labels.mul(100).div(
    contingency_table_labels.sum(axis=0), axis="columns"
)

In [None]:
# Row-wise percentages: each location row is rescaled by its own total.
df_percentages_loclabels = contingency_table_labels.mul(100).div(
    contingency_table_labels.sum(axis=1), axis="index"
)

In [None]:
# Expected counts if, within each location, the location's total were split evenly
# across all drug types. Result has one row per location and one column per drug type.
grand_total = contingency_table_labels.to_numpy().sum()
expected_array = np.repeat(
    contingency_table_labels.sum(axis=1).to_numpy()[:, None] / len(contingency_table_labels.columns),
    len(contingency_table_labels.columns),
    axis=1,
)
# NOTE(review): expectations under independence (row total * column total / grand_total)
# are returned by chi2_contingency as `expected_freq`; confirm which null model is intended.

In [None]:
# Chi-square test of independence between location and drug type.
# Runs on the count table: `df_percentages` is not assigned in any visible cell, and the
# test is defined for observed frequencies rather than percentages.
chi2_contingency(contingency_table_labels)

In [None]:
from scipy.stats import chisquare

In [None]:
# Per-location percentages for the Antibody column. Reads `df_percentages_drugtypes`
# (column-wise percentages); the bare name `df_percentages` is not assigned in any visible cell.
df_percentages_drugtypes["Antibody"]

In [None]:
# Observed counts of the first column of the count table (selected by position;
# presumably "Antibody" - verify the column order) and an empty placeholder for expectations.
antibody_obs = contingency_table_labels.iloc[:, 0]
antibody_exp = np.empty(0)

In [None]:
# 25 copies of the first column's total divided by 25 (a uniform expectation vector).
np.full(25, df_normalized.iloc[:, 0].sum() / 25)

In [None]:
from scipy.stats import chisquare
# Goodness-of-fit per drug type: are its counts spread evenly over all locations?
# Uses the raw count table (chi-square needs frequencies, not percentages) and chisquare's
# default uniform expectation, so the number of locations is not hard-coded.
for drugtype in contingency_table_labels.columns:
    print(drugtype)
    statistic, p_value = chisquare(contingency_table_labels[drugtype])
    print(f"significant: {p_value < 0.05}")

In [None]:
from scipy.stats import chisquare
# Goodness-of-fit per location: are its counts spread evenly over all drug types?
# Uses the raw count table (chi-square needs frequencies, not percentages) and chisquare's
# default uniform expectation, so the number of drug types is not hard-coded.
for location in contingency_table_labels.index:
    print(location)
    statistic, p_value = chisquare(contingency_table_labels.loc[location])
    print(f"significant: {p_value < 0.05}")

In [None]:
# Critical chi-square value at the 5% level for the location x drug-type count table.
# An r x c contingency table has (r - 1) * (c - 1) degrees of freedom.
critical_value_labels = stats.chi2.ppf(
    q=0.95,
    df=(contingency_table_labels.shape[0] - 1) * (contingency_table_labels.shape[1] - 1),
)

In [None]:
# Show the critical chi-square value computed in the previous cell.
critical_value_labels

In [None]:
# Chi-square test of independence on the location x drug-type count table.
# Outcome noted when this was run: significant, i.e. there is an association between
# locations and drug types; statistic > critical_value_labels, so the null hypothesis
# can be rejected.
chi2_contingency(observed=contingency_table_labels, correction=True)

In [None]:
import pandas as pd
import numpy as np

# Pairwise Spearman rank correlation between the columns of the count table.
contingency_table_labels.corr(method='spearman')


In [None]:
# Total of the "count" column of the pair-count table.
all_counts = drug_location_counts.loc[:, "count"].sum()

In [None]:
# Distinct drug types present in the pair-count table.
drug_location_counts["drugType"].unique() #--> 9 categories

In [None]:
# Number of distinct location labels present in the pair-count table.
drug_location_counts["subcellular_location_label"].nunique() #-> 25 categories

In [None]:
# Expected count per (drug type, location) cell if the total were spread evenly over the
# full grid. The grid size is derived from the data rather than fixed at 9 * 25.
expected_counts = np.array(
    drug_location_counts["count"].sum()
    / (
        drug_location_counts["drugType"].nunique()
        * drug_location_counts["subcellular_location_label"].nunique()
    ),
    dtype="float64",
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of Antibody counts per subcellular location, with the Cytoskeleton cell highlighted.
# Reads the location x drug-type count table directly and leaves the loaded `df` untouched.


# Function to calculate percentages for each drug type
def calculate_percentages(data):
  """Return column-wise percentages of a count table.

  Args:
      data: DataFrame of counts, or anything `pd.DataFrame` accepts.

  Returns:
      DataFrame of the same shape in which every value is expressed as a
      percentage of its column total (NaN where a column total is 0).
  """
  counts = pd.DataFrame(data)
  return counts.mul(100).div(counts.sum(axis=0), axis="columns")


# Raw counts; use calculate_percentages(contingency_table_labels)["Antibody"] for percentages.
antibody_data = contingency_table_labels["Antibody"]
location_subset = list(antibody_data.index)

# 1 x n_locations frame so the heatmap is a single row with one cell per location.
antibody_counts = antibody_data.to_frame().T

# Create the heatmap
plt.figure(figsize=(10, 6))
heatmap = sns.heatmap(antibody_counts, cmap="YlGnBu", annot=True, fmt=".1f")

# Highlight the Cytoskeleton annotation. With annot=True the Axes holds one Text per cell
# in row-major order, so for a single-row heatmap the column position is the text index.
cyto_label = heatmap.texts[location_subset.index("Cytoskeleton")]
cyto_label.set_color("red")
cyto_label.set_fontweight("bold")

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)
plt.xlabel("Subcellular Location")
plt.ylabel("Antibody Count (or Percentage)")
plt.title("Heatmap of Antibody Distribution (Highlighted: Cytoskeleton)")
plt.show()


In [None]:
from scipy import stats

In [None]:
# Goodness-of-fit test: are the counts spread evenly over every (drug type, location) cell?
# Pairs that never occur are absent from `drug_location_counts`, so the long table is pivoted
# to the full grid and the gaps are filled with 0 before testing. chisquare's default
# expectation is the mean of the observed cells, i.e. total / number of grid cells.
observed_grid = (
    drug_location_counts
    .pivot(index="subcellular_location_label", columns="drugType", values="count")
    .fillna(0)
    .to_numpy()
    .ravel()
)
chi2, p = stats.chisquare(observed_grid)

In [None]:
# Remove the "Unknown" drug-type column if it is still present. errors="ignore" avoids the
# KeyError a plain drop raises when the column was already removed by an earlier cell.
contingency_table_labels = contingency_table_labels.drop(columns=["Unknown"], errors="ignore")

In [None]:
import pandas as pd
from sklearn import preprocessing

# Min-max scale the count table column by column, apply log2(value + 1), and restore labels.
x = contingency_table_labels.to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit(x).transform(x)
df_normalized = pd.DataFrame(
    data=np.log2(x_scaled + 1),
    columns=contingency_table_labels.columns,
    index=contingency_table_labels.index,
)

In [None]:
# Heatmap of the normalized location x drug-type table.
sns.heatmap(data=df_normalized, cmap="viridis")

In [None]:
# Draw a histogram for each column of the normalized table.
df_normalized.hist()

In [None]:
# Histogram of the raw count table.
plt.hist(x=contingency_table_labels)

In [None]:
import matplotlib.pyplot as plt

# Hard-coded sample for the stacked chart below: a "drugType" list of nine names plus
# one list of nine counts per location, in the same order as "drugType".
# Only three locations are typed in; the remaining ones are omitted.
data = {
    "drugType": [
        "Antibody", "Cell", "Enzyme", "Gene", "Oligonucleotide", "Oligosaccharide",
        "Protein", "Small molecule", "Unknown"
    ],
    "Cell junction": [32, 0, 0, 0, 5, 0, 14, 320, 4],
    "Cell membrane": [291, 0, 9, 1, 9, 0, 132, 2456, 51],
    "Cellular component": [2564, 12, 88, 37, 245, 36, 1049, 25098, 732],
    # ... (rest of the subcellular locations)
}

# Function to plot the stacked bar chart
def plot_stacked_bar_chart(data):
  """
  Plot a stacked bar chart of drug-type counts per subcellular location.

  One bar is drawn per location; each drug type contributes one segment, stacked in the
  order given by data["drugType"].

  Args:
      data: A dictionary with a "drugType" entry (list of drug-type names) and one entry
          per subcellular location holding that location's counts, in the same order as
          the "drugType" list.
  """
  # Extract drug types and locations
  drug_types = data["drugType"]
  locations = [loc for loc in data if loc != "drugType"]

  plt.figure(figsize=(10, 6))  # Adjust figure size as needed

  # Running top of every location's stack; each drug type is drawn on top of the previous ones.
  stack_tops = np.zeros(len(locations))
  for position, drug_type in enumerate(drug_types):
    # Positional lookup (not list.index) so repeated drug-type names keep their own counts.
    heights = np.array([data[loc][position] for loc in locations], dtype=float)
    plt.bar(locations, heights, bottom=stack_tops, label=drug_type)
    stack_tops = stack_tops + heights

  plt.xlabel("Subcellular Location")
  plt.ylabel("Count")
  plt.title("Distribution of Drug Types Across Subcellular Locations")
  plt.xticks(rotation=45, ha="right")  # Rotate x-axis labels for readability
  plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
  plt.tight_layout()
  plt.show()

# Plot the stacked chart from a shallow copy of the sample dictionary.
plot_stacked_bar_chart(data.copy())


In [None]:
# Column labels (drug types) and row labels (locations) of the count table as plain lists.
drug_types = list(contingency_table_labels.columns)
locations = list(contingency_table_labels.index)

# One log2(count + 1) Series per drug type, in column order.
drug_types_data = [np.log2(1 + contingency_table_labels[name]) for name in drug_types]

In [None]:
# Show the row-wise (per-location) percentage table.
df_percentages_loclabels

In [None]:
# Stacked bars: for each location (the frame's index), the percentage of each drug type.
# DataFrame.plot creates its own figure, so the size is passed through `figsize` rather than
# a separate plt.figure call, which would only leave an empty extra figure behind.
ax = df_percentages_loclabels.plot(kind="bar", stacked=True, figsize=(10, 6))
ax.set_xlabel("Subcellular Location")
ax.set_ylabel("Percentages")
ax.set_title("Distribution of Drug Types Across Subcellular Locations")
plt.xticks(rotation=45, ha="right")  # Rotate x-axis labels for readability
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
# bbox_inches="tight" keeps the legend, which sits outside the axes, inside the saved image.
plt.savefig("drug_distribution_locations.png", bbox_inches="tight")
plt.show()