See leak explanation in [this post](https://www.kaggle.com/c/recursion-cellular-image-classification/discussion/102905).

RPE-03 and HUVEC-07 share the same pattern of controls. Moreover, their treatments also follow the same pattern; only the plates are rotated.

Now, there is an experiment in the test set that has the same pattern of controls… HUVEC-18. See images below to demonstrate what I said.

Does HUVEC-18 also have the same pattern of treatments in some plate rotation?

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib as mpl
from matplotlib import pyplot as plt
import sys

# Load the treatment tables (train/test) and the matching control tables.
# The "well_type" column is dropped from both control tables right after loading.
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")
df_train_contr = pd.read_csv("../input/train_controls.csv").drop(columns="well_type")
df_test_contr = pd.read_csv("../input/test_controls.csv").drop(columns="well_type")

In [None]:
# Preview the first two rows of the training treatments table.
df_train.head(2)

In [None]:
# Preview the first two rows of the training controls table.
df_train_contr.head(2)

In [None]:
# Split each well label into plate coordinates: the leading letter becomes a
# 1-based row number ("a"/"A" -> 1, "b"/"B" -> 2, ...) and the remaining
# characters are parsed as the integer column number.
for frame in (df_train, df_train_contr, df_test_contr):
    wells = frame["well"]
    frame["row"] = wells.apply(lambda label: ord(label[0].lower()) - ord("a") + 1)
    frame["col"] = wells.apply(lambda label: int(label[1:]))
df_train.sample(n=5)

In [None]:
# Build the ordered list of treatment siRNAs by concatenating, plate after
# plate, the sorted unique siRNAs found on each plate of one reference
# experiment (group1 + group2 + ...).
exp = "HEPG2-03"  # reference experiment, selected because it has all sirnas
train_by_experiment = df_train.groupby("experiment")
df_exp = train_by_experiment.get_group(exp)

sirnas = []
for plate, df_exp_pl in df_exp.groupby("plate"):
    plate_sirnas = df_exp_pl["sirna"]
    ss = sorted(plate_sirnas.unique())
    sirnas.extend(ss)
    print("Plate {} has {} sirnas.".format(plate, plate_sirnas.nunique()))
    print("   First 10 in ordered group:", ss[:10])

In [None]:
# Save the siRNA -> group assignment as a CSV output.
# Groups 1..4 are each repeated 277 times, i.e. the first 277 entries of
# `sirnas` are labelled group 1, the next 277 group 2, and so on.
group_ids = np.repeat(np.arange(1, 5), 277)
sirna_groups = pd.DataFrame(data={"sirna": sirnas, "group": group_ids})
sirna_groups.to_csv("sirna_groups.csv", index=False)

In [None]:
# Assign a color to every treatment siRNA: each group of 277 consecutive
# entries in `sirnas` gets 277 evenly spaced samples of its own colormap.
sirnas_colormaps = {1: "Blues", 2: "Greens", 3: "Purples", 4: "Reds"}
colors = []
for plate in [1, 2, 3, 4]:
    # plt.get_cmap replaces mpl.cm.get_cmap, which newer Matplotlib releases
    # no longer provide; look the colormap up once per group, not per sample.
    cmap = plt.get_cmap(sirnas_colormaps[plate])
    colors += [cmap(i) for i in np.linspace(0., 1., 277)]

# zip pairs siRNAs and colors positionally, so the order of `sirnas` decides
# which colormap a siRNA falls into.
sirnas_colors_dict = dict(zip(sirnas, colors))
df_train["color"] = df_train["sirna"].map(sirnas_colors_dict)
df_train.head()

In [None]:
# Ordered list of the distinct control siRNAs present in the training controls.
contr_sirnas = list(df_train_contr["sirna"].unique())
contr_sirnas.sort()
print(f"{len(contr_sirnas)} control sirnas.")

In [None]:
# Assign a color to every control siRNA by sampling the "hsv" colormap at as
# many evenly spaced points as there are control siRNAs (previously a
# hard-coded 31, which would silently leave siRNAs uncolored if the count
# differed).
# NOTE(review): hsv is a cyclic colormap, so the first and last sampled colors
# are both close to red — confirm whether endpoint=False would be preferable.
hsv_cmap = plt.get_cmap("hsv")  # replaces mpl.cm.get_cmap, absent in newer Matplotlib
colors = [hsv_cmap(i) for i in np.linspace(0., 1., len(contr_sirnas))]
contr_sirnas_colors_dict = dict(zip(contr_sirnas, colors))

# The same siRNA -> color mapping is used for train and test controls so the
# plots are directly comparable.
df_train_contr["color"] = df_train_contr["sirna"].map(contr_sirnas_colors_dict)
df_test_contr["color"] = df_test_contr["sirna"].map(contr_sirnas_colors_dict)
df_train_contr.head()

Find frequency of group assignment patterns.

In [None]:
# For each experiment build a string with one digit per plate (in plate
# order): the digit is the siRNA group (position in `sirnas` // 277 + 1) of the
# first siRNA listed for that plate.
pattern_by_experiment = {}
for exp, df_exp in df_train.groupby("experiment"):
    plate_groups = []
    for plate, df_exp_pl in df_exp.groupby("plate"):
        sirna_sample = df_exp_pl["sirna"].values[0]
        group_sirna_sample = sirnas.index(sirna_sample) // 277 + 1
        plate_groups.append(str(group_sirna_sample))
    pattern_by_experiment[exp] = "".join(plate_groups)

# Rows follow the order in which experiments first appear in df_train.
experiment_order = df_train["experiment"].unique()
df_pattern = pd.DataFrame(data={
    "experiment": experiment_order,
    "pattern": [pattern_by_experiment.get(e, "") for e in experiment_order],
})
df_pattern

In [None]:
# Count how many experiments share each plate-group pattern.
df_pattern.groupby("pattern").size()

## Colorful visualization
The visualization is below; experiments are grouped by their pattern.

Each group of 277 treatment siRNAs is visualized using shades of a single color (blue, green, purple, red).
Each control siRNA is always drawn with the same color, taken from the hsv colormap.

### Can you find a pattern in the location of the controls??
Note: the hot red dot always close to the origin of the plot is the negative control (1138). Sometimes there's more than one negative per plate.

In [None]:
# One figure per training experiment, experiments grouped by plate-group
# pattern. Panels 1-4 show the treatment wells of each plate, panels 5-8 the
# control wells of the same plates; every dot is a well colored by its siRNA.
train_by_experiment = df_train.groupby("experiment")
for pattern, df_pattern_pattern in df_pattern.groupby("pattern"):
    print("====================================================================")
    print(f"Pattern {pattern}\n")
    for exp in df_pattern_pattern["experiment"]:
        df_exp = train_by_experiment.get_group(exp)
        contr_exp = df_train_contr[df_train_contr["experiment"] == exp]
        fig, axs = plt.subplots(1, 8, figsize=(16, 3))
        for plate, df_exp_pl in df_exp.groupby("plate"):
            treat_ax = axs[plate - 1]
            if plate == 1:
                # label only the left-most panel with the experiment name
                treat_ax.set_ylabel(exp)
            treat_ax.scatter(df_exp_pl["row"], df_exp_pl["col"], color=df_exp_pl["color"], s=30)
            treat_ax.set_title(f"PL. {plate} treat")

            df_exp_pl_contr = contr_exp[contr_exp["plate"] == plate]
            contr_ax = axs[plate - 1 + 4]
            contr_ax.scatter(df_exp_pl_contr["row"], df_exp_pl_contr["col"], color=df_exp_pl_contr["color"], s=30)
            contr_ax.set_title(f"PL. {plate} contr")
        plt.show()

Check if there's any pattern in the controls.

In [None]:
# Controls seem to appear in a random order within a fixed "scheme" of wells.
# Occasionally a negative control shows up in a well normally dedicated to a
# treatment (a failed treatment), so the reference set of control wells is
# taken from one plate that appears free of such anomalies: RPE-06, plate 1.
is_reference_plate = (df_train_contr["experiment"] == "RPE-06") & (df_train_contr["plate"] == 1)
control_wells = df_train_contr.loc[is_reference_plate, "well"].values
control_wells

In [None]:
# Encode the control layout of every (experiment, plate) in the training set
# as one "_"-joined string of siRNA ids, restricted to the reference control
# wells. The string follows the row order of df_train_contr, which is taken
# to be already sorted by row and column (not verified here).
in_control_wells = df_train_contr["well"].isin(control_wells)
contr_groups = list(df_train_contr[in_control_wells].groupby(["experiment", "plate"]))

experiment_plate_contr_list = [key for key, _ in contr_groups]
pattern_contr_list = ["_".join(group["sirna"].astype("str")) for _, group in contr_groups]

df_pattern_contr = pd.DataFrame(data={"experiment_plate": experiment_plate_contr_list,
                                      "pattern": pattern_contr_list})
df_pattern_contr.head()

In [None]:
# Five most frequent control patterns among the training (experiment, plate) pairs.
df_pattern_contr.groupby("pattern").size().sort_values(ascending=False).head(5)

The first two patterns are in fact almost the same (note that 1119 in the first pattern becomes a negative 1138 in the second pattern).
1138_1108_1109_1110_1111_1112_1113_1114_1115_1116_1117_1118_**1119**_1120_1121_1122_1123_1124_1125_1126_1127_1128_1129_1130_1131_1132_1133_1134_1135_1136_1137
1138_1108_1109_1110_1111_1112_1113_1114_1115_1116_1117_1118_**1138**_1120_1121_1122_1123_1124_1125_1126_1127_1128_1129_1130_1131_1132_1133_1134_1135_1136_1137

I inspected the remaining patterns by eye but couldn't find any hint of a scheme.

Below are the experiments and plates that share the same control pattern. All plates of HUVEC-07 and RPE-03 have the same "scheme" of control siRNAs.

In [None]:
# The two most frequent control patterns; p2 differs from p1 only in that
# siRNA 1119 is replaced by the negative control 1138.
p1 = "1138_1108_1109_1110_1111_1112_1113_1114_1115_1116_1117_1118_1119_1120_1121_1122_1123_1124_1125_1126_1127_1128_1129_1130_1131_1132_1133_1134_1135_1136_1137"
p2 = "1138_1108_1109_1110_1111_1112_1113_1114_1115_1116_1117_1118_1138_1120_1121_1122_1123_1124_1125_1126_1127_1128_1129_1130_1131_1132_1133_1134_1135_1136_1137"

# Show every training (experiment, plate) whose controls follow either pattern.
reference_patterns = [p1, p2]
has_reference_pattern = df_pattern_contr["pattern"].isin(reference_patterns)
df_pattern_contr[has_reference_pattern]

### What happens in the test set?

In [None]:
# visualize controls in test
for exp, df_exp in df_test_contr.groupby("experiment"):
    fig, axs = plt.subplots(1, 4, figsize=(8,3))
    for plate, df_exp_pl in df_exp.groupby("plate"):
        if plate == 1:
            axs[plate-1].set_ylabel(exp)
        axs[plate-1].scatter(df_exp_pl["row"], df_exp_pl["col"], color=df_exp_pl["color"], s=30)
        axs[plate-1].set_title(f"PL. {plate} contr")
    plt.show()

In [None]:
# Encode the control layout of every (experiment, plate) in the test set as
# one "_"-joined string of siRNA ids, restricted to the reference control
# wells. The string follows the row order of df_test_contr, which is taken to
# be already sorted by row and column (not verified here).
in_control_wells_test = df_test_contr["well"].isin(control_wells)
contr_test_groups = list(df_test_contr[in_control_wells_test].groupby(["experiment", "plate"]))

experiment_plate_contr_test_list = [key for key, _ in contr_test_groups]
pattern_contr_test_list = ["_".join(group["sirna"].astype("str")) for _, group in contr_test_groups]

df_pattern_contr_test = pd.DataFrame(data={"experiment_plate": experiment_plate_contr_test_list,
                                           "pattern": pattern_contr_test_list})
df_pattern_contr_test.head()

In [None]:
# Five most frequent control patterns among the test (experiment, plate) pairs.
df_pattern_contr_test.groupby("pattern").size().sort_values(ascending=False).head(5)

In [None]:
# Look up the test (experiment, plate) pairs whose controls follow the
# reference pattern p1.
p1 = "1138_1108_1109_1110_1111_1112_1113_1114_1115_1116_1117_1118_1119_1120_1121_1122_1123_1124_1125_1126_1127_1128_1129_1130_1131_1132_1133_1134_1135_1136_1137"
matches_p1 = df_pattern_contr_test["pattern"] == p1
df_pattern_contr_test[matches_p1]

HUVEC-18 also has the same "scheme" of control siRNAs.

### Attempt to find the same scheme of treatment siRNAs in train set

In [None]:
# Encode the treatment layout of every (experiment, plate) in the training set
# as one "_"-joined string of siRNA ids. The string follows the row order of
# df_train, which is taken to be already sorted by row and column (not
# verified here).
treat_groups = list(df_train.groupby(["experiment", "plate"]))

experiment_plate_treat_list = [key for key, _ in treat_groups]
pattern_treat_list = ["_".join(group["sirna"].astype("str")) for _, group in treat_groups]

df_pattern_treat = pd.DataFrame(data={"experiment_plate": experiment_plate_treat_list,
                                      "pattern": pattern_treat_list})
df_pattern_treat.head()

In [None]:
# Five most frequent treatment patterns among the training (experiment, plate) pairs.
df_pattern_treat.groupby("pattern").size().sort_values(ascending=False).head(5)

In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of two sequences, in [0, 1].

    Args:
        a: First sequence of hashable items (for example a string or a list).
        b: Second sequence of hashable items.

    Returns:
        ``SequenceMatcher.ratio()`` for the pair: 1.0 for identical sequences
        (including two empty ones), 0.0 when nothing matches.
    """
    # autojunk is disabled on purpose: with the default heuristic, sequences of
    # 200+ items built from a small alphabet have every item classed as
    # "popular" and ignored, which makes the ratio collapse towards zero even
    # for nearly identical inputs. Without it the comparison is exact, at the
    # price of being slower on long, repetitive inputs.
    return SequenceMatcher(None, a, b, autojunk=False).ratio()

from itertools import combinations

# Pairwise similarity between the treatment layouts of all training plates.
# Patterns are compared as sequences of siRNA ids rather than as raw character
# strings: at character level any two patterns share digits and "_" separators,
# so the score would reflect common digits instead of common siRNA placements.
token_lists = [pattern.split("_") for pattern in pattern_treat_list]
n_patterns = len(pattern_treat_list)

# m[i, j] holds the similarity for i < j; the diagonal and lower triangle stay 0.
m = np.zeros((n_patterns, n_patterns))
for i, j in combinations(range(n_patterns), 2):
    s = similar(token_lists[i], token_lists[j])
    m[i, j] = s
    if s > 0.1:
        p1, p2 = pattern_treat_list[i], pattern_treat_list[j]
        print(f"- Match found at {experiment_plate_treat_list[i]} and {experiment_plate_treat_list[j]}")
        print(f"     p1 = {p1}")
        print(f"     p2 = {p2}")
        print(f"     simil = {s}")