In [None]:
import os
import cv2
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from scipy.stats import skew
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from scipy.stats import boxcox
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
from os import listdir
# List the entries of the ../input/ directory.
listdir("../input/")

In [None]:
# Root folder of the data set and its training-image sub-folder.
base_path = "../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset"
train_image_path = base_path + "/train/"

# Preview the first ten entries of the training folder.
some_files = listdir(train_image_path)[:10]
some_files


In [None]:
extensions = [".jpeg",".jpg",".JPEG"]
def path(root_dir):
    """Recursively collect the paths of all JPEG files below ``root_dir``.

    Args:
        root_dir: Directory to walk; sub-directories are included.

    Returns:
        list[str]: One joined path per file whose name ends with one of the
        suffixes in the module-level ``extensions`` list, in ``os.walk``
        order. Empty when nothing matches or ``root_dir`` does not exist.
    """
    # Match on the file-name suffix: a substring test (`ext in name`) would
    # also accept names such as "scan.jpeg.txt" or "notes.jpg_old".
    suffixes = tuple(extensions)
    return [
        os.path.join(root, name)
        for root, _dirs, filenames in os.walk(root_dir)
        for name in filenames
        if name.endswith(suffixes)
    ]
        
    
              

In [None]:
# Collect the image paths of the train and test folders (train first).
train_data, test_data = (
    path(image_dir)
    for image_dir in (
        "../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/train",
        "../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/test",
    )
)

In [None]:
def stats(files):
    """Compute shape and intensity statistics for every image in ``files``.

    Args:
        files: Sequence of image file paths to load with ``cv2.imread``.

    Returns:
        pandas.DataFrame: One row per entry of ``files`` (same order, index
        ``0..len(files)-1``) with the columns ``Rows``, ``Columns``,
        ``Depth``, ``img_mean``, ``img_skew``, ``img_std`` and
        ``channel_mean`` (mean of channel index 0).

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for a path, i.e. the
            file could not be read as an image.
    """
    columns = ["Rows", "Columns", "Depth", "img_mean",
               "img_skew", "img_std", "channel_mean"]

    # Rows are collected as records and turned into a frame once. The former
    # `df.iloc[i][col] = value` pattern is chained assignment: it writes into
    # a temporary row object and pandas does not guarantee that the value
    # reaches the frame. Building from records also yields numeric dtypes
    # instead of object columns.
    records = []
    for image_path in tqdm(files):
        img = cv2.imread(image_path)
        if img is None:
            # Fail with the offending path instead of an AttributeError on None.
            raise ValueError("Could not read image: {}".format(image_path))

        pixels = img.ravel()  # flatten once and reuse for all global statistics
        records.append({
            "Rows": img.shape[0],
            "Columns": img.shape[1],
            "Depth": img.shape[2],
            "img_mean": np.mean(pixels),
            "img_skew": skew(pixels),
            "img_std": np.std(pixels),
            "channel_mean": np.mean(img[:, :, 0]),
        })

    # Passing `columns` keeps the expected schema even when `files` is empty.
    return pd.DataFrame(records, columns=columns)


In [None]:
# Per-image statistics for every training image path.
train = stats(train_data)

In [None]:
# Keep the source path next to each row; stats() returns rows in the order of train_data.
train['image_paths']=train_data

In [None]:
# Per-image statistics for every test image path.
test = stats(test_data)


In [None]:
# Keep the source path next to each row; stats() returns rows in the order of test_data.
test['image_paths'] = test_data

In [None]:
# `train` already holds the statistics plus the `image_paths` column added
# above. Calling stats() again here would re-read every image and drop that
# column, which later cells still access, so only display the frame.
print(train)

In [None]:
# Column dtypes and non-null counts of the training statistics frame.
train.info()

In [None]:
# Column dtypes and non-null counts of the test statistics frame.
test.info()

In [None]:
# A cell only renders its last expression, so both previews are shown
# explicitly with display().
display(train.head(10))
display(test.head(10))

In [None]:

# Image paths of both sets as plain arrays.
train_image_names = train["image_paths"].values
test_image_names = test["image_paths"].values

In [None]:
# Text dump of the training statistics frame.
print(train)

In [None]:
# Text dump of the test statistics frame.
print(test)

In [None]:
# Pixel area (rows * columns) of every image, computed from each set's own
# columns; the test area must not be derived from the training frame.
train['img_area']=train['Rows']*train['Columns']
test['img_area']=test['Rows']*test['Columns']

In [None]:
# Image height against width, one panel per data set.
fig, ax = plt.subplots(1,2,figsize=(20,10))
ax[0].scatter(train["Rows"].values, train["Columns"].values, c="orangered")
ax[1].scatter(test["Rows"].values, test["Columns"].values, c="lightseagreen")

ax[0].set_title("Train images")
ax[1].set_title("Test images")

# Label the axes so each panel can be read on its own.
for axis in ax:
    axis.set_xlabel("Rows")
    axis.set_ylabel("Columns")

In [None]:
def preprocess_k_means(train, test, feature, constant, lam):
    """Min-max scale, Box-Cox transform and standardise ``feature``.

    Both scalers are fitted on the training values only and then applied
    unchanged to the test values, so the resulting columns share one scale
    and can be clustered together.

    Args:
        train: DataFrame containing the column ``feature``; a new column is
            added to it in place.
        test: DataFrame containing the column ``feature``; a new column is
            added to it in place.
        feature: Name of the column to preprocess.
        constant: Offset added to the min-max scaled values before the
            Box-Cox transform. ``scipy.stats.boxcox`` needs strictly positive
            input, and the scaled values start at 0, so this should be > 0.
        lam: Box-Cox lambda passed to ``scipy.stats.boxcox``.

    Returns:
        tuple: ``(train, test)``, the same frames, each with the additional
        column ``"preprocessed_" + feature``.
    """
    train_values = train[feature].values.reshape(-1, 1)
    test_values = test[feature].values.reshape(-1, 1)

    minmax_scaler = MinMaxScaler()
    scaled_train_feature = minmax_scaler.fit_transform(train_values)
    # Reuse the training range for the test data. Test values outside that
    # range are clipped to [0, 1] so that `scaled + constant` stays positive
    # for the Box-Cox transform.
    scaled_test_feature = np.clip(minmax_scaler.transform(test_values), 0, 1)

    boxcox_train_feature = boxcox(scaled_train_feature[:, 0] + constant, lam)
    boxcox_test_feature = boxcox(scaled_test_feature[:, 0] + constant, lam)

    scaler = StandardScaler()
    preprocessed_train_feature = scaler.fit_transform(boxcox_train_feature.reshape(-1, 1))
    # transform (not fit_transform): standardise test with the training mean/std.
    preprocessed_test_feature = scaler.transform(boxcox_test_feature.reshape(-1, 1))

    # Flatten the (n, 1) scaler output before storing it as a single column.
    train.loc[:, "preprocessed_" + feature] = preprocessed_train_feature.ravel()
    test.loc[:, "preprocessed_" + feature] = preprocessed_test_feature.ravel()
    return train, test

In [None]:
# Channel mean: offset 1, Box-Cox lambda 10.
train, test = preprocess_k_means(train, test, feature="channel_mean", constant=1, lam=10)
# Image skew: offset 0.05, Box-Cox lambda 2.
train, test = preprocess_k_means(train, test, feature="img_skew", constant=0.05, lam=2)

In [None]:
# Distributions of the preprocessed features, train against test.
fig, ax = plt.subplots(1,2,figsize=(20,5))
sns.distplot(train.preprocessed_channel_mean, ax=ax[0], color="crimson", label="train")
# Test uses the same colour on both axes so it can be told apart from train.
sns.distplot(test.preprocessed_channel_mean, ax=ax[0], color="lightseagreen", label="test")
sns.distplot(train.preprocessed_img_skew, ax=ax[1], color="crimson", label="train")
sns.distplot(test.preprocessed_img_skew, ax=ax[1], color="lightseagreen", label="test")

# The `label` arguments only become visible once a legend is drawn.
ax[0].legend()
ax[1].legend()


In [None]:
# Share (in percent) of each (Rows, Columns) combination, most frequent first.
train_shapes = (
    train.groupby(["Rows", "Columns"])
    .size()
    .sort_values(ascending=False)
    .div(train.shape[0])
    .mul(100)
)
test_shapes = (
    test.groupby(["Rows", "Columns"])
    .size()
    .sort_values(ascending=False)
    .div(test.shape[0])
    .mul(100)
)

In [None]:
# Number of training images that 0.2 percent of the set corresponds to.
len(train) * 0.2 / 100

In [None]:
# (Rows, Columns) pairs that make up more than 0.3 percent of each set.
common_train_shapes = set(train_shapes.loc[train_shapes.gt(0.3)].index.values)
common_test_shapes = set(test_shapes.loc[test_shapes.gt(0.3)].index.values)

In [None]:
# Shapes that are common in at least one of the two sets.
common_shape_groups = common_train_shapes | common_test_shapes
common_shape_groups

In [None]:
# Use one cluster per common image shape.
num_clusters = len(common_shape_groups)
num_clusters

In [None]:
# Stack train on top of test. DataFrame.append was removed in pandas 2.0;
# pd.concat does the same row-wise concatenation and, like append, keeps
# each frame's original index values.
combined_stats = pd.concat([train, test])
combined_stats.head(1)

In [None]:
# Cluster all images (train and test together) on four intensity statistics.
cluster_features = ["img_mean", "img_std", "preprocessed_img_skew",
                    "preprocessed_channel_mean"]
x = combined_stats[cluster_features].values

kmeans = KMeans(n_clusters=num_clusters, random_state=0)
cluster_labels = kmeans.fit_predict(x)

# Store the assigned cluster of every image next to its statistics.
combined_stats["cluster_label"] = cluster_labels

In [None]:
# Split the combined frame back by position: the first len(train) rows are
# the training rows, the remainder the test rows.
n_train = train.shape[0]
train = combined_stats.iloc[:n_train]
test = combined_stats.iloc[n_train:]

In [None]:

def _cluster_scatter(frame, **marker_options):
    """Build a marker trace of image std (x) against image mean (y), coloured by cluster label."""
    return go.Scatter(
        x=frame.img_std.values,
        y=frame.img_mean.values,
        mode='markers',
        text=frame["cluster_label"].values,
        marker=dict(
            color=frame.cluster_label.values,
            colorscale='Jet',
            opacity=0.4,
            size=2,
            **marker_options
        ),
    )


fig = make_subplots(rows=1, cols=2, subplot_titles=("Train  stats", "Test  stats"))

# Only the train trace carries the colour bar.
trace0 = _cluster_scatter(
    train,
    colorbar=dict(thickness=10, len=1.1, title="cluster label"),
)
trace1 = _cluster_scatter(test)

for subplot_col, trace in enumerate((trace0, trace1), start=1):
    fig.add_trace(trace, row=1, col=subplot_col)

# Same axis titles on both panels.
for subplot_col in (1, 2):
    fig.update_xaxes(title_text="Image std", row=1, col=subplot_col)
    fig.update_yaxes(title_text="Image mean", row=1, col=subplot_col)

fig.update_layout(height=425, width=850, showlegend=False)
fig.show()