In [None]:
import pandas as pd 
import numpy as np 
import wandb 
from kaggle_secrets import UserSecretsClient
import matplotlib.pyplot as plt 
import seaborn as sns 
import cv2
from PIL import Image 
from wordcloud import WordCloud 
from glob import glob 

from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE 

# Setup tools 

In [None]:
# Read the secret named "wandb_api" through kaggle_secrets' UserSecretsClient
# and hand it to wandb.login, so the API key is never written in the notebook.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_api") 
wandb.login(key=wandb_api)

In [None]:
# Run-level settings; config["seed"] is reused below for t-SNE and KMeans.
config = {
    "competition": "Pawpularity",
    "infra": "kaggle",
    "seed": 42,
}

# Start the W&B run that the final cell logs its artifacts to.
# NOTE(review): group is the literal string "None", not the None object --
# confirm that a group actually named "None" is intended.
run = wandb.init(
    project="Pawpularity",
    config=config,
    group="None",
    job_type="EDA",
)

# EDA 

In [None]:
# Load the competition metadata tables.
# `test` must come from test.csv: the image paths built later point into the
# test/ image folder, so reading train.csv here (as before) would pair
# training Ids with test paths.
train = pd.read_csv("../input/petfinder-pawpularity-score/train.csv")
test = pd.read_csv("../input/petfinder-pawpularity-score/test.csv")
train.head()

In [None]:

'''
The part that is partially raised at the end is conspicuous
'''

# Distribution of the target: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(15, 6))
sns.histplot(train.Pawpularity, ax=ax_hist)
# Pass the vector as `x=` explicitly: a positional argument maps to `x` in
# seaborn 0.11 (with a FutureWarning) but to `data` in 0.12+, so the keyword
# keeps the horizontal box plot stable across versions.
sns.boxplot(x=train.Pawpularity, ax=ax_box)

In [None]:
# Correlation of every numeric column with the target.
# Non-numeric columns (e.g. the string Id) are dropped explicitly because
# DataFrame.corr() no longer skips them silently on pandas >= 2.0 and raises.
train.select_dtypes(include=["number", "bool"]).corr().loc[:, ["Pawpularity"]].style.background_gradient(cmap="coolwarm")

In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
train.isna().mean()

In [None]:

'''
merge data (dog and cat)? 
'''

# External per-image labels, joined onto the training table by Id.
# A left join keeps every training row even if a label is missing.
dog_cat = pd.read_csv("../input/pawpularity-cat-or-dog/cat_class.csv")
train = train.merge(dog_cat, on="Id", how="left")
train.head()

In [None]:
# Per-species (is_cat) mean of every numeric column.
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# Id column raises instead of being dropped silently.
train.groupby("is_cat").mean(numeric_only=True).style.background_gradient(cmap="coolwarm")

In [None]:
# Target histogram per species: dogs (is_cat == 0) left, cats (is_cat == 1) right.
plt.figure(figsize=(15, 6))
for position, (cat_flag, label) in enumerate([(0, "Dog"), (1, "Cat")], start=1):
    plt.subplot(1, 2, position)
    scores = train.loc[train.is_cat == cat_flag, "Pawpularity"]
    sns.histplot(scores)
    plt.title(label)
plt.show()

# Show the lowest- and highest-ranked images

In [None]:

# Attach the image path for each Id; each frame reads from its own split folder.
for frame, split in ((train, "train"), (test, "test")):
    frame["img_file"] = [
        f"../input/petfinder-pawpularity-score/{split}/{pet_id}.jpg"
        for pet_id in frame["Id"]
    ]


def viz_rank_img(type="dog", ascending=True, n=9):
    """Show the `n` training images at one end of the Pawpularity ranking.

    Parameters
    ----------
    type : str
        "dog" selects rows with is_cat == 0; any other value selects
        rows with is_cat == 1.
    ascending : bool
        True shows the lowest-scoring images, False the highest-scoring.
    n : int
        Maximum number of images to show. The grid is 3 columns wide and
        grows to as many rows as needed.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read one of the image files.
    """
    species_flag = 0 if type == "dog" else 1
    ranked = train[train.is_cat == species_flag].sort_values("Pawpularity", ascending=ascending)[:n]
    paths = ranked["img_file"].to_list()
    if not paths:
        # No matching rows (or n == 0): nothing to draw.
        return

    # Load everything before creating the figure so that a missing file does
    # not leave a half-drawn figure behind.
    images = []
    for path in paths:
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"cv2 could not read image: {path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        images.append(cv2.resize(img, (224, 224)))

    # Size the grid from the number of images actually available rather than
    # assuming exactly nine; n=9 still yields the original 3x3, 10x10 figure.
    n_cols = 3
    n_rows = (len(images) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(10, 10 * n_rows / n_cols),
                             squeeze=False)
    panels = axes.ravel()
    for panel, img in zip(panels, images):
        panel.imshow(img)
        panel.set_xticks([])
        panel.set_yticks([])
    for panel in panels[len(images):]:
        # Hide unused cells in the last row.
        panel.axis("off")

    fig.subplots_adjust(wspace=0, hspace=0)
    # Title the whole figure; plt.title would only label the last subplot.
    fig.suptitle(type)
    plt.show()
    
    
'''
I looked at some of the images but couldn't figure out 
how they affected the objective variable.
'''

In [None]:
viz_rank_img() # 9 lowest-Pawpularity dogs (defaults: type="dog", ascending=True, n=9)

In [None]:
viz_rank_img(ascending=False) # 9 highest-Pawpularity dogs

In [None]:
viz_rank_img(type="cat") # 9 lowest-Pawpularity cats

In [None]:
viz_rank_img(type="cat", ascending=False) # 9 highest-Pawpularity cats

# t-SNE and Clustering

In [None]:

'''
Some parts are cohesive and some are not.
'''

# Binary photo-metadata columns used as the t-SNE input.
use_col = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
       'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']


def _make_tsne(n_samples):
    """Return a seeded 2-D TSNE whose perplexity is valid for `n_samples` rows.

    The scikit-learn default perplexity is 30, and recent releases require the
    perplexity to be smaller than the number of samples. Capping it keeps the
    default for large frames while still working on very small ones.
    """
    return TSNE(n_components=2,
                perplexity=min(30.0, n_samples - 1),
                random_state=config["seed"])


# Each frame gets its own fit, so the two embeddings do not share a
# coordinate system; compare their shapes, not their positions.
tsne = _make_tsne(len(train))
t_train = tsne.fit_transform(train[use_col])
tsne = _make_tsne(len(test))
t_test = tsne.fit_transform(test[use_col])

plt.figure(figsize=(15, 6))
plt.subplot(121)
plt.title("Train")
plt.scatter(x=t_train[:, 0], y=t_train[:, 1])
plt.subplot(122)
plt.title("Test")
plt.scatter(x=t_test[:, 0], y=t_test[:, 1])

In [None]:

# Wrap the raw t-SNE coordinates in DataFrames indexed by Id (columns c0, c1).
embedding_cols = [f"c{axis}" for axis in range(2)]
t_train = pd.DataFrame(t_train, index=train.Id, columns=embedding_cols)
t_test = pd.DataFrame(t_test, index=test.Id, columns=embedding_cols)

# Two-cluster KMeans fitted on the train embedding only, then applied to both.
km = KMeans(n_clusters=2, random_state=config["seed"])
km.fit(t_train)
y_km = km.predict(t_train)
y_km_ = km.predict(t_test)

# Labels are added after prediction so the model only ever sees c0/c1.
t_train["cluster"] = y_km
t_test["cluster"] = y_km_

# Embeddings coloured by cluster: train on the left, test on the right.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(15, 6))
ax_train.set_title("Train")
sns.scatterplot(data=t_train, x="c0", y="c1", hue="cluster", ax=ax_train)
ax_test.set_title("Test")
sns.scatterplot(data=t_test, x="c0", y="c1", hue="cluster", ax=ax_test)

In [None]:

# Bring only the cluster label back onto the metadata tables; the embedding
# coordinates themselves are dropped before the join on Id.
train_clusters = t_train.reset_index().drop(columns=["c0", "c1"])
test_clusters = t_test.reset_index().drop(columns=["c0", "c1"])
train = train.merge(train_clusters, on="Id", how="left")
test = test.merge(test_clusters, on="Id", how="left")


def show_cloud(cnt2animal, c):
    """Draw a word cloud for one cluster.

    cnt2animal: mapping of word -> frequency passed to the word cloud.
    c: cluster identifier, shown in the plot title.
    """
    cloud = WordCloud(
        width=1500,
        height=750,
        background_color="white",
        contour_color="blue",
    )
    cloud.generate_from_frequencies(cnt2animal)

    plt.imshow(cloud)
    plt.title(f"Cluster={c}")
    plt.axis("off")
    plt.show()
    
def find_cluster_feature(c_name = 1):
    """Summarize one cluster of the training data.

    Shows a dog/cat word cloud for the rows whose `cluster` equals `c_name`,
    then returns a styled table holding the dog/cat counts followed by the
    mean of every `use_col` feature and of Pawpularity within that cluster.
    """
    members = train[train.cluster == c_name]

    # Count species in order of first appearance (is_cat == 0 -> "dog",
    # anything else -> "cat").
    summary = {}
    for flag in members.is_cat.to_list():
        label = "dog" if flag == 0 else "cat"
        summary[label] = summary.get(label, 0) + 1
    show_cloud(summary, c_name)

    # Append the per-feature means (and the target mean) to the same mapping
    # so counts and means end up in a single table.
    for col in [*use_col, "Pawpularity"]:
        summary[col] = members[col].mean()

    table = pd.DataFrame({"feature": list(summary.keys()),
                          "mean": list(summary.values())})
    return table.style.background_gradient(cmap="coolwarm")


In [None]:
# Summarize cluster 1 (the default value of c_name).
find_cluster_feature()

In [None]:

'''
The level of each feature is lower than that of cluster 1.
There is no count difference between dogs and cats and objective variables.
'''

# Same summary for cluster 0, for comparison with cluster 1.
find_cluster_feature(0)

In [None]:
# Persist the enriched tables (with is_cat, img_file and cluster columns).
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

# Log all CSVs in the working directory as ONE artifact version.
# Creating a fresh 'eda' artifact per file (as before) made each file a
# separate version, so 'eda:latest' ended up holding only the last CSV.
artifact = wandb.Artifact(name='eda', type='dataset')
for f in sorted(glob("*.csv")):
    artifact.add_file(f)
run.log_artifact(artifact)