In [None]:
from shutil import copyfile

# Copy the helper module from the input dataset into the working directory
# so that the import below can find it.
copyfile(src = "../input/img2vector/img_to_vec.py", dst = "./img_to_vec.py")

# Import the one name this notebook uses explicitly; a wildcard import would
# silently pull every public name of the module into the notebook namespace.
from img_to_vec import Img2Vec

In [None]:
import numpy as np
import pandas as pd
import os
import torch
from PIL import Image
from tqdm import tqdm, tqdm_notebook 

from xgboost import XGBRegressor

In [None]:
# --- Names shared across the notebook ---
PROJECT_NAME = "Petfinder"   # prefix of the generated "<name>_<n>folds.csv" files
ID = "Id"                    # column holding the image file name (without extension)
TARGET = "Pawpularity"       # column used for counting and stratification
IMAGE_TYPE = 'jpg'           # image file extension, without the dot
TREE_METHOD = 'gpu_hist'     # NOTE(review): presumably an XGBoost tree_method; unused in the visible cells

# --- Input tables ---
TRAIN_PATH = "../input/petfinder-pawpularity-score/train.csv"
TEST_PATH = "../input/petfinder-pawpularity-score/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/petfinder-pawpularity-score/sample_submission.csv"

# --- Input image folders ---
TRAIN_IMAGE_FOLDER = "../input/petfinder-pawpularity-score/train"
TEST_IMAGE_FOLDER = "../input/petfinder-pawpularity-score/test"

# --- Outputs ---
SUBMISSION_PATH = "submission.csv"

In [None]:
# Load the train and test tables in one pass.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [None]:
def fe_ima2vec(df, img_path, img_col, filename_extension):
    """Build a DataFrame of image embeddings, one row per row of ``df``.

    Args:
        df: DataFrame whose ``img_col`` column holds image file names
            without extension.
        img_path: Directory containing the image files.
        img_col: Name of the column in ``df`` with the image file names.
        filename_extension: Image file extension, without the leading dot.

    Returns:
        A DataFrame indexed like ``df`` with columns ``img_vec_0``,
        ``img_vec_1``, ... holding whatever ``Img2Vec.get_vec`` returns for
        each image (assumes a fixed-length 1-D vector - TODO confirm).
    """
    img2vec = Img2Vec(cuda=(torch.cuda.is_available()))

    fe_img = []
    # Iterate over the column values rather than df.loc[0..n-1], so the
    # function also works when df does not carry a default 0..n-1 index.
    for img_id in tqdm(df[img_col], total=len(df)):
        # The context manager closes the image once its vector is extracted.
        with Image.open(f'{img_path}/{img_id}.{filename_extension}') as img:
            fe_img.append(img2vec.get_vec(img))

    # Reuse df's index so a later column-wise concat lines rows up correctly.
    fe_img = pd.DataFrame(fe_img, index=df.index)
    fe_img.columns = ['img_vec_' + str(i) for i in range(fe_img.shape[1])]
    return fe_img

In [None]:
# Embed every training image, then preview the first rows.
fe_train = fe_ima2vec(
    df=train,
    img_path=TRAIN_IMAGE_FOLDER,
    img_col=ID,
    filename_extension=IMAGE_TYPE,
)
fe_train.head()

In [None]:
# Embed every test image, then preview the first rows.
fe_test = fe_ima2vec(
    df=test,
    img_path=TEST_IMAGE_FOLDER,
    img_col=ID,
    filename_extension=IMAGE_TYPE,
)
fe_test.head()

In [None]:
# Show the generated feature column names (img_vec_0, img_vec_1, ...).
fe_train.columns

In [None]:
# Compare shapes of the embedding frame and the train table.
(fe_train.shape, train.shape)

In [None]:
# Compare shapes of the embedding frame and the test table.
(fe_test.shape, test.shape)

# Combine the image embeddings with the tabular data into new DataFrames

In [None]:
# Append the embedding columns to the right of the original columns.
new_train = pd.concat([train, fe_train], axis="columns")
new_test = pd.concat([test, fe_test], axis="columns")
(new_train.shape, new_test.shape)

In [None]:
def getLabelCount(df,target):
    """Return the number of rows per distinct value of ``df[target]``.

    Args:
        df: DataFrame to inspect.
        target: Name of the column whose values are counted.

    Returns:
        A list of ``(value, count)`` tuples sorted by value. Missing values
        (NaN) are not reported.
    """
    # Count inside `df` itself. Counting rows of a global frame with a mask
    # built from `df` only works when that global exists and shares df's index.
    counts = df[target].value_counts()
    return sorted((value, int(count)) for value, count in counts.items())

# Distribution of the target values in the combined training frame.
getLabelCount(df=new_train, target=TARGET)

In [None]:
# Persist both combined frames without the index column.
for frame, csv_path in ((new_train, "new_train.csv"), (new_test, "new_test.csv")):
    frame.to_csv(csv_path, index=False)

Reference notebook: https://www.kaggle.com/rhythmcam/function-make-training-folds-csv

In [None]:
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
def makeNFoldCSV(df,target_col,project_name,nFold=5,folder_col_name="Fold",seed=2021,shuffle=True):
    """Assign a stratified fold number to every row and write the result to CSV.

    The output file is ``"<project_name>_<nFold>folds.csv"`` in the current
    working directory and contains all columns of ``df`` plus an integer
    ``folder_col_name`` column with values ``1..nFold``.

    Args:
        df: DataFrame to split. It is not modified.
        target_col: Column whose values are used as stratification labels.
        project_name: Prefix of the output file name.
        nFold: Number of folds.
        folder_col_name: Name of the fold column in the output.
        seed: Random seed, used only when ``shuffle`` is true.
        shuffle: Whether to shuffle rows before splitting.

    Returns:
        None. The result is only written to disk.
    """
    # scikit-learn raises a ValueError when random_state is set while
    # shuffle is False, so only forward the seed when it has an effect.
    skf = StratifiedKFold(n_splits=nFold, random_state=seed if shuffle else None, shuffle=shuffle)

    # split() yields positional indices; collect them in a plain integer array
    # so the assignment does not depend on df's index labels and the column
    # never passes through a float/NaN state.
    fold_ids = np.zeros(len(df), dtype='int64')
    for fold, (_, test_index) in enumerate(skf.split(df, df[target_col]), start=1):
        fold_ids[test_index] = fold

    # Work on a copy so the caller's frame does not gain a fold column.
    folded = df.copy()
    folded[folder_col_name] = fold_ids

    csv_name = project_name + "_" + str(nFold) + "folds.csv"
    folded.to_csv(csv_name,index = False)
    
def getFoldCount(df,col="Fold"):
    """Return the number of rows per distinct value of ``df[col]``.

    Args:
        df: DataFrame to inspect.
        col: Name of the column whose values are counted.

    Returns:
        A list of ``(value, count)`` tuples sorted by value. Missing values
        (NaN) are not reported.
    """
    # Count inside `df` itself. Counting rows of a global frame with a mask
    # built from `df` only works when that global exists and shares df's index.
    counts = df[col].value_counts()
    return sorted((value, int(count)) for value, count in counts.items())

# 5 fold

In [None]:
n_folds = 5
makeNFoldCSV(new_train, TARGET, PROJECT_NAME, nFold=n_folds)
# Build the path from the same pieces makeNFoldCSV uses for its output name,
# so the two cannot drift apart when PROJECT_NAME or n_folds changes.
fold_csv = pd.read_csv(f"./{PROJECT_NAME}_{n_folds}folds.csv")
print(getFoldCount(fold_csv))
ax = sns.countplot(data=fold_csv, y="Fold")

# 4 fold

In [None]:
n_folds = 4
makeNFoldCSV(new_train, TARGET, PROJECT_NAME, nFold=n_folds)
# Build the path from the same pieces makeNFoldCSV uses for its output name,
# so the two cannot drift apart when PROJECT_NAME or n_folds changes.
fold_csv = pd.read_csv(f"./{PROJECT_NAME}_{n_folds}folds.csv")
print(getFoldCount(fold_csv))
ax = sns.countplot(data=fold_csv, y="Fold")

# 10 fold

In [None]:
n_folds = 10
makeNFoldCSV(new_train, TARGET, PROJECT_NAME, nFold=n_folds)
# Build the path from the same pieces makeNFoldCSV uses for its output name,
# so the two cannot drift apart when PROJECT_NAME or n_folds changes.
fold_csv = pd.read_csv(f"./{PROJECT_NAME}_{n_folds}folds.csv")
print(getFoldCount(fold_csv))
ax = sns.countplot(data=fold_csv, y="Fold")

# 20 fold

In [None]:
n_folds = 20
makeNFoldCSV(new_train, TARGET, PROJECT_NAME, nFold=n_folds)
# Build the path from the same pieces makeNFoldCSV uses for its output name,
# so the two cannot drift apart when PROJECT_NAME or n_folds changes.
fold_csv = pd.read_csv(f"./{PROJECT_NAME}_{n_folds}folds.csv")
print(getFoldCount(fold_csv))
ax = sns.countplot(data=fold_csv, y="Fold")