In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

from tqdm import tqdm
from PIL import Image

from sklearn.metrics import accuracy_score

In [None]:
# Dataset locations: the competition root plus its train / test image folders.
ROOT_DIR = '../input/happy-whale-and-dolphin'
TRAIN_DIR = f'{ROOT_DIR}/train_images'
TEST_DIR = f'{ROOT_DIR}/test_images'

In [None]:
def get_train_file_path(image_id):
    """Return the path of ``image_id`` inside the training image directory."""
    return '{}/{}'.format(TRAIN_DIR, image_id)
def get_test_file_path(image_id):
    """Return the path of ``image_id`` inside the test image directory."""
    return '{}/{}'.format(TEST_DIR, image_id)

In [None]:
# Read the training metadata table from the competition root directory.
train_csv_path = os.path.join(ROOT_DIR, 'train.csv')
train_df = pd.read_csv(train_csv_path)
#test_df=pd.read_csv(os.path.join(ROOT_DIR,'sample_submission.csv'))

train_df

In [None]:
# Turn every image file name into its full path under the training directory.
train_df['image_path'] = train_df['image'].map(get_train_file_path)
#test_df['image_path']=test_df['image'].apply(lambda x:get_test_file_path(x))

train_df

# Image size

In [None]:
def create_shape_feature(df):
    """Add image-dimension and file-size columns to ``df`` (in place).

    For every path in ``df['image_path']`` the image is opened to read its
    ``size`` attribute and the file's size on disk is looked up.

    Columns written:
        width_height -- the ``size`` tuple reported by PIL for the image
        file_size    -- result of ``os.path.getsize`` for the file
        width        -- first element of ``width_height``
        height       -- second element of ``width_height``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``image_path`` column of readable image files.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above added.
    """
    width_height_list = []
    file_size_list = []
    for path_ in tqdm(df['image_path']):
        # Close each image as soon as its size has been read; otherwise one
        # open file handle per image is left for the garbage collector.
        with Image.open(path_) as img:
            width_height_list.append(img.size)
        file_size_list.append(os.path.getsize(path_))

    df['width_height'] = width_height_list
    df['file_size'] = file_size_list
    # Unpack straight from the collected tuples instead of a per-row .apply.
    df['width'] = [wh[0] for wh in width_height_list]
    df['height'] = [wh[1] for wh in width_height_list]
    return df

In [None]:
# Attach width / height / file-size columns to the training table.
train_df = train_df.pipe(create_shape_feature)
#test_df = create_shape_feature(test_df)

train_df

In [None]:
# Pixel count per image, and bytes on disk per pixel.
# NOTE(review): the 'size_per_ pixel' column name contains a space; kept as-is.
train_df['area'] = train_df['width'].mul(train_df['height'])
train_df['size_per_ pixel'] = train_df['file_size'].div(train_df['area'])

#test_df['area'] = test_df['width'] * test_df['height']
#test_df['size_per_ pixel'] = test_df['file_size'] / test_df['area']

train_df

# Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each species name to an integer class id used as the training target.
le = LabelEncoder()
le.fit(train_df['species'])
train_df['species_label'] = le.transform(train_df['species'])

n_species_labels = train_df['species_label'].nunique()
print('species_label amount:', n_species_labels)
display(train_df)

# Cross-validation (CV) folds

In [None]:
#https://www.kaggle.com/debarshichanda/pytorch-arcface-gem-pooling-starter#Create-Folds

# Assign every row to one of 5 validation folds, stratified on the species label.
skf = StratifiedKFold(n_splits=5)

# split() yields *positional* row indices, so the fold ids are written into a
# plain array by position and attached afterwards. This avoids using positions
# as .loc labels (only correct for a default RangeIndex) and keeps the column
# an integer dtype instead of a NaN-initialised float one.
fold_ids = np.full(len(train_df), -1, dtype=int)
for fold, (_, val_idx) in enumerate(skf.split(X=train_df, y=train_df['species_label'])):
    fold_ids[val_idx] = fold
train_df['kfold'] = fold_ids
train_df

# LightGBM

In [None]:
# Hyper-parameters passed to every fold's LGBMClassifier.
params = {
    'learning_rate': 0.01,
    "objective": "multiclass",
    'boosting_type': "gbdt",
    'verbosity': -1,
    'n_jobs': -1,
    'seed': 42,
    'max_depth': 5,
    'n_estimators': 1000,
}

# Columns removed before fitting: identifiers, paths, the raw size tuple,
# fold bookkeeping and the target itself.
NON_FEATURE_COLS = ['image', 'species', 'individual_id', 'image_path',
                    'width_height', 'kfold', 'species_label']

# Per-fold results are kept so that no fold's outcome is silently discarded.
oof_pred = np.full(len(train_df), -1, dtype=int)  # out-of-fold predicted label per row
fold_scores = []                                   # validation accuracy of each fold
fold_models = []                                   # fitted classifier of each fold

for fold in range(5):
    valid_mask = (train_df['kfold'] == fold).to_numpy()
    train = train_df[~valid_mask]
    valid = train_df[valid_mask]

    X_train = train.drop(NON_FEATURE_COLS, axis=1)
    y_train = train['species_label']
    X_valid = valid.drop(NON_FEATURE_COLS, axis=1)
    y_valid = valid['species_label']

    model = lgb.LGBMClassifier(**params)
    # Early stopping and log frequency are passed as callbacks; the
    # `early_stopping_rounds=` / `verbose=` keyword arguments of fit() are
    # deprecated in LightGBM 3.3 and removed in 4.0.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        callbacks=[lgb.early_stopping(stopping_rounds=15),
                   lgb.log_evaluation(period=1000)],
    )
    pred = model.predict(X_valid)

    oof_pred[valid_mask] = pred
    fold_acc = accuracy_score(y_valid, pred)
    fold_scores.append(fold_acc)
    fold_models.append(model)
    print(f'fold {fold} accuracy: {fold_acc:.4f}')

print(f'mean CV accuracy: {np.mean(fold_scores):.4f}')
# `model`, `pred`, `X_train` and `y_valid` remain bound to the last fold's values.

In [None]:
#feature_importance
# `model` and `X_train` are the objects left bound by the training loop, i.e.
# those of the last fold only, so the plot is labelled accordingly rather than
# as an average over folds.
fi = model.feature_importances_

lgb_imp = pd.DataFrame({
    'Image feature': X_train.columns,
    'importance': fi,
})

plt.figure(figsize=(5,5))
sns.barplot(x="importance", y="Image feature",data=lgb_imp.sort_values(by="importance",ascending=False))
plt.title('LightGBM Features (last fold)')
plt.tight_layout()

In [None]:
#accuracy
# `y_valid` and `pred` are the values left by the final loop iteration, so this
# is the validation accuracy of the last fold only.
acc = accuracy_score(y_true=y_valid, y_pred=pred)
print('accuracy:', acc)