In [None]:
import numpy as np
import pandas as pd
import os
from scipy.stats import entropy
import matplotlib.pyplot as plt

# Default size (in inches) for every figure that does not set its own figsize
plt.rcParams.update({"figure.figsize": (10, 10)})

In [None]:
# Load the training metadata table; later cells read its
# 'image', 'chain' and 'hotel_id' columns.
train_data_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/hotel-id-2021-fgvc8/train.csv"
)
# Preview the first five rows
train_data_df.head(n=5)

In [None]:
# Each row of the metadata table is counted as one image
image_row_count = len(train_data_df)
print("Number of images in the dataset:",image_row_count)

In [None]:
# Checking for duplicates
# duplicated() flags the second and later occurrences of an image name.
is_repeated_image = train_data_df.duplicated(subset=['image'])
duplicate_images = train_data_df.loc[is_repeated_image, 'image'].values

# For each flagged name, show every row that carries it (first occurrence included)
for image_name in duplicate_images:
    rows_with_same_image = train_data_df.loc[train_data_df['image'] == image_name]
    print(rows_with_same_image)

Two duplicate images were found. They were most probably uploaded by the same user, since the chain, hotel_id and timestamp are identical.

In [None]:
# Check if nan values present
# isna().sum() yields one missing-value count per column.
nan_counts_per_column = train_data_df.isna().sum()
print("Number of NaN values present:",nan_counts_per_column)

In [None]:
# Chain value 0 represents individual hotels, so it is not a real chain.
# Filter it out instead of subtracting one from the total: the count then
# stays correct even if the data (or a subset of it) has no chain-0 rows.
chain_ids = train_data_df['chain']
print("Number of Unique Hotel Chains:",chain_ids[chain_ids != 0].nunique())

In [None]:
# Hotel chains and how many distinct hotels each chain contains.
# One groupby pass replaces re-filtering the whole frame once per chain;
# sort=False keeps the chains in order of first appearance.
# Rows whose chain is missing are skipped by groupby.
hotel_count = (
    train_data_df.groupby("chain", sort=False)["hotel_id"].nunique().to_dict()
)

# Every chain id present in the data is plotted, chain 0 included.
# The dict views are materialised as lists so matplotlib receives plain sequences.
bar = plt.bar(
    x=list(hotel_count.keys()),
    height=list(hotel_count.values()),
    color="blueviolet",
)
plt.xlabel("Hotel Chain ID")
plt.ylabel("Count")
plt.title("Hotel Chains and their hotel counts");

There are lots of individual (non-chain) hotels in the data.

In [None]:
# Number of hotels and how many distinct images each hotel has.
# One groupby pass replaces re-filtering the whole frame once per hotel;
# sort=False keeps the hotels in order of first appearance.
image_count_by_hotel = train_data_df.groupby("hotel_id", sort=False)["image"].nunique()
hotels = image_count_by_hotel.index.to_numpy()
hotels_image_count = image_count_by_hotel.tolist()

# hotel_id is stored as text so the bar charts treat it as a category label
# rather than a numeric axis. Sorting returns a new frame (no inplace
# mutation), ordered from the most-photographed hotel to the least.
hotel_image_df = pd.DataFrame(
    {"hotel_id": [str(hotel) for hotel in hotels], "image_count": hotels_image_count}
).sort_values(by="image_count", ascending=False)

In [None]:
# First 50 rows of hotel_image_df (sorted by image_count, descending, when built)
top_50_hotel_image_df = hotel_image_df.head(50)

plt.figure(figsize=(15, 15))
plt.bar(
    x=top_50_hotel_image_df["hotel_id"],
    height=top_50_hotel_image_df["image_count"],
    color="blueviolet",
)
# Tilt the hotel-id tick labels so the 50 labels do not overlap
plt.xticks(rotation=45)
plt.xlabel("Hotel ID")
plt.ylabel("Image Count")
plt.title("Hotel and their image count (Top 50)")

In [None]:
# Last 50 rows of hotel_image_df (sorted by image_count, descending, when built)
bottom_50_hotel_image_df = hotel_image_df.tail(50)

plt.figure(figsize=(15, 15))
plt.bar(
    x=bottom_50_hotel_image_df["hotel_id"],
    height=bottom_50_hotel_image_df["image_count"],
    color="blueviolet",
)
# Tilt the hotel-id tick labels so the 50 labels do not overlap
plt.xticks(rotation=45)
plt.xlabel("Hotel ID")
plt.ylabel("Image Count")
plt.title("Hotel and their image count (Bottom 50)")

There is a huge imbalance between the top 50 and the bottom 50 hotels. Quantifying it will give a clearer picture.

In [None]:
def shannon_entropy(no_of_classes, sizes, dataset_size):
    """Return the Shannon entropy (in nats) of a class-size distribution.

    Parameters
    ----------
    no_of_classes : int
        Number of leading entries of ``sizes`` to include.
    sizes : sequence of numbers
        Per-class sample counts, indexed by position.
    dataset_size : number
        Total used to turn each count into a proportion
        (``sizes[i] / dataset_size``).

    Returns
    -------
    float
        ``-sum(p_i * ln(p_i))`` over the first ``no_of_classes`` classes,
        where ``p_i = sizes[i] / dataset_size``. Classes with a zero
        proportion contribute nothing to the sum.
    """
    proportions = (sizes[i] / dataset_size for i in range(no_of_classes))
    # By convention 0 * log(0) is 0. Evaluated numerically it is
    # 0 * -inf = nan, which would turn the whole sum into nan, so
    # zero proportions are skipped.
    return -sum(p * np.log(p) for p in proportions if p != 0)

def quant_imbalance(no_of_classes, sizes, dataset_size):
    """Return the Shannon entropy of the class sizes, normalised by its maximum.

    The entropy of ``no_of_classes`` classes is at most ``ln(no_of_classes)``,
    reached when every class has the same size. Dividing by that maximum
    makes the score comparable across datasets with different class counts.
    Dividing by the class count itself would instead shrink the score
    towards zero as the number of classes grows, however balanced they are.

    Parameters
    ----------
    no_of_classes : int
        Number of classes; must be at least 2.
    sizes : sequence of numbers
        Per-class sample counts.
    dataset_size : number
        Total number of samples. The score lies in ``[0, 1]`` when this
        equals the sum of ``sizes``.

    Returns
    -------
    float
        Entropy divided by ``ln(no_of_classes)``: close to 1 for evenly
        sized classes, close to 0 when a few classes dominate.

    Raises
    ------
    ValueError
        If ``no_of_classes`` is smaller than 2, because the maximum
        entropy is then zero and the ratio is undefined.
    """
    if no_of_classes < 2:
        raise ValueError("balance is undefined for fewer than two classes")
    sh_en = shannon_entropy(no_of_classes, sizes, dataset_size)
    return sh_en / np.log(no_of_classes)

In [None]:
# Imbalance quantification using Shannon Entropy
image_counts = hotel_image_df['image_count'].values.tolist()
# dataset_size must be the total number of images, not the number of hotels:
# only then is each size / dataset_size a proportion and do they sum to 1.
print(quant_imbalance(len(image_counts),image_counts,sum(image_counts)))

This number quantifies how unevenly the images are distributed across the hotels. Proper measures will have to be taken during training to avoid overfitting to a few classes.