# 1.0 Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
import os


# 2.0 Downloading Dataset

In [None]:
import os
import kagglehub

# Create a folder called 'data' in the current directory
# (exist_ok=True makes this safe to re-run when the folder is already there).
data_dir = os.path.join(os.getcwd(), "data")
os.makedirs(data_dir, exist_ok=True)

# Download dataset to 'data' folder
# NOTE(review): confirm that `download_dir` is a keyword accepted by the
# installed kagglehub version, and that the files really end up directly
# under `data_dir` -- a later cell reads 'data/sports.csv' by a hard-coded
# relative path rather than through the `path` value returned here.
path = kagglehub.dataset_download("gpiosenka/sports-classification", download_dir=data_dir)

# Show the location reported by kagglehub for the downloaded files.
print("Path to dataset files:", path)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the dataset index CSV; downstream cells use its 'filepaths',
# 'labels' and 'data set' columns.
csv_path = 'data/sports.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,class id,filepaths,labels,data set
0,0,train/air hockey/001.jpg,air hockey,train
1,0,train/air hockey/002.jpg,air hockey,train
2,0,train/air hockey/003.jpg,air hockey,train
3,0,train/air hockey/004.jpg,air hockey,train
4,0,train/air hockey/005.jpg,air hockey,train
...,...,...,...,...
14488,99,valid/wingsuit flying/1.jpg,wingsuit flying,valid
14489,99,valid/wingsuit flying/2.jpg,wingsuit flying,valid
14490,99,valid/wingsuit flying/3.jpg,wingsuit flying,valid
14491,99,valid/wingsuit flying/4.jpg,wingsuit flying,valid


In [4]:
# Load & flatten 224x224x3 images into 1D (150528) arrays
from skimage.transform import resize

def load_data(df_subset, base_path="data", target_shape=(224, 224, 3), dtype=None):
    """Load the images listed in ``df_subset`` and flatten each one into a 1-D row.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Must provide a 'filepaths' column (paths relative to ``base_path``)
        and a 'labels' column.
    base_path : str
        Directory that every 'filepaths' entry is joined onto.
    target_shape : tuple of int
        Shape passed to ``resize`` for every image before flattening.
    dtype : numpy dtype, optional
        When given, each resized image is cast to this dtype before it is
        stored (e.g. ``np.float32`` to reduce the memory held while loading).
        ``None`` keeps whatever dtype ``resize`` returns.

    Returns
    -------
    X : numpy.ndarray
        One flattened image per row. When no image could be loaded, an empty
        2-D array of shape ``(0, prod(target_shape))`` is returned so callers
        can still rely on ``X.ndim == 2``.
    y : numpy.ndarray
        The 'labels' value of every image that was loaded, in the same order
        as the rows of ``X``.

    Notes
    -----
    Rows whose path does not end in ".jpg" (case-insensitive) are skipped
    silently. Images that fail to load or have an unexpected channel count
    are reported with ``print`` and skipped, so ``len(X)`` can be smaller
    than ``len(df_subset)``.
    """
    features, labels = [], []

    # Only two columns are needed, so iterate them directly instead of
    # building a full Series per row with iterrows().
    pairs = zip(df_subset['filepaths'], df_subset['labels'])
    for rel_path, label in tqdm(pairs, total=len(df_subset)):
        img_path = os.path.join(base_path, rel_path)

        if not img_path.lower().endswith(".jpg"):
            continue

        try:
            image = imread(img_path)

            # Force RGB
            if image.ndim == 2:
                # Grayscale -> RGB
                image = np.stack((image,) * 3, axis=-1)
            elif image.shape[2] == 4:
                # RGBA -> RGB
                image = image[:, :, :3]
            elif image.shape[2] != 3:
                print(f"Skipping image with unexpected shape: {img_path}")
                continue

            # Resize to target shape
            image = resize(image, target_shape, anti_aliasing=True)
            if dtype is not None:
                # Cast per image so the accumulated list never holds the
                # wider dtype for the whole dataset.
                image = image.astype(dtype, copy=False)

            features.append(image.flatten())  # flatten to 1D
            labels.append(label)
        except Exception as e:
            # Best effort: report the failing file and keep going.
            print(f"Error loading {img_path}: {e}")
            continue

    if not features:
        # np.array([]) would be 1-D; keep the (n_samples, n_features) layout.
        n_features = int(np.prod(target_shape))
        return np.empty((0, n_features), dtype=dtype), np.array(labels)

    return np.array(features), np.array(labels)


# Partition the index by its 'data set' column, then load the images that
# belong to each split.
is_train = df['data set'].eq('train')
is_valid = df['data set'].eq('valid')
df_train = df.loc[is_train]
df_valid = df.loc[is_valid]

X_train, y_train = load_data(df_train)
X_valid, y_valid = load_data(df_valid)


100%|██████████| 13493/13493 [03:47<00:00, 59.25it/s]
100%|██████████| 500/500 [00:08<00:00, 57.89it/s]


In [5]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_valid_enc = le.transform(y_valid)


In [6]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_valid_scaled = scaler.transform(X_valid)


In [7]:
from sklearn.decomposition import PCA
import numpy as np

# Optional: ensure memory efficiency
# copy=False skips the full-size copy when the arrays are already float32
# (e.g. when this cell is re-run).
X_train = X_train.astype(np.float32, copy=False)
X_valid = X_valid.astype(np.float32, copy=False)

# PCA with whitening (does mean-centering + variance scaling)
# The randomized solver is stochastic, so pin random_state to make the
# projection reproducible across runs.
pca = PCA(n_components=500, svd_solver='randomized', whiten=True, random_state=42)

# Fit on the training split only; the validation split reuses that projection.
X_train_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)

print("Original shape:", X_train.shape)
print("Reduced shape:", X_train_pca.shape)


Original shape: (13492, 150528)
Reduced shape: (13492, 500)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Train a logistic regression on the PCA-reduced training features and
# score it on the PCA-reduced validation split.
model = LogisticRegression(max_iter=1000, verbose=1).fit(X_train_pca, y_train_enc)
y_pred = model.predict(X_valid_pca)

valid_accuracy = accuracy_score(y_valid_enc, y_pred)
valid_report = classification_report(y_valid_enc, y_pred, target_names=le.classes_)
print("Accuracy:", valid_accuracy)
print(valid_report)


Accuracy: 0.112
                       precision    recall  f1-score   support

           air hockey       0.00      0.00      0.00         5
      ampute football       0.50      0.20      0.29         5
              archery       0.00      0.00      0.00         5
        arm wrestling       0.43      0.60      0.50         5
         axe throwing       0.50      0.20      0.29         5
         balance beam       0.00      0.00      0.00         5
        barell racing       0.38      0.60      0.46         5
             baseball       0.00      0.00      0.00         5
           basketball       0.00      0.00      0.00         5
       baton twirling       0.00      0.00      0.00         5
            bike polo       0.00      0.00      0.00         5
            billiards       0.50      0.20      0.29         5
                  bmx       0.00      0.00      0.00         5
              bobsled       0.00      0.00      0.00         5
              bowling       0.12      

In [10]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score

# Optional: for better performance
# copy=False avoids another full-size copy when the arrays are already float32.
X_train = X_train.astype(np.float32, copy=False)
X_valid = X_valid.astype(np.float32, copy=False)

# Linear classifier trained with SGD directly on the flattened pixel features.
model = SGDClassifier(loss='log_loss',  # logistic regression
                      max_iter=1000,
                      tol=1e-3,
                      n_jobs=-1,        # use all CPU cores
                      random_state=42)

model.fit(X_train, y_train_enc)

y_pred = model.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid_enc, y_pred))
# zero_division=0 reports 0.0 for metrics with an empty denominator, as
# before, without emitting a warning for each affected metric.
print(classification_report(y_valid_enc, y_pred, target_names=le.classes_, zero_division=0))


Accuracy: 0.104
                       precision    recall  f1-score   support

           air hockey       0.00      0.00      0.00         5
      ampute football       0.33      0.20      0.25         5
              archery       0.00      0.00      0.00         5
        arm wrestling       0.27      0.60      0.38         5
         axe throwing       0.50      0.20      0.29         5
         balance beam       0.00      0.00      0.00         5
        barell racing       0.33      0.20      0.25         5
             baseball       0.00      0.00      0.00         5
           basketball       0.00      0.00      0.00         5
       baton twirling       0.00      0.00      0.00         5
            bike polo       0.00      0.00      0.00         5
            billiards       0.67      0.40      0.50         5
                  bmx       0.00      0.00      0.00         5
              bobsled       0.00      0.00      0.00         5
              bowling       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, accuracy_score

# model = LogisticRegression(multi_class='multinomial', max_iter=1000, verbose=1)
# model.fit(X_train_scaled, y_train_enc)

# y_pred = model.predict(X_valid_scaled)
# print("Accuracy:", accuracy_score(y_valid_enc, y_pred))
# print(classification_report(y_valid_enc, y_pred, target_names=le.classes_))
