# Implications for Other Tasks

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
import joblib
import os

# For Google Colab environment
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Define bowling styles (same as provided)
bowler_styles = {
    'Nuwan Kulasekara': 'Fast', 'Lasith Malinga': 'Fast', 'Angelo Mathews': 'Medium', 'Rangana Herath': 'Spin',
    'Tillakaratne Dilshan': 'Spin', 'Suranga Lakmal': 'Fast', 'Jeevan Mendis': 'Spin', 'Tim Southee': 'Fast',
    'Trent Boult': 'Fast', 'Adam Milne': 'Fast', 'Daniel Vettori': 'Spin', 'Grant Elliott': 'Medium',
    'Kane Williamson': 'Spin', 'Corey Anderson': 'Fast', 'James Anderson': 'Fast', 'Stuart Broad': 'Fast',
    'Chris Woakes': 'Fast', 'Steven Finn': 'Fast', 'Moeen Ali': 'Spin', 'Joe Root': 'Spin', 'Mitchell Starc': 'Fast',
    'Josh Hazlewood': 'Fast', 'Mitchell Johnson': 'Fast', 'Mitchell Marsh': 'Fast', 'Shane Watson': 'Fast',
    'Glenn Maxwell': 'Spin', 'Steven Smith': 'Spin', 'Tinashe Panyangara': 'Fast', 'Tendai Chatara': 'Fast',
    'Solomon Mire': 'Medium', 'Elton Chigumbura': 'Medium', 'Sean Williams': 'Spin', 'Tafadzwa Kamungozi': 'Spin',
    'Sikandar Raza': 'Spin', 'Hamilton Masakadza': 'Medium', 'Vernon Philander': 'Fast', 'Morne Morkel': 'Fast',
    'Dale Steyn': 'Fast', 'Farhaan Behardien': 'Medium', 'Jean-Paul Duminy': 'Spin', 'Imran Tahir': 'Spin',
    'Mohammad Irfan': 'Fast', 'Sohail Khan': 'Fast', 'Shahid Afridi': 'Spin', 'Wahab Riaz': 'Fast',
    'Yasir Shah': 'Spin', 'Haris Sohail': 'Spin', 'Umesh Yadav': 'Fast', 'Mohammed Shami': 'Fast',
    'Mohit Sharma': 'Fast', 'Suresh Raina': 'Spin', 'Ravichandran Ashwin': 'Spin', 'Ravindra Jadeja': 'Spin',
    'John Mooney': 'Medium', 'Max Sorensen': 'Fast', 'Andy McBrine': 'Spin', 'Kevin O\'Brien': 'Medium',
    'George Dockrell': 'Spin', 'Paul Stirling': 'Spin', 'Jason Holder': 'Fast', 'Kemar Roach': 'Fast',
    'Jerome Taylor': 'Fast', 'Andre Russell': 'Fast', 'Chris Gayle': 'Spin', 'Darren Sammy': 'Medium',
    'Marlon Samuels': 'Spin', 'Lendl Simmons': 'Medium', 'Iain Wardlaw': 'Fast', 'Rob Taylor': 'Medium',
    'Josh Davey': 'Fast', 'Majid Haq': 'Spin', 'Hamid Hassan': 'Fast', 'Shapoor Zadran': 'Fast',
    'Aftab Alam': 'Fast', 'Mirwais Ashraf': 'Medium', 'Mohammad Nabi': 'Spin', 'Javed Ahmadi': 'Spin',
    'Samiullah Shenwari': 'Spin', 'Mashrafe Mortaza': 'Fast', 'Rubel Hossain': 'Fast', 'Taskin Ahmed': 'Fast',
    'Shakib Al Hasan': 'Spin', 'Mahmudullah': 'Spin', 'Soumya Sarkar': 'Medium', 'Sabbir Rahman': 'Spin',
    'Mohammad Naveed': 'Fast', 'Amjad Javed': 'Medium', 'Nasir Aziz': 'Spin', 'Mohammad Tauqir': 'Spin',
    'Krishna Chandran': 'Medium', 'Rohan Mustafa': 'Spin', 'Sohaib Maqsood': 'Medium', 'Sulieman Benn': 'Spin',
    'Thisara Perera': 'Fast', 'Dawlat Zadran': 'Fast', 'Asghar Stanikzai': 'Medium', 'Wayne Parnell': 'Fast',
    'Alasdair Evans': 'Fast', 'Richie Berrington': 'Medium', 'Matt Machan': 'Spin', 'Kyle Coetzer': 'Medium',
    'Nikita Miller': 'Spin', 'Alex Cusack': 'Medium', 'Manjula Guruge': 'Fast', 'Gulbadin Naib': 'Medium',
    'Kyle Abbott': 'Fast', 'Faf du Plessis': 'Spin', 'Pat Cummins': 'Fast', 'Bhuvneshwar Kumar': 'Fast',
    'Tawanda Mupariwa': 'Medium', 'Rahat Ali': 'Fast', 'Rilee Rossouw': 'Medium', 'AB de Villiers': 'Medium',
    'Khurram Khan': 'Spin', 'Nawroz Mangal': 'Spin', 'Michael Clarke': 'Spin', 'James Faulkner': 'Fast',
    'Nasir Hossain': 'Spin', 'Dwayne Smith': 'Medium', 'Sachithra Senanayake': 'Spin', 'Seekkuge Prasanna': 'Spin',
    'Xavier Doherty': 'Spin', 'Chris Jordan': 'Fast', 'Arafat Sunny': 'Spin', 'Rohit Sharma': 'Spin',
    'Stuart Thompson': 'Medium', 'Michael Leask': 'Spin', 'Dushmantha Chameera': 'Fast', 'Kamran Shazad': 'Fast',
    'Fahad Alhashmi': 'Fast', 'Shaiman Anwar': 'Medium', 'Mitigation McClenaghan': 'Fast', 'Taijul Islam': 'Spin',
    'Ravi Bopara': 'Medium', 'James Tredwell': 'Spin', 'Ehsan Adil': 'Fast', 'Tharindu Kaushal': 'Spin',
    'Matt Henry': 'Fast', 'Virat Kohli': 'Medium'
}

# Mount Google Drive if in Colab
if IN_COLAB:
    drive.mount('/content/drive')
    dataset_path = '/content/combined.csv'  # Adjust if needed
    save_path_colab = '/content/drive/My Drive/lstm_embeddings/'
    os.makedirs(save_path_colab, exist_ok=True)
else:
    dataset_path = 'combined.csv'  # Adjust for local path
    save_path_local = './lstm_embeddings/'
    os.makedirs(save_path_local, exist_ok=True)

# Load dataset
df = pd.read_csv(dataset_path)

# Add bowler_type feature
df['bowler_type'] = df['bowler_name'].map(bowler_styles)

# Handle missing values
numerical_cols = ['landing_x', 'landing_y', 'ended_x', 'ended_y', 'ball_speed', 'ovr']
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())  # Explicit assignment for pandas 3.0 compatibility
df['bowler_type'] = df['bowler_type'].fillna('Unknown')

# Define deceptive deliveries
ewma_features = ['landing_x', 'landing_y', 'ended_x', 'ended_y', 'ball_speed']
alpha = 0.2
for feature in ewma_features:
    df[f'{feature}_ewma'] = df.groupby('bowler_type')[feature].transform(
        lambda x: x.ewm(alpha=alpha, adjust=False).mean()
    )
    df[f'{feature}_deviation'] = df[feature] - df[f'{feature}_ewma']
    std_dev = df.groupby('bowler_type')[f'{feature}_deviation'].transform('std')
    df[f'{feature}_is_deceptive'] = (df[f'{feature}_deviation'].abs() > 2 * std_dev).astype(int)
df['is_deceptive'] = df[[f'{feat}_is_deceptive' for feat in ewma_features]].max(axis=1)

# Encode bowler_type before building sequences
le = LabelEncoder()
df['bowler_type_encoded'] = le.fit_transform(df['bowler_type'])

# Feature engineering - use bowler_type_encoded instead of bowler_type
features = ['landing_x', 'landing_y', 'ended_x', 'ended_y', 'ball_speed', 'bowler_type_encoded', 'ovr']
sequence_length = 5
sequences = []
labels = []
sequence_ids = []

# Create match_id proxy
df['match_id'] = df.groupby(['batting_team', 'bowling_team', 'inning']).ngroup()
df = df.sort_values(['match_id', 'ovr'])

# Build sequences
SEQ = 5
Xs, ys = [], []
for mid in df['match_id'].unique():
    mdf = df[df['match_id'] == mid]
    for bt in mdf['bowler_type'].unique():
        bdf = mdf[mdf['bowler_type'] == bt]
        for i in range(SEQ, len(bdf)):
            Xs.append(bdf.iloc[i-SEQ:i][features].values)
            ys.append(bdf.iloc[i]['is_deceptive'])
X = np.array(Xs, dtype=np.float32)
y = np.array(ys, dtype=np.int32)

# Normalize numeric channels
sc = StandardScaler()
for idx in [0, 1, 2, 3, 4, 6]:  # Indices of numerical features
    flat = X[:, :, idx].reshape(-1, 1)
    X[:, :, idx] = sc.fit_transform(flat).reshape(X.shape[0], SEQ)
joblib.dump(sc, 'scaler_lstm_seq5.pkl')

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build Functional LSTM
inp = Input(shape=(SEQ, len(features)), name='inp')
x1 = LSTM(64, name='lstm')(inp)
x2 = Dropout(0.2, name='drop')(x1)
x3 = Dense(32, activation='relu', name='dense')(x2)
out = Dense(1, activation='sigmoid', name='out')(x3)
model = Model(inputs=inp, outputs=out)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train with checkpoint
ckpt = ModelCheckpoint('lstm_model_seq5.h5',
                       monitor='val_accuracy', save_best_only=True, verbose=1)
history = model.fit(
    X_train, y_train,
    epochs=20, batch_size=128,
    validation_split=0.2,
    callbacks=[ckpt],
    verbose=2
)

# Load best model
model.load_weights('lstm_model_seq5.h5')

# Extract embeddings
feat_ext = Model(inputs=model.input, outputs=model.get_layer('lstm').output)
Xtr_feat = feat_ext.predict(X_train, verbose=0)
Xte_feat = feat_ext.predict(X_test, verbose=0)

# Save embeddings
# Local save
joblib.dump((Xtr_feat, y_train), 'lstm_embed_train_seq5.pkl')
joblib.dump((Xte_feat, y_test), 'lstm_embed_test_seq5.pkl')

# Save to Google Drive if in Colab
if IN_COLAB:
    joblib.dump((Xtr_feat, y_train), os.path.join(save_path_colab, 'lstm_embed_train_seq5.pkl'))
    joblib.dump((Xte_feat, y_test), os.path.join(save_path_colab, 'lstm_embed_test_seq5.pkl'))
    print(f"Embeddings saved to Google Drive at: {save_path_colab}")
else:
    joblib.dump((Xtr_feat, y_train), os.path.join(save_path_local, 'lstm_embed_train_seq5.pkl'))
    joblib.dump((Xte_feat, y_test), os.path.join(save_path_local, 'lstm_embed_test_seq5.pkl'))
    print(f"Embeddings saved locally at: {save_path_local}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1/20

Epoch 1: val_accuracy improved from -inf to 0.84570, saving model to lstm_model_seq5.h5




123/123 - 5s - 44ms/step - accuracy: 0.8130 - loss: 0.4850 - val_accuracy: 0.8457 - val_loss: 0.4240
Epoch 2/20

Epoch 2: val_accuracy improved from 0.84570 to 0.84672, saving model to lstm_model_seq5.h5




123/123 - 1s - 6ms/step - accuracy: 0.8394 - loss: 0.4330 - val_accuracy: 0.8467 - val_loss: 0.4176
Epoch 3/20

Epoch 3: val_accuracy improved from 0.84672 to 0.84749, saving model to lstm_model_seq5.h5




123/123 - 1s - 10ms/step - accuracy: 0.8413 - loss: 0.4291 - val_accuracy: 0.8475 - val_loss: 0.4204
Epoch 4/20

Epoch 4: val_accuracy improved from 0.84749 to 0.84800, saving model to lstm_model_seq5.h5




123/123 - 1s - 10ms/step - accuracy: 0.8415 - loss: 0.4275 - val_accuracy: 0.8480 - val_loss: 0.4161
Epoch 5/20

Epoch 5: val_accuracy did not improve from 0.84800
123/123 - 1s - 10ms/step - accuracy: 0.8411 - loss: 0.4251 - val_accuracy: 0.8470 - val_loss: 0.4152
Epoch 6/20

Epoch 6: val_accuracy did not improve from 0.84800
123/123 - 2s - 12ms/step - accuracy: 0.8419 - loss: 0.4248 - val_accuracy: 0.8475 - val_loss: 0.4132
Epoch 7/20

Epoch 7: val_accuracy did not improve from 0.84800
123/123 - 1s - 10ms/step - accuracy: 0.8419 - loss: 0.4236 - val_accuracy: 0.8477 - val_loss: 0.4128
Epoch 8/20

Epoch 8: val_accuracy did not improve from 0.84800
123/123 - 2s - 18ms/step - accuracy: 0.8426 - loss: 0.4228 - val_accuracy: 0.8465 - val_loss: 0.4110
Epoch 9/20

Epoch 9: val_accuracy did not improve from 0.84800
123/123 - 1s - 8ms/step - accuracy: 0.8426 - loss: 0.4216 - val_accuracy: 0.8470 - val_loss: 0.4110
Epoch 10/20

Epoch 10: val_accuracy did not improve from 0.84800
123/123 - 1s - 

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
import tensorflow as tf
import joblib
import matplotlib.pyplot as plt

# Load the dataset and preprocess
df = pd.read_csv('combined.csv')

# Define bowling styles (same as provided)
bowler_styles = {
    'Nuwan Kulasekara': 'Fast', 'Lasith Malinga': 'Fast', 'Angelo Mathews': 'Medium', 'Rangana Herath': 'Spin',
    'Tillakaratne Dilshan': 'Spin', 'Suranga Lakmal': 'Fast', 'Jeevan Mendis': 'Spin', 'Tim Southee': 'Fast',
    'Trent Boult': 'Fast', 'Adam Milne': 'Fast', 'Daniel Vettori': 'Spin', 'Grant Elliott': 'Medium',
    'Kane Williamson': 'Spin', 'Corey Anderson': 'Fast', 'James Anderson': 'Fast', 'Stuart Broad': 'Fast',
    'Chris Woakes': 'Fast', 'Steven Finn': 'Fast', 'Moeen Ali': 'Spin', 'Joe Root': 'Spin', 'Mitchell Starc': 'Fast',
    'Josh Hazlewood': 'Fast', 'Mitchell Johnson': 'Fast', 'Mitchell Marsh': 'Fast', 'Shane Watson': 'Fast',
    'Glenn Maxwell': 'Spin', 'Steven Smith': 'Spin', 'Tinashe Panyangara': 'Fast', 'Tendai Chatara': 'Fast',
    'Solomon Mire': 'Medium', 'Elton Chigumbura': 'Medium', 'Sean Williams': 'Spin', 'Tafadzwa Kamungozi': 'Spin',
    'Sikandar Raza': 'Spin', 'Hamilton Masakadza': 'Medium', 'Vernon Philander': 'Fast', 'Morne Morkel': 'Fast',
    'Dale Steyn': 'Fast', 'Farhaan Behardien': 'Medium', 'Jean-Paul Duminy': 'Spin', 'Imran Tahir': 'Spin',
    'Mohammad Irfan': 'Fast', 'Sohail Khan': 'Fast', 'Shahid Afridi': 'Spin', 'Wahab Riaz': 'Fast',
    'Yasir Shah': 'Spin', 'Haris Sohail': 'Spin', 'Umesh Yadav': 'Fast', 'Mohammed Shami': 'Fast',
    'Mohit Sharma': 'Fast', 'Suresh Raina': 'Spin', 'Ravichandran Ashwin': 'Spin', 'Ravindra Jadeja': 'Spin',
    'John Mooney': 'Medium', 'Max Sorensen': 'Fast', 'Andy McBrine': 'Spin', 'Kevin O\'Brien': 'Medium',
    'George Dockrell': 'Spin', 'Paul Stirling': 'Spin', 'Jason Holder': 'Fast', 'Kemar Roach': 'Fast',
    'Jerome Taylor': 'Fast', 'Andre Russell': 'Fast', 'Chris Gayle': 'Spin', 'Darren Sammy': 'Medium',
    'Marlon Samuels': 'Spin', 'Lendl Simmons': 'Medium', 'Iain Wardlaw': 'Fast', 'Rob Taylor': 'Medium',
    'Josh Davey': 'Fast', 'Majid Haq': 'Spin', 'Hamid Hassan': 'Fast', 'Shapoor Zadran': 'Fast',
    'Aftab Alam': 'Fast', 'Mirwais Ashraf': 'Medium', 'Mohammad Nabi': 'Spin', 'Javed Ahmadi': 'Spin',
    'Samiullah Shenwari': 'Spin', 'Mashrafe Mortaza': 'Fast', 'Rubel Hossain': 'Fast', 'Taskin Ahmed': 'Fast',
    'Shakib Al Hasan': 'Spin', 'Mahmudullah': 'Spin', 'Soumya Sarkar': 'Medium', 'Sabbir Rahman': 'Spin',
    'Mohammad Naveed': 'Fast', 'Amjad Javed': 'Medium', 'Nasir Aziz': 'Spin', 'Mohammad Tauqir': 'Spin',
    'Krishna Chandran': 'Medium', 'Rohan Mustafa': 'Spin', 'Sohaib Maqsood': 'Medium', 'Sulieman Benn': 'Spin',
    'Thisara Perera': 'Fast', 'Dawlat Zadran': 'Fast', 'Asghar Stanikzai': 'Medium', 'Wayne Parnell': 'Fast',
    'Alasdair Evans': 'Fast', 'Richie Berrington': 'Medium', 'Matt Machan': 'Spin', 'Kyle Coetzer': 'Medium',
    'Nikita Miller': 'Spin', 'Alex Cusack': 'Medium', 'Manjula Guruge': 'Fast', 'Gulbadin Naib': 'Medium',
    'Kyle Abbott': 'Fast', 'Faf du Plessis': 'Spin', 'Pat Cummins': 'Fast', 'Bhuvneshwar Kumar': 'Fast',
    'Tawanda Mupariwa': 'Medium', 'Rahat Ali': 'Fast', 'Rilee Rossouw': 'Medium', 'AB de Villiers': 'Medium',
    'Khurram Khan': 'Spin', 'Nawroz Mangal': 'Spin', 'Michael Clarke': 'Spin', 'James Faulkner': 'Fast',
    'Nasir Hossain': 'Spin', 'Dwayne Smith': 'Medium', 'Sachithra Senanayake': 'Spin', 'Seekkuge Prasanna': 'Spin',
    'Xavier Doherty': 'Spin', 'Chris Jordan': 'Fast', 'Arafat Sunny': 'Spin', 'Rohit Sharma': 'Spin',
    'Stuart Thompson': 'Medium', 'Michael Leask': 'Spin', 'Dushmantha Chameera': 'Fast', 'Kamran Shazad': 'Fast',
    'Fahad Alhashmi': 'Fast', 'Shaiman Anwar': 'Medium', 'Mitigation McClenaghan': 'Fast', 'Taijul Islam': 'Spin',
    'Ravi Bopara': 'Medium', 'James Tredwell': 'Spin', 'Ehsan Adil': 'Fast', 'Tharindu Kaushal': 'Spin',
    'Matt Henry': 'Fast', 'Virat Kohli': 'Medium'
}

# Add bowler_type feature
df['bowler_type'] = df['bowler_name'].map(bowler_styles)

# Handle missing values
numerical_cols = ['landing_x', 'landing_y', 'ended_x', 'ended_y', 'ball_speed', 'ovr']
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())  # Explicit assignment for pandas 3.0 compatibility
df['bowler_type'] = df['bowler_type'].fillna('Unknown')

# Define deceptive deliveries
ewma_features = ['landing_x', 'landing_y', 'ended_x', 'ended_y', 'ball_speed']
alpha = 0.2
for feature in ewma_features:
    df[f'{feature}_ewma'] = df.groupby('bowler_type')[feature].transform(
        lambda x: x.ewm(alpha=alpha, adjust=False).mean()
    )
    df[f'{feature}_deviation'] = df[feature] - df[f'{feature}_ewma']
    std_dev = df.groupby('bowler_type')[f'{feature}_deviation'].transform('std')
    df[f'{feature}_is_deceptive'] = (df[f'{feature}_deviation'].abs() > 2 * std_dev).astype(int)
df['is_deceptive'] = df[[f'{feat}_is_deceptive' for feat in ewma_features]].max(axis=1)

# Create match_id proxy
df['match_id'] = df.groupby(['batting_team', 'bowling_team', 'inning']).ngroup()
df = df.sort_values(['match_id', 'ovr'])

# Encode bowler_type before defining features
le = LabelEncoder()
df['bowler_type_encoded'] = le.fit_transform(df['bowler_type'])

# Feature engineering - use bowler_type_encoded instead of bowler_type
features = ['landing_x', 'landing_y', 'ended_x', 'ended_y', 'ball_speed', 'bowler_type_encoded', 'ovr']
sequence_length = 5
sequences = []
labels = []
sequence_ids = []

# Build sequences for seq_len=5
SEQ = 5
Xs, ys = [], []
for mid in df['match_id'].unique():
    mdf = df[df['match_id'] == mid]
    for bt in mdf['bowler_type'].unique():
        bdf = mdf[mdf['bowler_type'] == bt]
        for i in range(SEQ, len(bdf)):
            Xs.append(bdf.iloc[i-SEQ:i][features].values)
            ys.append(bdf.iloc[i]['is_deceptive'])
X = np.array(Xs, dtype=np.float32)
y = np.array(ys, dtype=np.int32)

# Normalize numeric channels
sc = StandardScaler()
for idx in [0, 1, 2, 3, 4, 6]:  # Indices for numerical features (excluding bowler_type_encoded at index 5)
    flat = X[:, :, idx].reshape(-1, 1)
    X[:, :, idx] = sc.fit_transform(flat).reshape(X.shape[0], SEQ)
joblib.dump(sc, f'scaler_lstm_seq{SEQ}.pkl')

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load existing embeddings and ensure y_train, y_test are numerical
Xtr_feat, y_train_loaded = joblib.load('lstm_embed_train_seq5.pkl')
Xte_feat, y_test_loaded = joblib.load('lstm_embed_test_seq5.pkl')

# Explicitly cast y_train and y_test to float32
y_train = np.array(y_train_loaded, dtype=np.float32)
y_test = np.array(y_test_loaded, dtype=np.float32)

# Debug: Print data types to confirm
print("y_train dtype:", y_train.dtype)
print("y_test dtype:", y_test.dtype)
print("X_train dtype:", X_train.dtype)
print("X_test dtype:", X_test.dtype)

# 1. Zero-Shot Learning (ZSL)
# Simulate unseen bowler types by excluding some types during training
seen_bowler_types = ['Fast', 'Spin']  # Train on these
unseen_bowler_types = ['Medium']  # Test on these

# Split data based on bowler types
train_indices = df[df['bowler_type'].isin(seen_bowler_types)].index
test_indices = df[df['bowler_type'].isin(unseen_bowler_types)].index

# Rebuild sequences for ZSL
Xs_seen, ys_seen = [], []
Xs_unseen, ys_unseen = [], []
for mid in df['match_id'].unique():
    mdf = df[df['match_id'] == mid]
    for bt in mdf['bowler_type'].unique():
        bdf = mdf[mdf['bowler_type'] == bt]
        for i in range(SEQ, len(bdf)):
            seq_data = bdf.iloc[i-SEQ:i][features].values
            label = bdf.iloc[i]['is_deceptive']
            if bdf.iloc[i].name in train_indices:
                Xs_seen.append(seq_data)
                ys_seen.append(label)
            elif bdf.iloc[i].name in test_indices:
                Xs_unseen.append(seq_data)
                ys_unseen.append(label)

X_seen = np.array(Xs_seen, dtype=np.float32)
y_seen = np.array(ys_seen, dtype=np.int32)
X_unseen = np.array(Xs_unseen, dtype=np.float32)
y_unseen = np.array(ys_unseen, dtype=np.int32)

# Normalize for ZSL
for idx in [0, 1, 2, 3, 4, 6]:
    flat_seen = X_seen[:, :, idx].reshape(-1, 1)
    X_seen[:, :, idx] = sc.transform(flat_seen).reshape(X_seen.shape[0], SEQ)
    flat_unseen = X_unseen[:, :, idx].reshape(-1, 1)
    X_unseen[:, :, idx] = sc.transform(flat_unseen).reshape(X_unseen.shape[0], SEQ)

# Extract embeddings for ZSL
inp = Input(shape=(SEQ, len(features)), name='input_seq')
x1 = LSTM(64, name='lstm_layer')(inp)
x2 = Dropout(0.2, name='dropout')(x1)
feat_ext = Model(inputs=inp, outputs=x1)
X_seen_embed = feat_ext.predict(X_seen, verbose=0)
X_unseen_embed = feat_ext.predict(X_unseen, verbose=0)

# Simple ZSL: Use mean embeddings of seen classes as prototypes
prototypes = {}
for bt in seen_bowler_types:
    indices = df[df['bowler_type'] == bt].index
    embed_indices = [i for i, idx in enumerate(df.index) if idx in indices and i < len(X_seen_embed)]
    if embed_indices:
        deceptive_embed = X_seen_embed[embed_indices][y_seen[embed_indices] == 1].mean(axis=0)
        non_deceptive_embed = X_seen_embed[embed_indices][y_seen[embed_indices] == 0].mean(axis=0)
        prototypes[bt] = {'deceptive': deceptive_embed, 'non_deceptive': non_deceptive_embed}

# Predict for unseen bowler types using cosine similarity
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

y_pred_zsl = []
for embed in X_unseen_embed:
    max_sim_deceptive = -1
    max_sim_non_deceptive = -1
    for bt in unseen_bowler_types:
        if bt in prototypes:
            sim_deceptive = cosine_similarity(embed, prototypes[bt]['deceptive'])
            sim_non_deceptive = cosine_similarity(embed, prototypes[bt]['non_deceptive'])
            max_sim_deceptive = max(max_sim_deceptive, sim_deceptive)
            max_sim_non_deceptive = max(max_sim_non_deceptive, sim_non_deceptive)
    y_pred_zsl.append(1 if max_sim_deceptive > max_sim_non_deceptive else 0)

y_pred_zsl = np.array(y_pred_zsl)
print("ZSL Accuracy for Unseen Bowler Types (Medium):", accuracy_score(y_unseen, y_pred_zsl))

# 2. Linear Probing with Contrastive Loss
# Custom contrastive loss function
def contrastive_loss(y_true, y_pred, margin=1.0):
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    square_pred = K.square(y_pred)
    margin_square = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * square_pred + (1 - y_true) * margin_square)

# Build model with contrastive loss
inp1 = Input(shape=(SEQ, len(features)), name='input1')
inp2 = Input(shape=(SEQ, len(features)), name='input2')
label_input = Input(shape=(1,), name='pair_label')  # 1 if same class, 0 if different

# Shared LSTM
lstm = LSTM(64, name='shared_lstm')
x1 = lstm(inp1)
x2 = lstm(inp2)
x1 = Dropout(0.2, name='dropout1')(x1)
x2 = Dropout(0.2, name='dropout2')(x2)

# Compute Euclidean distance between embeddings
distance = Lambda(lambda tensors: K.sqrt(K.sum(K.square(tensors[0] - tensors[1]), axis=1, keepdims=True)),
                  name='distance')([x1, x2])

# Classification branch
x3 = Dense(32, activation='relu', name='dense_layer')(x1)
class_output = Dense(1, activation='sigmoid', name='classification')(x3)

# Model with two inputs and two outputs
model_contrastive = Model(inputs=[inp1, inp2, label_input], outputs=[class_output, distance])
model_contrastive.compile(optimizer=Adam(learning_rate=0.001),
                          loss=['binary_crossentropy', contrastive_loss],
                          loss_weights=[1.0, 0.5])

# Generate pairs for contrastive loss
def generate_pairs(X, y):
    indices = np.random.permutation(len(X))
    X1, X2, pair_labels, y_labels = [], [], [], []
    for i in indices:
        pos_indices = np.where(y == y[i])[0]
        pos_idx = np.random.choice(pos_indices)
        neg_indices = np.where(y != y[i])[0]
        neg_idx = np.random.choice(neg_indices)
        X1.extend([X[i], X[i]])
        X2.extend([X[pos_idx], X[neg_idx]])
        pair_labels.extend([1, 0])
        y_labels.extend([y[i], y[i]])
    X1 = np.array(X1, dtype=np.float32)
    X2 = np.array(X2, dtype=np.float32)
    pair_labels = np.array(pair_labels, dtype=np.float32)
    y_labels = np.array(y_labels, dtype=np.float32)
    print("X1 shape:", X1.shape, "dtype:", X1.dtype)
    print("X2 shape:", X2.shape, "dtype:", X2.dtype)
    print("pair_labels shape:", pair_labels.shape, "dtype:", pair_labels.dtype)
    print("y_labels shape:", y_labels.shape, "dtype:", y_labels.dtype)
    return X1, X2, pair_labels, y_labels

# Create TensorFlow datasets for training and validation with flattened structure
def create_dataset(X, y, batch_size):
    X1, X2, pair_labels, y_labels = generate_pairs(X, y)
    dataset = tf.data.Dataset.from_tensor_slices(
        (
            (X1, X2, pair_labels),  # Inputs as a tuple
            (y_labels, pair_labels)  # Outputs as a tuple
        )
    )
    dataset = dataset.map(
        lambda inputs, outputs: (
            (
                tf.cast(inputs[0], tf.float32),  # X1
                tf.cast(inputs[1], tf.float32),  # X2
                tf.cast(inputs[2], tf.float32)   # pair_labels
            ),
            (
                tf.cast(outputs[0], tf.float32),  # y_labels
                tf.cast(outputs[1], tf.float32)   # pair_labels
            )
        )
    )
    for batch in dataset.take(1):
        print("Dataset inputs dtypes:", [x.dtype for x in batch[0]])
        print("Dataset outputs dtypes:", [x.dtype for x in batch[1]])
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

# Create datasets
batch_size = 128
train_dataset = create_dataset(X_train, y_train, batch_size)
val_dataset = create_dataset(X_test, y_test, batch_size)

# Calculate steps
steps_per_epoch = len(X_train) // batch_size
val_steps = len(X_test) // batch_size

# Train with contrastive loss using the dataset
history = model_contrastive.fit(
    train_dataset,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_dataset,
    validation_steps=val_steps,
    epochs=20,
    verbose=2
)

# Extract improved embeddings using the trained LSTM
feat_ext_contrastive = Model(inputs=inp1, outputs=x1)
Xtr_feat_contrastive = feat_ext_contrastive.predict(X_train, verbose=0)
Xte_feat_contrastive = feat_ext_contrastive.predict(X_test, verbose=0)

# Linear probing on improved embeddings
probe_contrastive = LogisticRegression(class_weight='balanced', max_iter=1000)
probe_contrastive.fit(Xtr_feat_contrastive, y_train)
print("Improved Linear Probing Acc (64D):", probe_contrastive.score(Xte_feat_contrastive, y_test))

# 3. Transfer Learning: Predict Boundary Deliveries
# Simulate a new task: predict if a delivery results in a boundary (4 or 6)
if 'is_boundary' not in df.columns:
    df['is_boundary'] = ((df['ended_x'].abs() > 50) | (df['ended_y'].abs() > 50)).astype(int)

# Build sequences for the new task
Xs_boundary, ys_boundary = [], []
for mid in df['match_id'].unique():
    mdf = df[df['match_id'] == mid]
    for bt in mdf['bowler_type'].unique():
        bdf = mdf[mdf['bowler_type'] == bt]
        for i in range(SEQ, len(bdf)):
            Xs_boundary.append(bdf.iloc[i-SEQ:i][features].values)
            ys_boundary.append(bdf.iloc[i]['is_boundary'])
X_boundary = np.array(Xs_boundary, dtype=np.float32)
y_boundary = np.array(ys_boundary, dtype=np.int32)

# Normalize
for idx in [0, 1, 2, 3, 4, 6]:
    flat_boundary = X_boundary[:, :, idx].reshape(-1, 1)
    X_boundary[:, :, idx] = sc.transform(flat_boundary).reshape(X_boundary.shape[0], SEQ)

# Train/test split for the new task
X_train_boundary, X_test_boundary, y_train_boundary, y_test_boundary = train_test_split(
    X_boundary, y_boundary, test_size=0.2, random_state=42)

# Create a fresh model for transfer learning instead of reusing base_model
boundary_input = Input(shape=(SEQ, len(features)), name='boundary_input')
boundary_lstm = LSTM(64, name='boundary_lstm')(boundary_input)
boundary_dropout = Dropout(0.2, name='boundary_dropout')(boundary_lstm)
boundary_dense = Dense(32, activation='relu', name='boundary_dense')(boundary_dropout)
boundary_dropout2 = Dropout(0.2, name='boundary_dropout2')(boundary_dense)
boundary_out = Dense(1, activation='sigmoid', name='boundary_output')(boundary_dropout2)

# Build and compile the boundary model
boundary_model = Model(inputs=boundary_input, outputs=boundary_out)
boundary_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train on the new task
boundary_model.fit(X_train_boundary, y_train_boundary,
                   epochs=10, batch_size=128,
                   validation_split=0.2, verbose=2)

# Evaluate
y_pred_boundary = (boundary_model.predict(X_test_boundary) > 0.5).astype(int).flatten()
print("Transfer Learning Acc (Boundary Prediction):", accuracy_score(y_test_boundary, y_pred_boundary))

y_train dtype: float32
y_test dtype: float32
X_train dtype: float32
X_test dtype: float32
ZSL Accuracy for Unseen Bowler Types (Medium): 0.8192939531632296
X1 shape: (39206, 5, 7) dtype: float32
X2 shape: (39206, 5, 7) dtype: float32
pair_labels shape: (39206,) dtype: float32
y_labels shape: (39206,) dtype: float32
Dataset inputs dtypes: [tf.float32, tf.float32, tf.float32]
Dataset outputs dtypes: [tf.float32, tf.float32]
X1 shape: (9802, 5, 7) dtype: float32
X2 shape: (9802, 5, 7) dtype: float32
pair_labels shape: (9802,) dtype: float32
y_labels shape: (9802,) dtype: float32
Dataset inputs dtypes: [tf.float32, tf.float32, tf.float32]
Dataset outputs dtypes: [tf.float32, tf.float32]
Epoch 1/20
153/153 - 4s - 25ms/step - classification_loss: 0.4775 - distance_loss: 0.3746 - loss: 0.6648 - val_classification_loss: 0.4281 - val_distance_loss: 0.2799 - val_loss: 0.5681
Epoch 2/20
153/153 - 1s - 8ms/step - classification_loss: 0.4306 - distance_loss: 0.2552 - loss: 0.5582 - val_classificati



153/153 - 0s - 2ms/step - classification_loss: 0.5372 - distance_loss: 0.2471 - loss: 0.6608 - val_classification_loss: 0.4255 - val_distance_loss: 0.2831 - val_loss: 0.5670
Epoch 4/20
153/153 - 1s - 8ms/step - classification_loss: 0.4261 - distance_loss: 0.2514 - loss: 0.5518 - val_classification_loss: 0.4262 - val_distance_loss: 0.2882 - val_loss: 0.5703
Epoch 5/20
153/153 - 1s - 7ms/step - classification_loss: 0.4286 - distance_loss: 0.2523 - loss: 0.5548 - val_classification_loss: 0.4254 - val_distance_loss: 0.2859 - val_loss: 0.5683
Epoch 6/20
153/153 - 0s - 2ms/step - classification_loss: 0.5565 - distance_loss: 0.2520 - loss: 0.6825 - val_classification_loss: 0.4253 - val_distance_loss: 0.2860 - val_loss: 0.5684
Epoch 7/20
153/153 - 5s - 34ms/step - classification_loss: 0.4241 - distance_loss: 0.2504 - loss: 0.5493 - val_classification_loss: 0.4256 - val_distance_loss: 0.2900 - val_loss: 0.5706
Epoch 8/20
153/153 - 2s - 16ms/step - classification_loss: 0.4266 - distance_loss: 0.