In [7]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# 1. Load Data
# Use the correct path to your CSV file.
# A single read is enough: retrying the identical call inside a bare `except`
# could never succeed where the first attempt failed, and it also swallowed
# KeyboardInterrupt/SystemExit. Any read error now propagates directly.
# low_memory=False parses the file in one pass instead of in chunks, which
# avoids mixed-dtype inference on columns.
df = pd.read_csv('/content/IPL.csv', low_memory=False)

# 2. Feature Engineering
# Total runs per (match, innings), flattened back into ordinary columns
total_score_df = (
    df.groupby(['match_id', 'innings'])['runs_total']
    .sum()
    .reset_index()
)

# The chase target is the first-innings total plus one run
first_innings_score = (
    total_score_df.loc[total_score_df['innings'] == 1]
    .rename(columns={'runs_total': 'target_score'})
    .assign(target_score=lambda totals: totals['target_score'] + 1)
)

# Keep only second-innings rows and attach the target to each of them.
# The merge is an inner join (the default), so second-innings rows whose match
# has no first-innings total are dropped.
match_df = pd.merge(
    df.loc[df['innings'] == 2].copy(),
    first_innings_score[['match_id', 'target_score']],
    on='match_id',
)

# Cumulative stats
# NOTE(review): the running totals below rely on each match's rows appearing
# in delivery order in the CSV — confirm.
match_df['current_score'] = match_df.groupby('match_id')['runs_total'].cumsum()
# NOTE(review): cumcount counts every row of the match, so if the data holds
# rows for wides/no-balls they are counted as balls bowled — confirm whether
# only legal deliveries should count.
match_df['balls_bowled'] = match_df.groupby('match_id').cumcount() + 1
# 120 = 20 overs x 6 balls (T20 format assumed)
match_df['balls_left'] = 120 - match_df['balls_bowled']
match_df['runs_left'] = match_df['target_score'] - match_df['current_score']

# Wickets left
# A row is a wicket when `wicket_kind` is non-null (vectorized, no per-row lambda)
match_df['is_wicket'] = match_df['wicket_kind'].notna().astype(int)
match_df['wickets_lost'] = match_df.groupby('match_id')['is_wicket'].cumsum()
match_df['wickets_left'] = 10 - match_df['wickets_lost']

# Run Rates
# Current run rate: runs per over so far (balls_bowled is always >= 1)
match_df['crr'] = (match_df['current_score'] * 6) / match_df['balls_bowled']
# Required run rate: runs per over still needed; 0 when no balls remain.
# np.where replaces the row-wise DataFrame.apply: same values, computed
# column-wise, and it also works on an empty frame.
match_df['rrr'] = np.where(
    match_df['balls_left'] > 0,
    (match_df['runs_left'] * 6) / match_df['balls_left'],
    0,
)

# Result
# 1 when the batting (chasing) side is recorded as the winner, else 0.
# Rows where `match_won_by` is missing compare unequal and are labelled 0.
match_df['result'] = (match_df['batting_team'] == match_df['match_won_by']).astype(int)

# Final selection
# Model inputs plus the label; rows with a missing value in any of these
# columns are discarded.
model_columns = [
    'batting_team', 'bowling_team', 'city',
    'runs_left', 'balls_left', 'wickets_left',
    'target_score', 'crr', 'rrr',
    'result',
]
final_df = match_df.loc[:, model_columns].dropna()
# Keep only rows where at least one ball remains
final_df = final_df.loc[final_df['balls_left'] > 0]

# Split into the feature matrix and the label vector
y = final_df['result']
X = final_df.drop(columns='result')

# Logistic regression needs both outcome classes to fit. On a small dataset
# that contains only one class, append a copy of the first sample carrying the
# opposite label.
if y.nunique(dropna=False) < 2:
    first_sample = X.head(1).copy()
    X = pd.concat([X, first_sample], ignore_index=True)
    opposite_label = pd.Series([1 - y.iloc[0]])
    y = pd.concat([y, opposite_label], ignore_index=True)

# 3. Build Pipeline
# One-hot encode the three categorical columns; every other column is passed
# through unchanged.
# NOTE(review): per the scikit-learn docs, with `drop` set and
# handle_unknown='ignore' an unseen category is encoded as all zeros, which is
# indistinguishable from the dropped first category — confirm this is intended.
categorical_columns = ['batting_team', 'bowling_team', 'city']
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
trf = ColumnTransformer(
    transformers=[('trf', encoder, categorical_columns)],
    remainder='passthrough',
)

# Encoding followed by a liblinear logistic-regression classifier
classifier = LogisticRegression(solver='liblinear')
pipe = Pipeline([
    ('step1', trf),
    ('step2', classifier),
])

pipe.fit(X, y)

# 4. Export Files
# Persist the fitted pipeline
joblib.dump(pipe, 'pipe.joblib')

# Sorted list of every team name that appears as batting or bowling side
team_names = set(final_df['batting_team'].unique())
team_names.update(final_df['bowling_team'].unique())
teams = sorted(team_names)
joblib.dump(teams, 'teams.joblib')

# Sorted list of the distinct cities in the training rows
cities = final_df['city'].unique().tolist()
cities.sort()
joblib.dump(cities, 'cities.joblib')

print("Files 'pipe.joblib', 'teams.joblib', and 'cities.joblib' created successfully.")

Files 'pipe.joblib', 'teams.joblib', and 'cities.joblib' created successfully.
