In [2]:
import numpy as np 
import pandas as pd
import os  
import warnings
warnings.filterwarnings('ignore') 
import ast 
from datetime import timedelta

In [6]:
# Head-to-head records between pairs of players
# (consumed by get_player_vs_player_stats_ordered).
player_vs_player_stats = pd.read_csv('dt/player_vs_player_stats.csv')

# Per-match player records keyed by date and venue
# (consumed by get_player_venue_stats).
player_stats_with_date_venue = pd.read_csv('dt/odi_player_stats_with_date_venue.csv')

# Per-match player records used for the recent-form features
# (consumed by get_player_matchwise_stats).
all_matches_player = pd.read_csv('dt/matches_all_players.csv')

# Fantasy points per player and match date; these are the training labels
# (consumed by get_fantasy_points). Note: read from the working directory, not 'dt/'.
fantasy_points = pd.read_csv('fantasy_points_data.csv')

# Weather per (start_date, venue) (consumed by get_weather_data).
venues_with_dates_with_locations_with_weather = pd.read_csv(
    'dt/odis_venues_with_dates_with_locations_with_weather.csv'
)

In [65]:
def get_player_venue_stats(df, player_name, venue, date, delta): 
    """Aggregate one player's record at one venue over a look-back window.

    Rows of ``df`` are kept when ``player_name`` and ``Venue`` match and
    ``Date`` lies in ``[date - delta days, date)``; the end date itself is
    excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``player_name``, ``Venue``, ``Date``, the eight
        counting columns listed in ``summed_columns`` below and
        ``fantasy_points``. Side effect: ``df['Date']`` is converted to
        datetime in place the first time it is seen as a non-datetime column.
    player_name, venue
        Values compared for equality against ``player_name`` / ``Venue``.
    date
        End of the window (exclusive); anything ``pd.to_datetime`` accepts.
    delta : int
        Window length in days.

    Returns
    -------
    pandas.DataFrame
        Exactly one row with the 13 columns of ``output_columns`` (always in
        that order). All values are 0 when no row matches.
    """
    # Imports stay function-local; presumably so the function remains
    # self-contained when it is serialised with dill later in the notebook.
    import pandas as pd
    from datetime import timedelta

    summed_columns = [
        'runs_scored', 'balls_faced', 'wickets_taken', 'runs_given',
        'balls_thrown', 'boundaries_scored', 'boundaries_given',
        'number_of_dismissals',
    ]
    output_columns = summed_columns + [
        'strike_rate', 'economy', 'batting_average',
        'fantasy_points', 'number_of_matches_played',
    ]

    # Only touch the caller's frame when the column still needs parsing, so
    # repeated calls on the same frame do not reassign it every time.
    if not pd.api.types.is_datetime64_any_dtype(df['Date']):
        df['Date'] = pd.to_datetime(df['Date'])
    window_end = pd.to_datetime(date)
    window_start = window_end - timedelta(days=delta)

    window = df[
        (df['player_name'] == player_name)
        & (df['Venue'] == venue)
        & (df['Date'] >= window_start)
        & (df['Date'] < window_end)
    ]
    if window.empty:
        return pd.DataFrame([dict.fromkeys(output_columns, 0)])

    def _ratio(numerator, denominator):
        # A zero denominator is replaced by 1 so the ratio degrades to the numerator.
        return numerator / (denominator if denominator != 0 else 1)

    # Summed column by column so each total keeps its own dtype.
    stats = {column: window[column].sum() for column in summed_columns}
    # Runs per ball (not multiplied by 100).
    stats['strike_rate'] = _ratio(stats['runs_scored'], stats['balls_faced'])
    # Runs conceded per six balls.
    stats['economy'] = _ratio(stats['runs_given'] * 6, stats['balls_thrown'])
    stats['batting_average'] = _ratio(stats['runs_scored'], stats['number_of_dismissals'])
    stats['fantasy_points'] = window['fantasy_points'].sum()
    # One input row is counted as one match.
    stats['number_of_matches_played'] = window.shape[0]

    # Identifier fields (date, venue, player id/name) are not part of the
    # output, so they are no longer built and dropped; the frame therefore
    # does not need a 'player_Id' column.
    return pd.DataFrame([stats], columns=output_columns)
def get_player_vs_player_stats_ordered(df, player1_name, player2_names, date, delta): 
    """Head-to-head totals of ``player1_name`` against each listed opponent.

    Rows of ``df`` with ``match_date`` in ``[date - delta days, date)`` are
    used. Records stored with the player on the ``player2`` side are mirrored
    (every ``*_b1_b2`` / ``*_p1_p2`` column swapped with its ``*_b2_b1`` /
    ``*_p2_p1`` counterpart) so that all figures are expressed from
    ``player1_name``'s point of view before they are summed per opponent.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``match_date``, ``player1_id``, ``player1_name``,
        ``player2_id``, ``player2_name`` and the ``runs``/``balls``/
        ``boundaries``/``dismissals`` ``_b1_b2`` and ``_b2_b1`` columns plus
        ``fantasy_point_p1_p2`` / ``fantasy_point_p2_p1``. Side effect:
        ``df['match_date']`` is converted to datetime in place the first time
        it is seen as a non-datetime column.
    player1_name
        Player whose perspective the statistics are reported from.
    player2_names : sequence
        Opponents, in the order the output rows should appear.
    date
        End of the window (exclusive); anything ``pd.to_datetime`` accepts.
    delta : int
        Window length in days.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``player2_names`` (same order), always with the
        columns of ``output_columns`` in that order. Opponents without any
        record in the window get an all-zero row.
    """
    # Imports stay function-local; presumably so the function remains
    # self-contained when it is serialised with dill later in the notebook.
    import pandas as pd
    from datetime import timedelta

    # Fixed schema. Previously the column order followed whichever dict came
    # first in the result list (aggregated row vs. zero row list their keys in
    # different orders), so the flattened feature layout changed depending on
    # whether the first opponent had any history.
    output_columns = [
        'runs_b1_b2', 'balls_b1_b2', 'boundaries_b1_b2', 'dismissals_b1_b2',
        'strike_rate_b1_b2', 'economy_b1_b2', 'fantasy_point_p1_p2',
        'number_of_matches_played',
    ]

    # Only touch the caller's frame when the column still needs parsing.
    if not pd.api.types.is_datetime64_any_dtype(df['match_date']):
        df['match_date'] = pd.to_datetime(df['match_date'])
    end_date = pd.to_datetime(date)
    start_date = end_date - timedelta(days=delta)
    in_window = df[(df['match_date'] >= start_date) & (df['match_date'] < end_date)]

    direct_matches = in_window[
        (in_window['player1_name'] == player1_name)
        & (in_window['player2_name'].isin(player2_names))
    ]
    reverse_matches = in_window[
        (in_window['player2_name'] == player1_name)
        & (in_window['player1_name'].isin(player2_names))
    ]

    # Mirror the reverse records: every paired column trades places with its
    # counterpart. Labels missing from the frame are ignored by rename().
    mirrored_pairs = [
        ('player1_id', 'player2_id'),
        ('player1_name', 'player2_name'),
        ('runs_b1_b2', 'runs_b2_b1'),
        ('balls_b1_b2', 'balls_b2_b1'),
        ('boundaries_b1_b2', 'boundaries_b2_b1'),
        ('dismissals_b1_b2', 'dismissals_b2_b1'),
        ('strike_rate_b1_b2', 'strike_rate_b2_b1'),
        ('economy_b1_b2', 'economy_b2_b1'),
        ('fantasy_point_p1_p2', 'fantasy_point_p2_p1'),
    ]
    swap = {}
    for left, right in mirrored_pairs:
        swap[left] = right
        swap[right] = left
    reverse_matches = reverse_matches.rename(columns=swap)

    combined_df = pd.concat([direct_matches, reverse_matches], ignore_index=True)
    aggregated_df = combined_df.groupby(
        ['player1_id', 'player1_name', 'player2_id', 'player2_name']
    ).agg({
        'runs_b1_b2': 'sum',
        'balls_b1_b2': 'sum',
        'boundaries_b1_b2': 'sum',
        'dismissals_b1_b2': 'sum',
        'fantasy_point_p1_p2': 'sum',
        'match_date': 'count',  # one row per match -> number of matches
    }).reset_index()

    # Rates are recomputed from the totals; zero balls is treated as one ball
    # to avoid dividing by zero.
    safe_balls = aggregated_df['balls_b1_b2'].replace(0, 1)
    aggregated_df['strike_rate_b1_b2'] = aggregated_df['runs_b1_b2'] / safe_balls
    aggregated_df['economy_b1_b2'] = (aggregated_df['runs_b1_b2'] * 6) / safe_balls
    aggregated_df = aggregated_df.rename(columns={'match_date': 'number_of_matches_played'})

    result_rows = []
    for player2_name in player2_names:
        hit = aggregated_df[aggregated_df['player2_name'] == player2_name]
        if hit.empty:
            result_rows.append(dict.fromkeys(output_columns, 0))
        else:
            # If several groups share the name (different ids) the first is used.
            result_rows.append(hit[output_columns].iloc[0].to_dict())

    # Passing columns= pins the order and also gives an empty opponent list a
    # well-formed (0, 8) frame instead of a column-less one.
    return pd.DataFrame(result_rows, columns=output_columns)
def get_player_matchwise_stats(df, player, date, delta): 
    """Summarise a player's recent matches over a look-back window.

    Rows of ``df`` are kept when ``Player`` matches and ``Date`` lies in
    ``[date - delta days, date)``; the end date itself is excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Player``, ``Date``, ``EWMA Fantasy Points``, ``total_points``,
        ``Runs_Scored``, ``Wickets_Taken``, ``Balls_Faced``, ``Runs_Given``,
        ``Balls_Thrown``, ``Boundaries_Scored``, ``Boundaries_Given`` and
        ``Number_of_Dismissals``. Side effect: ``df['Date']`` is converted to
        datetime in place the first time it is seen as a non-datetime column.
    player
        Value compared for equality against the ``Player`` column.
    date
        End of the window (exclusive); anything ``pd.to_datetime`` accepts.
    delta : int
        Window length in days.

    Returns
    -------
    pandas.DataFrame
        Exactly one row with 14 columns. ``EWMA Fantasy Points`` is the value
        from the latest match in the window (missing -> 0); the rate columns
        are recomputed from the summed totals; everything else is a sum or a
        row count. All values are 0 when no row matches.
    """
    # Imports stay function-local; presumably so the function remains
    # self-contained when it is serialised with dill later in the notebook.
    import pandas as pd
    from datetime import timedelta

    # Only touch the caller's frame when the column still needs parsing.
    if not pd.api.types.is_datetime64_any_dtype(df['Date']):
        df['Date'] = pd.to_datetime(df['Date'])
    end_date = pd.to_datetime(date)
    start_date = end_date - timedelta(days=delta)

    recent = df[
        (df['Player'] == player)
        & (df['Date'] >= start_date)
        & (df['Date'] < end_date)
    ]
    if recent.empty:
        return pd.DataFrame([{
            'EWMA Fantasy Points': 0,
            'total_points': 0,
            'Runs': 0,
            'Wickets': 0,
            'Balls_Faced': 0,
            'Strike_Rate': 0,
            'matches_played': 0,
            'Runs_Given': 0,
            'Balls_Thrown': 0,
            'Boundaries_Scored': 0,
            'Boundaries_Given': 0,
            'Number_of_Dismissals': 0,
            'Economy': 0,
            'Batting_Average': 0
        }])

    recent = recent.sort_values(by='Date')
    # fillna returns a new Series. The previous chained
    # `frame[col].fillna(0, inplace=True)` is not guaranteed to write back
    # into the frame (and never does under pandas Copy-on-Write), which could
    # let a NaN through into the feature row.
    ewma = recent['EWMA Fantasy Points'].fillna(0)

    def _ratio(numerator, denominator):
        # A zero denominator is replaced by 1 so the ratio degrades to the numerator.
        return numerator / (denominator if denominator != 0 else 1)

    runs_scored = recent['Runs_Scored'].sum()
    balls_faced = recent['Balls_Faced'].sum()
    runs_given = recent['Runs_Given'].sum()
    balls_thrown = recent['Balls_Thrown'].sum()
    dismissals = recent['Number_of_Dismissals'].sum()

    aggregated_stats = {
        'EWMA Fantasy Points': ewma.iloc[-1],  # latest match in the window
        'total_points': recent['total_points'].sum(),
        'Runs': runs_scored,
        'Wickets': recent['Wickets_Taken'].sum(),
        'Balls_Faced': balls_faced,
        'Strike_Rate': _ratio(runs_scored, balls_faced),  # runs per ball
        'matches_played': len(recent),
        'Runs_Given': runs_given,
        'Balls_Thrown': balls_thrown,
        'Boundaries_Scored': recent['Boundaries_Scored'].sum(),
        'Boundaries_Given': recent['Boundaries_Given'].sum(),
        'Number_of_Dismissals': dismissals,
        'Economy': _ratio(runs_given * 6, balls_thrown),  # runs per six balls
        'Batting_Average': _ratio(runs_scored, dismissals),
    }
    return pd.DataFrame([aggregated_stats])
# Index of the next free row in the feature matrix; advanced by
# stack_and_pad_dataframes on every successful call.
counter=0
def stack_and_pad_dataframes(df1, df2, df3, df4, x_train=None):
    """Flatten four frames into one feature row and store it in ``x_train``.

    ``df1`` is first brought to exactly 12 rows (zero rows appended, or extra
    rows cut off), then all four frames are flattened row by row and
    concatenated in the order ``df1, df2, df3, df4``. The resulting vector is
    written to ``x_train[counter]`` and the module-level ``counter`` is
    incremented.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Variable-length block; padded or truncated to 12 rows.
    df2, df3, df4 : pandas.DataFrame
        Flattened as they are.
    x_train : numpy.ndarray, optional
        2-D target buffer. When omitted, the module-level ``x_train`` is
        looked up at call time. The previous ``x_train=x_train`` default was
        evaluated when the function was defined, which fails with NameError
        if the buffer is created in a later cell, keeps writing into a stale
        array after the buffer is re-allocated, and makes the whole buffer
        part of the function object when it is pickled.

    Returns
    -------
    None

    Raises
    ------
    NameError
        If ``x_train`` is not passed and no module-level ``x_train`` exists.
    """
    global counter
    if x_train is None:
        x_train = globals().get('x_train')
        if x_train is None:
            raise NameError(
                "x_train is not defined; allocate the feature matrix before "
                "calling stack_and_pad_dataframes or pass it explicitly"
            )

    required_rows = 12
    current_rows = df1.shape[0]
    if current_rows < required_rows:
        zero_padding = pd.DataFrame(
            0,
            index=range(required_rows - current_rows),
            columns=df1.columns
        )
        padded_df = pd.concat([df1, zero_padding], ignore_index=True)
    elif current_rows > required_rows:
        padded_df = df1.iloc[:required_rows, :]
    else:
        padded_df = df1

    # Row-major flattening of each frame, joined into a single 1-D vector.
    feature_row = np.concatenate([
        padded_df.to_numpy().ravel(),
        df2.to_numpy().ravel(),
        df3.to_numpy().ravel(),
        df4.to_numpy().ravel(),
    ])
    # Raises if the vector length does not match the buffer width or if the
    # buffer is full; counter is only advanced after a successful write.
    x_train[counter] = feature_row
    counter += 1 
def get_fantasy_points(df, player_name, match_date):
    """Look up the fantasy points of a player for one match date.

    Rows are selected where ``Player Name`` equals ``player_name`` and
    ``Match Date`` equals ``match_date`` (plain equality, no date parsing).
    Returns the ``Fantasy Points`` value of the first such row, or ``0`` when
    there is none.
    """
    is_player = df['Player Name'] == player_name
    on_date = df['Match Date'] == match_date
    hits = df[is_player & on_date]
    if hits.empty:
        return 0
    return hits['Fantasy Points'].values[0]
def get_weather_data(dataframe, date, venue): 
    """Return the weather columns recorded for a venue on a given start date.

    Rows are selected where ``start_date`` equals ``date`` and ``venue``
    equals ``venue`` (plain equality, no date parsing). Every matching row is
    returned, with ``start_date``, ``venue``, ``latitude`` and ``longitude``
    removed and the original index kept. When nothing matches, a single row
    with ``temperature``, ``precipitation`` and ``wind_speed`` set to 0 is
    returned instead.
    """
    import numpy as np 
    import pandas as pd 
    from datetime import timedelta

    same_date = dataframe['start_date'] == date
    same_venue = dataframe['venue'] == venue
    hits = dataframe[same_date & same_venue]

    if hits.empty:
        placeholder = {'temperature': 0, 'precipitation': 0, 'wind_speed': 0}
        return pd.DataFrame([placeholder])

    return hits.drop(columns=['start_date', 'venue', 'latitude', 'longitude'])

In [31]:
# Preallocated feature matrix and label vector; rows are filled one player
# sample at a time by the dataset-building cell, unused rows stay zero.
# Width 126 = 12 opponents x 8 head-to-head values + 13 venue values
# + 14 recent-form values + 3 weather values, as produced by the helper
# functions in this notebook (the weather width assumes three columns remain
# after the identifier columns are dropped - TODO confirm against the CSV).
x_train = np.zeros(shape=(70000, 126))
y_train = np.zeros(shape=x_train.shape[0])

In [32]:
# Directories holding one CSV per match, and the files found in each.
data1='dt/csv1'  
data2='dt/csv2' 
data3='dt/csv3' 
data_1=os.listdir(data1) 
data_2=os.listdir(data2) 
data_3=os.listdir(data3)  


def _build_player_sample(player, opponents, venue, date, weather):
    """Write one player's feature row into x_train and return the player's label.

    Look-back windows: 3000 days for venue history, 180 days for recent form
    and 800 days for head-to-head records against ``opponents``.
    """
    venue_stats = get_player_venue_stats(player_stats_with_date_venue, player, venue, date, 3000)
    recent_form = get_player_matchwise_stats(all_matches_player, player, date, 180)
    head_to_head = get_player_vs_player_stats_ordered(player_vs_player_stats, player, opponents, date, 800)
    # Feature order inside the row: head-to-head, venue, recent form, weather.
    stack_and_pad_dataframes(head_to_head, venue_stats, recent_form, weather)
    return get_fantasy_points(fantasy_points, player, date)


# `i` indexes y_train here, while stack_and_pad_dataframes indexes x_train
# through the global `counter`. Both are reset together so that re-running
# this cell refills the buffers from row 0 and features stay aligned with
# labels.
i=0
counter=0
for match_dir, match_files in ((data1, data_1), (data2, data_2), (data3, data_3)):
    for match_file in match_files:
        match = pd.read_csv(os.path.join(match_dir, match_file))
        # Only the first row of each match file is used, so only that row is parsed.
        team1_players = ast.literal_eval(match['team1_players'].iloc[0])
        team2_players = ast.literal_eval(match['team2_players'].iloc[0])
        date = match['date'].iloc[0]
        venue = match['venue'].iloc[0]
        # Weather depends on the match only, so it is looked up once per match
        # rather than once per player.
        weather = get_weather_data(venues_with_dates_with_locations_with_weather, date, venue)

        # Team 1 players face team 2 as opponents, then the other way round.
        for squad, opponents in ((team1_players, team2_players), (team2_players, team1_players)):
            for player in squad:
                y_train[i] = _build_player_sample(player, opponents, venue, date, weather)
                i = i + 1

In [57]:
# Keep the first 52000 rows of the buffers and write them out as CSV
# (column headers are written, the index is not).
# NOTE(review): 52000 is hard-coded; confirm it matches the number of rows the
# dataset-building cell actually filled (its final `i`), otherwise all-zero
# rows are exported or real samples are cut off.
x_train2 = pd.DataFrame(x_train[:52000, :])
y_train2 = pd.DataFrame(y_train[:52000])

x_train2.to_csv('x_train.csv', index=False)
y_train2.to_csv('y_train.csv', index=False)

In [56]:
# Display the label vector (NumPy abbreviates the repr of large arrays).
y_train

array([ 7., 85., 86., ...,  0.,  0.,  0.])

In [67]:
import dill

# Serialise each helper function to its own pickle file with dill, in the
# order listed. The file name is the function name plus ".pkl".
_functions_to_export = [
    ("get_player_venue_stats.pkl", get_player_venue_stats),
    ("get_player_matchwise_stats.pkl", get_player_matchwise_stats),
    ("get_player_vs_player_stats_ordered.pkl", get_player_vs_player_stats_ordered),
    ("get_weather_data.pkl", get_weather_data),
    ("stack_and_pad_dataframes.pkl", stack_and_pad_dataframes),
    ("get_fantasy_points.pkl", get_fantasy_points),
]
for _pkl_name, _function in _functions_to_export:
    with open(_pkl_name, "wb") as f:
        dill.dump(_function, f)