In [14]:
import numpy as np 
import pandas as pd 
import xgboost as xgb 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error 
import warnings 
# Suppress every Python warning for the rest of the session.
# NOTE(review): no category/module filter is given, so this also hides
# deprecation and data-quality warnings — consider narrowing it.
warnings.filterwarnings('ignore') 

In [15]:
# Read the full feature and target tables, then split them by row position:
# rows before `split_at` form the training set, the remaining rows the test set.
features_df = pd.read_csv('x_train.csv')
targets_df = pd.read_csv('y_train.csv')
split_at = 45000

x_train = features_df.iloc[:split_at, :].to_numpy()
y_train = targets_df.iloc[:split_at].to_numpy()
x_test = features_df.iloc[split_at:, :].to_numpy()
y_test = targets_df.iloc[split_at:].to_numpy()

In [18]:
# Collapse every sample to a flat feature vector (rows x features) before
# handing it to XGBoost.
x_train = x_train.reshape(x_train.shape[0], -1)
dtrain = xgb.DMatrix(x_train, label=y_train)

# Booster parameters for the native `xgb.train` API.
# The number of boosting rounds is NOT a booster parameter in this API: it is
# controlled solely by `num_round` below. A scikit-learn style `n_estimators`
# entry is ignored by `xgb.train`, so it is intentionally not listed here.
params = {
    'objective': 'reg:squarederror',  # Regression objective for continuous target
    'eval_metric': 'rmse',            # Root Mean Squared Error, suitable for regression
    'max_depth': 5,                   # Slightly deeper tree depth to capture more complexity
    'learning_rate': 0.01,            # Lower learning rate for better generalization
    'subsample': 0.6,                 # Use 60% of data per tree for better generalization
    'colsample_bytree': 0.8,          # Use 80% of features per tree
    'reg_alpha': 1,                   # L1 regularization to reduce overfitting
    'reg_lambda': 1                   # L2 regularization to control complexity
}

# Datasets whose `eval_metric` is reported while training.
watchlist = [(dtrain, 'train')]

# Total number of boosting rounds (trees) to fit.
num_round = 1000

# Passing `evals` only adds progress reporting (every 100 rounds); without
# early stopping it does not change the fitted model.
model = xgb.train(params, dtrain, num_round, evals=watchlist, verbose_eval=100)

In [19]:
# Flatten each held-out sample to a single feature row, wrap the result in a
# DMatrix and score it with the trained booster.
x_test = x_test.reshape(len(x_test), -1)
dtest = xgb.DMatrix(data=x_test)
y_pred = model.predict(dtest)

In [48]:
import numpy as np

def process_chunks(y_pred, y_test, chunk_size=22, top_k=11):
    """Split predictions and targets into fixed-size chunks and rank each chunk.

    Both inputs are flattened to 1-D and cut into consecutive chunks of
    ``chunk_size`` elements; a trailing remainder of ``y_pred`` that does not
    fill a whole chunk is dropped. ``y_test`` must hold at least as many
    elements as the chunked part of ``y_pred`` (otherwise indexing raises
    ``IndexError``).

    Parameters
    ----------
    y_pred : array-like
        Predicted values, one per element.
    y_test : array-like
        True values, aligned element-for-element with ``y_pred``.
    chunk_size : int, default 22
        Number of consecutive elements that form one chunk.
    top_k : int, default 11
        How many of the highest-predicted elements to select per chunk.

    Returns
    -------
    sorted_y_test_chunks : list of ndarray
        For each chunk, the TRUE values sorted in descending order.
    sorted_y_pred_chunks : list of ndarray
        For each chunk, the TRUE values of the ``top_k`` highest-predicted
        elements (ordered by descending prediction), followed by the true
        values of the remaining elements in their original order.
    top_11_indices_pred : list of ndarray
        For each chunk, the within-chunk indices of the ``top_k``
        highest-predicted elements, ordered by descending prediction.
    """
    y_pred = np.asarray(y_pred).ravel()
    y_test = np.asarray(y_test).ravel()

    # Only whole chunks of predictions are processed.
    usable_length = (len(y_pred) // chunk_size) * chunk_size

    sorted_y_test_chunks = []
    sorted_y_pred_chunks = []
    top_indices_pred = []

    for start in range(0, usable_length, chunk_size):
        stop = start + chunk_size
        pred_chunk = y_pred[start:stop]
        true_chunk = y_test[start:stop]

        # Reversed ascending argsort == descending order (ties keep the
        # reverse-index order the callers have always seen).
        picked = np.argsort(pred_chunk)[::-1][:top_k]
        top_indices_pred.append(picked)

        # Boolean mask of the elements that were not picked; indexing with it
        # yields them in ascending index order.
        not_picked = np.ones(chunk_size, dtype=bool)
        not_picked[picked] = False
        sorted_y_pred_chunks.append(
            np.concatenate([true_chunk[picked], true_chunk[not_picked]])
        )

        # Best achievable ordering: true values in descending order.
        sorted_y_test_chunks.append(true_chunk[np.argsort(true_chunk)[::-1]])

    return sorted_y_test_chunks, sorted_y_pred_chunks, top_indices_pred
# process_chunks works on flat 1-D arrays.
y_test = y_test.flatten()
y_pred = y_pred.flatten()
sorted_y_test_chunks, sorted_y_pred_chunks, top_11_indices_pred = process_chunks(y_pred, y_test)

mae_list, sum1_list = [], []

# Per-chunk score: first entry weighted 2x, second entry 1.5x, the next nine 1x.
# `sum1` is the score of the truly best ordering; `sum2` is the score obtained
# from the entries the model ranked highest.
for chunk_no, (picked_chunk, ideal_chunk) in enumerate(
        zip(sorted_y_pred_chunks, sorted_y_test_chunks), start=1):
    sum1 = ideal_chunk[0] * 2 + ideal_chunk[1] * 1.5 + np.sum(ideal_chunk[2:11])
    sum2 = picked_chunk[0] * 2 + picked_chunk[1] * 1.5 + np.sum(picked_chunk[2:11])

    error = np.abs(sum1 - sum2)
    mae_list.append(error)
    sum1_list.append(sum1)

    print(f"Chunk {chunk_no} Error: {error}, True Sum: {sum1}, Predicted Sum: {sum2}")

print("Mean Absolute Error (MAE):", np.mean(mae_list))
print("Average Dream Team Points in ODI:", np.mean(sum1_list))


Chunk 1 Error: 378.5, True Sum: 1232.0, Predicted Sum: 853.5
Chunk 2 Error: 612.0, True Sum: 1112.0, Predicted Sum: 500.0
Chunk 3 Error: 549.5, True Sum: 1146.5, Predicted Sum: 597.0
Chunk 4 Error: 438.0, True Sum: 939.5, Predicted Sum: 501.5
Chunk 5 Error: 252.0, True Sum: 886.0, Predicted Sum: 634.0
Chunk 6 Error: 476.5, True Sum: 1076.0, Predicted Sum: 599.5
Chunk 7 Error: 399.0, True Sum: 1052.5, Predicted Sum: 653.5
Chunk 8 Error: 399.5, True Sum: 1218.5, Predicted Sum: 819.0
Chunk 9 Error: 387.0, True Sum: 745.0, Predicted Sum: 358.0
Chunk 10 Error: 170.5, True Sum: 691.5, Predicted Sum: 521.0
Chunk 11 Error: 273.5, True Sum: 1030.0, Predicted Sum: 756.5
Chunk 12 Error: 530.0, True Sum: 921.5, Predicted Sum: 391.5
Chunk 13 Error: 217.5, True Sum: 681.5, Predicted Sum: 464.0
Chunk 14 Error: 612.5, True Sum: 936.0, Predicted Sum: 323.5
Chunk 15 Error: 315.0, True Sum: 1065.0, Predicted Sum: 750.0
Chunk 16 Error: 340.0, True Sum: 1082.0, Predicted Sum: 742.0
Chunk 17 Error: 336.0, T

In [53]:
import pickle
# Persist the trained booster. The context manager guarantees the file is
# flushed and closed even if pickling fails, instead of leaking the handle.
# NOTE: only unpickle this file from a trusted location — loading a pickle
# executes arbitrary code.
with open("xgb_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [1]:
import os
import json
import pandas as pd

# Map each JSON input folder to the CSV folder name used in the output paths.
folder_mapping = {
    'data1_json': 'csv1',
    'data2_json': 'csv2',
    'data3_json': 'csv3'
}

# One dict per match file: output path, the two teams and the first date.
rows = []

# Base directory (adjust if needed)
base_dir = os.getcwd()  # or specify the directory where data folders are located

for folder, csv_folder in folder_mapping.items():
    folder_path = os.path.join(base_dir, folder)

    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of the resulting table is reproducible across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue

        file_full_path = os.path.join(folder_path, filename)
        with open(file_full_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Teams are read from info -> teams and dates from info -> dates.
        # `or {}` / `or []` also cover keys that are present but null.
        info = data.get('info') or {}
        teams = info.get('teams') or []
        date_list = info.get('dates') or []

        # Assign team1 and team2 based on list order; missing entries become None.
        team1 = teams[0] if len(teams) > 0 else None
        team2 = teams[1] if len(teams) > 1 else None

        # Only the first listed date is kept.
        date = date_list[0] if date_list else None

        # Output path: dt/{csv_folder}/{filename with .csv extension}.
        # os.path.join uses the platform separator (backslashes on Windows).
        base_file, _ = os.path.splitext(filename)
        new_filename = base_file + '.csv'
        new_file_path = os.path.join('dt', csv_folder, new_filename)

        rows.append({
            'file_path': new_file_path,
            'team1': team1,
            'team2': team2,
            'date': date
        })

# Create DataFrame
df = pd.DataFrame(rows)
print(df)

# Save the table to a CSV file.
df.to_csv('combined_data2.csv', index=False)


                file_path                 team1                 team2  \
0       dt\csv1\64814.csv           New Zealand                 India   
1       dt\csv1\64815.csv                 India           New Zealand   
2       dt\csv1\64816.csv                 India           New Zealand   
3       dt\csv1\64817.csv           New Zealand                 India   
4       dt\csv1\64819.csv                 India           New Zealand   
...                   ...                   ...                   ...   
2385  dt\csv3\1457465.csv  United Arab Emirates           Netherlands   
2386  dt\csv3\1457466.csv           Netherlands                  Oman   
2387  dt\csv3\1457467.csv  United Arab Emirates                  Oman   
2388  dt\csv3\1457468.csv           Netherlands  United Arab Emirates   
2389  dt\csv3\1457469.csv                  Oman           Netherlands   

            date  
0     2002-12-29  
1     2003-01-01  
2     2003-01-04  
3     2003-01-08  
4     2003-01-14  
...      