In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
# Lift pandas' display limits so no column is hidden and no cell text is truncated.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Read the metrics workbook and give the historical score column its new name.
df = (
    pd.read_excel(r"/kaggle/input/all-metrics/All_metrics.xlsx")
    .rename(columns={"hist_player_score": "hist_player_performance"})
)

# Index the rows by game_id; the game_id column itself stays in the frame for now.
df.index = df.game_id
df

In [None]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Columns that are expanded into one indicator column per distinct value.
categorical_columns = ['time_control_name', 'game_end_reason', 'lexicon', 'rating_mode', 'Bot_nickname']

df = (
    # The game_id column is removed here; the index set earlier is left untouched.
    df.drop(columns='game_id')
    # first_bot is 1 when 'first' holds one of the three listed bot names, else 0.
    .assign(first_bot=lambda frame: frame['first'].isin(["HastyBot", "BetterBot", "STEEBot"]).astype(int))
    .drop(columns='first')
    # One-hot encode the categorical columns.
    .pipe(pd.get_dummies, columns=categorical_columns)
    # Remove 'Player_nickname', 'created_at' and 'Bot_game'.
    .drop(columns=['Player_nickname', 'created_at', "Bot_game"])
)

# Reorder the columns so that the target, 'Player_rating', comes last.
cols = [name for name in df.columns if name != 'Player_rating'] + ['Player_rating']
df = df[cols]

df

In [None]:
# Rows with a Player_rating value go to `train`; rows where it is missing go to `test`.
train = df[df["Player_rating"].notna()]
test = df[df["Player_rating"].isna()]

In [None]:
# Rebind `df` to the rows that have a Player_rating; the following cells read `df`.
df = train

In [None]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Split the frame into the feature matrix and the target column.
y = df['Player_rating']
X = df.drop(columns='Player_rating')

# Fit a random forest (fixed seed) on all of `df`; fit() returns the fitted estimator.
rf_model = RandomForestRegressor(random_state=42).fit(X, y)

# Pair each feature name with its importance and order the table from most to
# least important.
feature_importances = rf_model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

print(feature_importance_df)

In [None]:
# Keep the features whose importance is above 0.005, ordered from most to least important.
selected_features = (
    feature_importance_df
    .loc[lambda table: table['Importance'] > 0.005]
    .sort_values(by='Importance', ascending=False)
)

selected_features

In [None]:
# Names of the retained features, in importance order.
selected_feature_names = selected_features['Feature'].to_list()

# Restrict `df` to those features plus the target column.
df_selected = df[[*selected_feature_names, 'Player_rating']]

# Preview the first rows of the reduced frame.
df_selected.head()

In [None]:
from lightgbm import LGBMRegressor
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np


def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions."""
    return np.sqrt(np.mean((y_true - y_pred) ** 2))


# Feature matrix and target taken from the reduced frame.
X = df_selected.drop('Player_rating', axis=1)
y = df_selected['Player_rating']

# Pipeline: standardise the features, then fit LightGBM.
# subsample_freq=1 turns row bagging on; with LightGBM's default of 0 the
# `subsample` values in the grid below are silently ignored.
# random_state pins the sampling so repeated runs give the same result.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lgbm', LGBMRegressor(subsample_freq=1, random_state=42))
])

# Hyper-parameter grid explored by the search (keys are prefixed with the step name).
param_grid = {
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__max_depth': [6, 8, 12, 15],
    'lgbm__learning_rate': [0.01, 0.1, 0.2],
    'lgbm__min_child_samples': [5, 10, 20],
    'lgbm__subsample': [0.8, 0.9, 1.0],
    'lgbm__colsample_bytree': [0.8, 0.9, 1.0]
}

# RMSE is an error, so lower is better. greater_is_better=False makes the scorer
# return the negated RMSE; without it GridSearchCV would maximise the error and
# select the worst parameter combination.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# 10-fold grid search over the pipeline; verbose=2 prints progress for each fit.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring=rmse_scorer, n_jobs=-1, verbose=2)

# Run the search. With the default refit=True the best pipeline is refitted on all of X, y.
grid_search.fit(X, y)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
# best_score_ holds the negated RMSE, so flip the sign for reporting.
print("Best CV RMSE:", -grid_search.best_score_)

# Get the best model (already fitted on the full training data)
best_model = grid_search.best_estimator_
print("\nBest Model:", best_model)

# Report per-fold RMSE for the chosen configuration on a shuffled 10-fold split.
# These folds are shuffled, so they are not the same folds the grid search used.
print("\nRunning cross-validation. This may take some time...")
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit an unfitted copy so `best_model` keeps its full-data fit; refitting
    # `best_model` itself would leave it trained on only the last fold's subset.
    fold_model = clone(best_model)
    fold_model.fit(X_train, y_train)
    y_pred = fold_model.predict(X_test)

    fold_rmse = rmse(y_test, y_pred)
    print(f"Fold {fold}: RMSE = {fold_rmse}")


In [None]:
# Display the estimator held in `best_model` (taken from grid_search.best_estimator_).
best_model

In [None]:
# Restrict the unlabelled rows to the selected features plus the target column.
test_df_selected = test[[*selected_feature_names, 'Player_rating']]

# Not rendered: a notebook only displays the last expression of a cell.
test_df_selected.head()

# Separate the target column from the features.
test_y = test_df_selected['Player_rating']
test_X = test_df_selected.drop(columns='Player_rating')

# Append the model's predictions, then copy the index into a 'game_id' column.
test_X = test_X.assign(Predictions=best_model.predict(test_X))
test_X = test_X.assign(game_id=test_X.index)
test_X

In [None]:
from datetime import datetime

# Timestamped file name so repeated runs do not overwrite earlier exports.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
file_name = f"final_test_results_{timestamp}.csv"

# Keep the id and prediction columns and rename the prediction column to "rating".
# rename() returns a new frame; renaming in place on a column selection of
# `test_X` can raise SettingWithCopyWarning.
final_test_results = test_X[["game_id", "Predictions"]].rename(columns={"Predictions": "rating"})

# Drop the index before sorting: it was built from game_id earlier in the notebook,
# so sorting by 'game_id' while it is also a column would presumably be ambiguous.
final_test_results_reset = final_test_results.reset_index(drop=True)
final_test_results_sorted = final_test_results_reset.sort_values(by='game_id', ascending=True)

# Write the sorted predictions to CSV without the positional index.
final_test_results_sorted.to_csv(file_name, index=False)
