In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the four input tables from the working directory, one CSV per table.
games, train, test, turns = (
    pd.read_csv(f"{table}.csv") for table in ("games", "train", "test", "turns")
)

In [None]:
turns.head()

Unnamed: 0,game_id,turn_number,nickname,rack,location,move,points,score,turn_type
0,1,1,BetterBot,DDEGITT,8G,DIG,10,10,Play
1,1,2,stevy,AEHOPUX,7H,HAP,18,18,Play
2,1,3,BetterBot,DEELTTU,6I,LUTE,16,26,Play
3,1,4,stevy,EMORSUX,5K,UM,16,34,Play
4,1,5,BetterBot,ACDEITU,L5,..DICATE,28,54,Play


# Turns

### Words & Letters

In [None]:
import re
# Len of moves: keep only the letters of `move` (dots and every other
# non-letter character are stripped) and count them.
# Missing moves become "" rather than the literal string "nan", so they are
# no longer measured as a three-letter move.
# The former `.replace('.', '')` step was dropped: Series.replace matches whole
# cell values, and no "." can remain after the letter-only filter anyway.
turns['move_clean'] = (
    turns['move']
    .fillna('')
    .astype(str)
    .str.replace(r'[^a-zA-Z]', '', regex=True)
)
turns['move_len'] = turns['move_clean'].str.len()

In [None]:
# Len of rack
# (character count of the `rack` string; stays NaN where `rack` is missing)
turns['rack_len'] = turns['rack'].str.len()

In [None]:
# difficult words
import textstat

# Assign the filled column back instead of calling fillna(inplace=True) on the
# `turns["move"]` selection: an in-place call on a column selection is not
# guaranteed to write through to `turns` (pandas copy-on-write).
turns["move"] = turns["move"].fillna("None")
# One textstat.difficult_words result per move string.
turns["difficult_word"] = turns["move"].apply(textstat.difficult_words)

In [None]:
# Letter groups used to grade the tiles played in a move
difficult_letters = ["K", "J", "X", "Q", "Z"]
medium_letters = ["B", "C", "M", "P", "F", "H", "V", "W", "Y"]
easy_letters = ["A", "E", "I", "L", "N", "O", "R", "S", "T", "U", "D", "G"]

# One count column per group, named after the group. Only the exact
# (uppercase) letters listed above are counted.
for _column, _group in (
    ("difficult_letters", difficult_letters),
    ("medium_letters", medium_letters),
    ("easy_letters", easy_letters),
):
    turns[_column] = turns["move_clean"].map(
        lambda word, group=_group: sum(1 for ch in word if ch in group)
    )

In [None]:
# blank tiles = 0 points; counted here as the lowercase letters of the move
turns["blank_used"] = turns["move_clean"].map(
    lambda word: len([ch for ch in word if ch.islower()])
)

In [None]:
# bingo = extra 50 points; flagged (1/0) when the cleaned move has exactly 7 letters
turns["is_bingo"] = turns["move_len"].eq(7).astype("int64")

### location bonus

In [None]:
def location_bonus(location):
    """Rank the premium square on which a move starts.

    Parameters
    ----------
    location : str or missing value
        Board coordinate of the move, written either row-first ("8G") or
        column-first ("L5"). Non-string values (e.g. NaN) score 0.

    Returns
    -------
    int
        4 for a triple-word square, 3 for double-word, 2 for triple-letter,
        1 for double-letter, 0 otherwise.

    Notes
    -----
    Only the starting square is inspected; premium squares covered by the
    rest of the word are not considered.
    """
    if not isinstance(location, str):
        return 0

    # Normalise both notations to the row-first form used by the lists below.
    # The premium layout is symmetric across the main diagonal, so reading a
    # column-first coordinate ("L5") as "5L" selects an equivalent square.
    row = ''.join(ch for ch in location if ch.isdigit())
    col = ''.join(ch for ch in location if ch.isalpha()).upper()
    square = row + col

    if square in triple_word_score_lo:
        return 4
    if square in double_word_socre_lo:
        return 3
    if square in triple_letter_score_lo:
        return 2
    if square in double_letter_score_lo:
        return 1
    return 0

# Premium squares of the standard 15x15 board, written row-first.
# All eight triple-word squares are listed (the earlier list had three entries,
# one of them the plain square '3H').
triple_word_score_lo = ['1A','1H','1O','8A','8O','15A','15H','15O']
# '2N' replaces the earlier '2O', which is not a premium square.
# (The misspelled name is kept so existing references keep working.)
double_word_socre_lo = ['2B','3C','4D','5E','8H','5K','4L','3M','2N','14B','13C','12D','11E','11K','12L','13M','14N']
triple_letter_score_lo = ['2F','2J','6B','6F','6J','6N','10B','10F','10J','10N','14F','14J']
double_letter_score_lo = ['1D','1L','3G','3I','4A','4H','4O','7C','7G','7I','7M',\
                          '8D','8L','9C','9G','9I','9M','12A','12H','12O','13G','13I',\
                          '15D','15L']

# Score every turn by the premium square found at its `location`
turns['bonus'] = turns["location"].apply(location_bonus)

### Turn Type

In [None]:
# Count the turns of each turn_type per (game_id, nickname)
turn_sizes = turns.groupby(['game_id', 'nickname', 'turn_type']).size()
# Long form: one row per (game_id, nickname, turn_type) with its count
turn_type_counts = turn_sizes.reset_index(name='count')
# Wide form: one row per (game_id, nickname), one integer column per turn type,
# 0 where a player never made that kind of turn.
# The columns are already flat strings here, so the only clean-up needed is
# removing the "turn_type" label from the column axis; the earlier per-label
# flattening indexed individual characters of each name and changed nothing.
pivot_table = (
    turn_sizes
    .unstack('turn_type', fill_value=0)
    .rename_axis(columns=None)
    .reset_index()
)
pivot_table.head()

Unnamed: 0,game_id,nickname,Challenge,End,Exchange,Pass,Play,Six-Zero Rule,Timeout
0,1,BetterBot,0,0,0,0,13,0,0
1,1,stevy,0,1,0,0,13,0,0
2,2,BetterBot,0,0,0,0,12,0,0
3,2,Super,0,1,1,1,11,0,0
4,3,BetterBot,0,0,1,0,12,0,0


### turns_agg

In [None]:
# Aggregate the turn-level features to one row per (game_id, nickname).
# Each entry reads: output column -> (source column, aggregation).
turn_summaries = {
    "total_score": ("score", "last"),
    "avg_point": ("points", "mean"),
    "max_point": ("points", "max"),
    "avg_move_len": ("move_len", "mean"),
    "max_move_len": ("move_len", "max"),
    "difficult_words": ("difficult_word", "sum"),
    "difficult_letters": ("difficult_letters", "sum"),
    "medium_letters": ("medium_letters", "sum"),
    "easy_letters": ("easy_letters", "sum"),
    "blank_used": ("blank_used", "sum"),
    "bingo_cnt": ("is_bingo", "sum"),
    "total_bonus": ("bonus", "sum"),
    "avg_bonus": ("bonus", "mean"),
}
turns_agg = turns.groupby(["game_id", "nickname"]).agg(**turn_summaries)
turns_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_score,avg_point,max_point,avg_move_len,max_move_len,difficult_words,difficult_letters,medium_letters,easy_letters,blank_used,bingo_cnt,total_bonus,avg_bonus
game_id,nickname,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,BetterBot,335,25.769231,68,4.0,7,3,3,9,40,0,1,3,0.230769
1,stevy,429,30.642857,98,3.428571,7,1,2,9,35,2,2,8,0.571429
2,BetterBot,401,33.416667,85,4.25,7,3,3,6,41,1,3,6,0.5
2,Super,488,34.857143,94,3.857143,7,5,2,13,38,1,3,2,0.142857
3,BetterBot,318,24.461538,76,3.692308,7,2,3,11,33,1,1,7,0.538462


# Merge

In [None]:
# Attach the aggregated turn features to the train and test player rows;
# a left join keeps every train/test row.
merge_keys = ["game_id", "nickname"]
df = pd.merge(train, turns_agg, how="left", on=merge_keys)
test_df = pd.merge(test, turns_agg, how="left", on=merge_keys)
df.head()

Unnamed: 0,game_id,nickname,score,rating,total_score,avg_point,max_point,avg_move_len,max_move_len,difficult_words,difficult_letters,medium_letters,easy_letters,blank_used,bingo_cnt,total_bonus,avg_bonus
0,1,BetterBot,335,1637,335,25.769231,68,4.0,7,3,3,9,40,0,1,3,0.230769
1,1,stevy,429,1500,429,30.642857,98,3.428571,7,1,2,9,35,2,2,8,0.571429
2,3,davidavid,440,1811,440,31.428571,103,3.857143,7,2,4,7,42,1,1,3,0.214286
3,3,BetterBot,318,2071,318,24.461538,76,3.692308,7,2,3,11,33,1,1,7,0.538462
4,4,Inandoutworker,119,1473,119,8.5,26,1.857143,6,1,0,9,17,0,0,11,0.785714


In [None]:
test_df

Unnamed: 0,game_id,nickname,score,rating,total_score,avg_point,max_point,avg_move_len,max_move_len,difficult_words,difficult_letters,medium_letters,easy_letters,blank_used,bingo_cnt,total_bonus,avg_bonus
0,2,Super,488,,488,34.857143,94,3.857143,7,5,2,13,38,1,3,2,0.142857
1,2,BetterBot,401,2000.0,401,33.416667,85,4.250000,7,3,3,6,41,1,3,6,0.500000
2,7,STEEBot,377,2082.0,377,26.928571,62,4.142857,7,2,3,6,48,1,2,4,0.285714
3,7,Priya1,379,,379,29.153846,68,3.384615,7,1,2,12,29,1,1,5,0.384615
4,11,STEEBot,334,1829.0,334,22.266667,76,3.400000,7,2,0,9,41,1,1,1,0.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44721,72762,kyjo55555,367,,367,26.214286,67,3.357143,7,2,3,8,34,2,1,14,1.000000
44722,72768,HastyBot,524,2356.0,524,47.636364,82,4.727273,7,0,4,10,36,2,4,0,0.000000
44723,72768,Maximilian,357,,357,29.750000,83,4.000000,7,1,1,8,39,0,2,2,0.166667
44724,72769,STEEBot,626,2110.0,626,48.153846,194,4.153846,7,5,4,7,43,0,3,4,0.307692


In [None]:
# Create bot df
bot_names = ["BetterBot", "STEEBot", "HastyBot"]


def _bot_rows(players):
    """Return the bot rows of `players` with bot-prefixed column names.

    Keeps rows whose `nickname` is in `bot_names` and returns the columns
    game_id, bot_score, bot_rating, bot_name (original index preserved).
    """
    is_bot = players["nickname"].isin(bot_names)
    return players.loc[is_bot, ["game_id", "score", "rating", "nickname"]].rename(
        columns={"score": "bot_score", "rating": "bot_rating", "nickname": "bot_name"}
    )


# Same extraction for train and test (previously two copy-pasted passes).
bot_df = _bot_rows(df)
test_bot_df = _bot_rows(test_df)
test_bot_df.head()

Unnamed: 0,game_id,bot_score,bot_rating,bot_name
1,2,401,2000.0,BetterBot
2,7,377,2082.0,STEEBot
4,11,334,1829.0,STEEBot
6,14,403,2136.0,STEEBot
9,27,453,2258.0,HastyBot


In [None]:
# Bot scores don't change much, I think we should focus on how the player
# rankings were calculated.
# Put their info in players feature to show the level and rank dif instead


def _players_with_bot_and_game(players, bots):
    """Return human rows of `players` joined with their bot opponent and game info.

    Parameters
    ----------
    players : DataFrame with `game_id` and `nickname` columns.
    bots : DataFrame keyed by `game_id` holding the bot_* columns.

    Returns
    -------
    DataFrame with the bot rows removed, the bot columns and the `games`
    columns merged on `game_id`, and `created_at` parsed as datetime.
    If `players` already carries `bot_name` it is returned unchanged, so
    re-running the cell does not merge the same columns a second time.
    """
    if "bot_name" in players.columns:
        return players
    humans = players[~players['nickname'].isin(bot_names)]  # take out the bots
    merged = humans.merge(bots, on="game_id")  # add in bot information
    merged = merged.merge(games, on="game_id")  # merge game information
    merged["created_at"] = pd.to_datetime(merged["created_at"])
    return merged


df = _players_with_bot_and_game(df, bot_df)
test_df = _players_with_bot_and_game(test_df, test_bot_df)
test_df.head()

Unnamed: 0,game_id,nickname,score,rating,total_score,avg_point,max_point,avg_move_len,max_move_len,difficult_words,...,time_control_name,game_end_reason,winner,created_at,lexicon,initial_time_seconds,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds
0,2,Super,488,,488,34.857143,94,3.857143,7,5,...,regular,STANDARD,1,2022-08-10 19:19:59,CSW21,3600,0,RATED,1,364.214418
1,7,Priya1,379,,379,29.153846,68,3.384615,7,1,...,regular,STANDARD,1,2022-08-26 03:07:48,CSW21,1260,0,RATED,1,385.599607
2,11,TileRunner,462,,462,33.0,66,3.928571,7,3,...,regular,STANDARD,1,2022-08-22 01:13:10,NWL20,1200,0,CASUAL,1,501.739156
3,14,Anfield223,359,,359,23.933333,98,3.266667,7,1,...,regular,STANDARD,0,2022-09-14 14:06:24,CSW21,900,0,RATED,1,293.253051
4,27,friesbasil,456,,456,41.454545,90,4.0,7,1,...,regular,STANDARD,1,2022-09-13 09:12:15,CSW21,1200,0,RATED,1,203.96323


### game level

In [None]:
# define game level from the opposing bot: BetterBot=1, STEEBot=2, HastyBot=3
# (0 when `bot_name` is none of these).
# The level is read from `bot_name` because the bot rows have already been
# removed from df: `nickname` is never a bot there, and testing `first`
# only detected the bot in games where the bot moved first.
# The same column is added to test_df so train and test share the feature.
bot_levels = {"BetterBot": 1, "STEEBot": 2, "HastyBot": 3}
df['game_level'] = df['bot_name'].map(bot_levels).fillna(0).astype(int)
test_df['game_level'] = test_df['bot_name'].map(bot_levels).fillna(0).astype(int)
df.head(5)

Unnamed: 0,game_id,nickname,score,rating,total_score,avg_point,max_point,avg_move_len,max_move_len,difficult_words,...,game_end_reason,winner,created_at,lexicon,initial_time_seconds,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds,game_level
0,1,stevy,429,1500,429,30.642857,98,3.428571,7,1,...,STANDARD,1,2022-08-26 03:38:49,NWL20,1200,0,CASUAL,1,674.844274,1
1,3,davidavid,440,1811,440,31.428571,103,3.857143,7,2,...,STANDARD,1,2022-09-04 08:04:27,CSW21,900,0,RATED,5,492.268262,1
2,4,Inandoutworker,119,1473,119,8.5,26,1.857143,6,1,...,RESIGNED,0,2022-09-12 02:36:19,CSW21,3600,0,CASUAL,1,350.861141,1
3,5,stevy,325,1500,325,20.3125,51,2.875,5,1,...,STANDARD,0,2022-09-06 04:31:36,NWL20,1200,0,CASUAL,1,642.688722,2
4,6,HivinD,378,2029,378,31.5,74,4.083333,6,2,...,STANDARD,0,2022-08-21 14:56:35,CSW21,900,0,RATED,1,426.950541,0


### First player

In [None]:
# first player or not: 1 when the row's nickname equals the game's `first`, else 0.
# A direct comparison replaces np.select over two complementary conditions.
# The flag is also added to test_df so train and test share the feature.
df['is_first_player'] = (df['nickname'] == df['first']).astype(int)
test_df['is_first_player'] = (test_df['nickname'] == test_df['first']).astype(int)
df.head(5)

Unnamed: 0,game_id,nickname,score,rating,total_score,avg_point,max_point,avg_move_len,max_move_len,difficult_words,...,winner,created_at,lexicon,initial_time_seconds,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds,game_level,is_first_player
0,1,stevy,429,1500,429,30.642857,98,3.428571,7,1,...,1,2022-08-26 03:38:49,NWL20,1200,0,CASUAL,1,674.844274,1,0
1,3,davidavid,440,1811,440,31.428571,103,3.857143,7,2,...,1,2022-09-04 08:04:27,CSW21,900,0,RATED,5,492.268262,1,0
2,4,Inandoutworker,119,1473,119,8.5,26,1.857143,6,1,...,0,2022-09-12 02:36:19,CSW21,3600,0,CASUAL,1,350.861141,1,0
3,5,stevy,325,1500,325,20.3125,51,2.875,5,1,...,0,2022-09-06 04:31:36,NWL20,1200,0,CASUAL,1,642.688722,2,0
4,6,HivinD,378,2029,378,31.5,74,4.083333,6,2,...,0,2022-08-21 14:56:35,CSW21,900,0,RATED,1,426.950541,0,1


In [None]:
# Drop players who appear more than once and whose rating is 1500 in every game.
rated_1500 = df["rating"].eq(1500)
users_1500 = df.loc[rated_1500, "nickname"]

# Per candidate player: number of games ('nickname') and number of games rated 1500 ('rating')
candidates = df.loc[df["nickname"].isin(users_1500)]
anomalous = candidates.groupby("nickname").agg(
    {'nickname': 'count', 'rating': lambda ratings: np.sum(ratings == 1500)}
)

# Share of the player's games that sit at exactly 1500
anomalous["ratio"] = anomalous["rating"].div(anomalous["nickname"])
always_1500 = anomalous["ratio"].ge(1.0)
repeat_player = anomalous["nickname"].gt(1)
anomalous_users = anomalous.loc[always_1500 & repeat_player].index
df = df.loc[~df["nickname"].isin(anomalous_users)]


### One-Hot Encoding

In [None]:
import category_encoders as ce
categorical_cols = ["bot_name", "rating_mode", "lexicon", "game_end_reason"]
encoder = ce.OneHotEncoder(cols=categorical_cols, use_cat_names=True)
# Fit on the training rows only, then apply that same fitted mapping to both
# frames. Re-fitting on the test rows produced a different set and order of
# dummy columns whenever the categories seen in test differed from train.
encoder.fit(df[categorical_cols])
df = df.join(encoder.transform(df[categorical_cols]))
test_df = test_df.join(encoder.transform(test_df[categorical_cols]))

In [None]:
df.head()

Unnamed: 0,game_id,nickname,score,rating,total_score,avg_point,max_point,avg_move_len,max_move_len,difficult_words,...,rating_mode_RATED,rating_mode_CASUAL,lexicon_CSW21,lexicon_NWL20,lexicon_ECWL,lexicon_NSWL20,game_end_reason_STANDARD,game_end_reason_RESIGNED,game_end_reason_TIME,game_end_reason_CONSECUTIVE_ZEROES
1,3,davidavid,440,1811,440,31.428571,103,3.857143,7,2,...,1,0,1,0,0,0,1,0,0,0
2,4,Inandoutworker,119,1473,119,8.5,26,1.857143,6,1,...,0,1,1,0,0,0,0,1,0,0
4,6,HivinD,378,2029,378,31.5,74,4.083333,6,2,...,1,0,1,0,0,0,1,0,0,0
5,8,AliSalman1,414,2067,414,37.636364,91,4.272727,7,3,...,1,0,1,0,0,0,1,0,0,0
6,9,cccc,364,1641,364,24.266667,88,3.466667,7,2,...,1,0,0,1,0,0,1,0,0,0


In [None]:
df.columns

Index(['game_id', 'nickname', 'score', 'rating', 'total_score', 'avg_point',
       'max_point', 'avg_move_len', 'max_move_len', 'difficult_words',
       'difficult_letters', 'medium_letters', 'easy_letters', 'blank_used',
       'bingo_cnt', 'total_bonus', 'avg_bonus', 'bot_score', 'bot_rating',
       'bot_name', 'first', 'time_control_name', 'game_end_reason', 'winner',
       'created_at', 'lexicon', 'initial_time_seconds', 'increment_seconds',
       'rating_mode', 'max_overtime_minutes', 'game_duration_seconds',
       'bot_name_BetterBot', 'bot_name_STEEBot', 'bot_name_HastyBot',
       'rating_mode_RATED', 'rating_mode_CASUAL', 'lexicon_CSW21',
       'lexicon_NWL20', 'lexicon_ECWL', 'lexicon_NSWL20',
       'game_end_reason_STANDARD', 'game_end_reason_RESIGNED',
       'game_end_reason_TIME', 'game_end_reason_CONSECUTIVE_ZEROES'],
      dtype='object')

In [None]:
def win_or_not(row):
    """Return 1 if this row's player is recorded as the game's winner, else 0.

    `row` must provide 'nickname', 'first' and 'winner'. The player counts as
    the winner when they moved first and `winner` is 1, or when they did not
    move first and `winner` is 0; any other `winner` value yields 0.

    NOTE(review): this reads `winner == 1` as "the first player won" —
    confirm against the games.csv data dictionary.
    """
    moved_first = row['nickname'] == row['first']
    winning_value = 1 if moved_first else 0
    return 1 if row['winner'] == winning_value else 0

# Row-wise win flag for both frames; win_or_not reads each row's
# 'nickname', 'first' and 'winner' values.
df['win_or_not'] = df.apply(win_or_not, axis = 1)
test_df['win_or_not'] = test_df.apply(win_or_not, axis = 1)

### Cumulative

In [None]:
def cumm_player_features(df, cat_features, n_prev_games=10):
    """Rolling per-player statistics over the games played before each row.

    Parameters
    ----------
    df : DataFrame
        Needs the columns 'nickname', 'created_at', 'score' and 'winner',
        and a unique index (results are aligned back on it).
    cat_features : any
        Unused; kept so existing calls keep working.
    n_prev_games : int, default 10
        How many earlier games of the same player feed each statistic.

    Returns
    -------
    DataFrame indexed like `df` (sorted by index) with the columns
    'rolling_score_avg' (mean score of the previous games), 'rolling_win'
    (sum of `winner` over them) and 'rolling_win_rate' (that sum divided by
    the number of previous games). A player's first game has no history, so
    its average and rate are NaN and its 'rolling_win' is 0.

    NOTE(review): 'winner' is summed as-is; whether it marks a win for this
    row's player depends on how games.csv encodes it — confirm.
    """
    # Sort by player then time so each window walks the player's games in order.
    ordered = df.sort_values(by=["nickname", "created_at"])
    # The window spans the current game plus n_prev_games earlier ones; the
    # current game's own value is subtracted again below.
    window = n_prev_games + 1

    def _windowed(column):
        return ordered.groupby("nickname")[column].rolling(window=window, min_periods=1)

    def _realign(series):
        # Drop the 'nickname' level added by groupby to recover the row index.
        return series.reset_index(level=0, drop=True)

    prior_games = _realign(_windowed("score").count()) - 1
    prior_score = _realign(_windowed("score").sum()) - ordered["score"]
    prior_wins = _realign(_windowed("winner").sum()) - ordered["winner"]

    features = pd.DataFrame({
        "rolling_score_avg": prior_score / prior_games,
        "rolling_win": prior_wins,
        "rolling_win_rate": prior_wins / prior_games,
    })
    return features.sort_index()

# Testing Performance

Define scoring helpers to check whether adding a feature improves cross-validated performance.

In [None]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_validate, KFold, RepeatedKFold
def _mean_cv_rmse(model, X, y, cv, n_jobs=None):
    """Cross-validate `model` and return the mean train and validation RMSE."""
    scores = cross_validate(
        model, X, y, cv=cv, n_jobs=n_jobs,
        scoring='neg_root_mean_squared_error', return_train_score=True
    )
    # The scorer reports negated RMSE, so flip the sign back.
    return {"Training":-1 * np.mean(scores["train_score"]), "Validation":-1 * np.mean(scores["test_score"])}


def xgb_score(X, y, device='cuda', n_estimators=1000, cv=5):
    """Mean cross-validated RMSE of an XGBRegressor on (X, y).

    Parameters
    ----------
    X, y : feature matrix and target passed to cross_validate.
    device : XGBoost device string; pass 'cpu' when no GPU is available.
    n_estimators : number of boosting rounds.
    cv : cross-validation setting forwarded to cross_validate.

    Returns
    -------
    dict with keys "Training" and "Validation" holding the mean RMSE.
    """
    model = XGBRegressor(device=device, n_estimators=n_estimators, random_state=42)
    return _mean_cv_rmse(model, X, y, cv)


def lgbm_score(X, y, n_estimators=1000, cv=5):
    """Mean cross-validated RMSE of an LGBMRegressor on (X, y).

    Folds are evaluated in parallel (n_jobs=-1).

    Returns
    -------
    dict with keys "Training" and "Validation" holding the mean RMSE.
    """
    model = LGBMRegressor(n_estimators=n_estimators, verbose=-1, random_state=42)
    return _mean_cv_rmse(model, X, y, cv, n_jobs=-1)

In [None]:
cat_features = encoder.get_feature_names()

# Identifier, raw categorical and game-metadata columns that are not model inputs
non_feature_cols = [
    "game_id", "nickname", "first", "time_control_name", "winner",
    "created_at", 'increment_seconds', "game_duration_seconds",
    "bot_name", "rating_mode", "lexicon", "game_end_reason",
]

# Training matrix: df plus the rolling player features, minus the
# non-feature columns and the target.
player_history = cumm_player_features(df.copy(), cat_features)
X = df.join(player_history).drop(columns=non_feature_cols + ['rating'])
y = df["rating"]



In [None]:
# Test matrix: test_df plus rolling player features computed from test_df,
# minus identifier / raw categorical / metadata columns and `rating`.
test_history = cumm_player_features(test_df.copy(), cat_features)
test_non_feature_cols = [
    "game_id", "nickname", "first", "time_control_name", "winner",
    "created_at", 'increment_seconds', "game_duration_seconds",
    "bot_name", "rating_mode", "lexicon", "game_end_reason",
]
test_X = test_df.join(test_history).drop(columns=test_non_feature_cols + ['rating'])

In [None]:
test_X.head()

Unnamed: 0,score,total_score,avg_point,max_point,avg_move_len,max_move_len,difficult_words,difficult_letters,medium_letters,easy_letters,...,lexicon_NWL20,lexicon_ECWL,game_end_reason_STANDARD,game_end_reason_TIME,game_end_reason_RESIGNED,game_end_reason_CONSECUTIVE_ZEROES,win_or_not,rolling_score_avg,rolling_win,rolling_win_rate
0,488,488,34.857143,94,3.857143,7,5,2,13,38,...,0,0,1,0,0,0,1,393.8,7.0,0.7
1,379,379,29.153846,68,3.384615,7,1,2,12,29,...,0,0,1,0,0,0,1,398.9,3.0,0.3
2,462,462,33.0,66,3.928571,7,3,5,12,37,...,1,0,1,0,0,0,1,402.5,6.0,0.6
3,359,359,23.933333,98,3.266667,7,1,2,9,37,...,0,0,1,0,0,0,1,348.2,1.0,0.1
4,456,456,41.454545,90,4.0,7,1,3,9,31,...,0,0,1,0,0,0,1,456.1,3.0,0.3


In [None]:
# Inspect one player's rolling features. `nickname` has been dropped from X,
# so the rows are selected through df, which shares X's index.
X.loc[df["nickname"] == 'davidavid', ["rolling_score_avg", "rolling_win", "rolling_win_rate"]]

Unnamed: 0,game_id,nickname,cumm_avg_player_score,rolling_avg
41700,60229,davidavid,0.000000,
12052,17347,davidavid,337.000000,337.000000
46486,67116,davidavid,386.500000,386.500000
38994,56325,davidavid,394.666667,394.666667
14936,21508,davidavid,378.500000,378.500000
...,...,...,...,...
23246,33492,davidavid,345.666667,359.000000
26678,38494,davidavid,345.780000,354.800000
38206,55181,davidavid,345.840637,356.500000
3977,5744,davidavid,346.059524,365.500000


In [None]:
xgb_score(X, y)

{'Training': 50.77667262496884, 'Validation': 96.69235461376904}

In [None]:
lgbm_score(X, y)

{'Training': 70.81356985906552, 'Validation': 91.2938016220982}

In [None]:
def lgbm_param_space(trial):
    """Sample one LightGBM parameter set from an Optuna `trial`.

    The dictionary used to be built at module level, where no `trial` exists;
    wrapping it in a function lets the notebook run top to bottom and lets an
    Optuna objective call it once per trial.

    Parameters
    ----------
    trial : object exposing Optuna's `suggest_float` and `suggest_int`.

    Returns
    -------
    dict of LightGBM parameters (fixed objective/verbosity plus sampled values).
    """
    return {
        "objective": "regression",
        "verbosity": -1,
        # suggest_float(log=True) replaces the deprecated suggest_loguniform
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        # suggest_float replaces the deprecated suggest_uniform
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
    }

In [None]:
X.columns

Index(['score', 'total_score', 'avg_point', 'max_point', 'avg_move_len',
       'max_move_len', 'difficult_words', 'difficult_letters',
       'medium_letters', 'easy_letters', 'blank_used', 'bingo_cnt',
       'total_bonus', 'avg_bonus', 'bot_score', 'bot_rating',
       'initial_time_seconds', 'max_overtime_minutes', 'bot_name_BetterBot',
       'bot_name_STEEBot', 'bot_name_HastyBot', 'rating_mode_RATED',
       'rating_mode_CASUAL', 'lexicon_CSW21', 'lexicon_NWL20', 'lexicon_ECWL',
       'game_end_reason_STANDARD', 'game_end_reason_RESIGNED',
       'game_end_reason_TIME', 'game_end_reason_CONSECUTIVE_ZEROES',
       'win_or_not', 'rolling_score_avg', 'rolling_win', 'rolling_win_rate'],
      dtype='object')

In [None]:
# NOTE(review): presumably dropped because the test matrix has no NSWL20
# dummy column — confirm.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
X = X.drop(columns=['lexicon_NSWL20'], errors='ignore')

In [None]:
test_X.columns

Index(['score', 'total_score', 'avg_point', 'max_point', 'avg_move_len',
       'max_move_len', 'difficult_words', 'difficult_letters',
       'medium_letters', 'easy_letters', 'blank_used', 'bingo_cnt',
       'total_bonus', 'avg_bonus', 'bot_score', 'bot_rating',
       'initial_time_seconds', 'max_overtime_minutes', 'bot_name_BetterBot',
       'bot_name_STEEBot', 'bot_name_HastyBot', 'rating_mode_RATED',
       'rating_mode_CASUAL', 'lexicon_CSW21', 'lexicon_NWL20', 'lexicon_ECWL',
       'game_end_reason_STANDARD', 'game_end_reason_TIME',
       'game_end_reason_RESIGNED', 'game_end_reason_CONSECUTIVE_ZEROES',
       'win_or_not', 'rolling_score_avg', 'rolling_win', 'rolling_win_rate'],
      dtype='object')

In [None]:
test_X.shape

(22363, 34)

In [None]:
# Put the test columns in the same set and order as the training matrix.
# X.columns is used instead of the fitted booster's feature names because
# `xgb_model` is only created in a later cell and is therefore undefined
# here on a fresh top-to-bottom run. A missing column raises KeyError.
names = list(X.columns)
test_X = test_X[names]

In [None]:
# Final model. random_state is fixed (as in the scoring helpers) so the
# row/column subsampling (subsample, colsample_bytree) is reproducible.
xgb_model = XGBRegressor(
    device='cuda',  # delete this argument if you are using CPU
    colsample_bytree=0.7,
    learning_rate=0.08968078915104692,
    max_depth=4,
    n_estimators=150,
    reg_alpha=1,
    reg_lambda=1,
    subsample=0.9,
    random_state=42,
)
xgb_model.fit(X, y)
predictions = xgb_model.predict(test_X)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [None]:
# Build the submission from test_df. test_X was derived from test_df row by
# row, so each prediction lines up with its game_id by construction. This
# avoids re-reading test.csv and filtering bots with a different rule
# (nickname ending in "Bot") from the one used to build the features.
assert len(predictions) == len(test_df), "one prediction is expected per test_df row"
submission = pd.DataFrame({
    "game_id": test_df["game_id"].to_numpy(),
    "rating": predictions,
})
submission.to_csv("xgboost_submission.csv",index=False)