In [1]:
import json
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm
from pathlib import Path
from collections import defaultdict
from features.utils import build_mapping_to_ids

warnings.filterwarnings('ignore')

# Data

### Get all problems

In [2]:
with open('data/metadata.json', 'r') as file:
    metadata = json.load(file)

In [3]:
dataset_path = Path('data/codejam/')
files = list(dataset_path.rglob('*.java'))

In [4]:
usernames = set(it.stem for it in files)
username_to_id = build_mapping_to_ids(usernames)

In [5]:
dataset = pd.DataFrame({
    'path': files,
    'round_id': [int(it.parts[2]) for it in files],
    'problem_id': [int(it.parts[3]) for it in files],
    'user_id': [username_to_id[it.stem] for it in files]
})

In [6]:
dataset.head()

Unnamed: 0,path,round_id,problem_id,user_id
0,data/codejam/639102/760487/Joshik.java,639102,760487,806
1,data/codejam/639102/760487/pashka.java,639102,760487,4013
2,data/codejam/639102/760487/Vegetable.java,639102,760487,1740
3,data/codejam/639102/760487/mystic.java,639102,760487,3834
4,data/codejam/639102/760487/vvn.java,639102,760487,4935


### Select 100 random users who has at least 9 files

In [7]:
N_FILES = 9
N_USERS = 100

In [8]:
random.seed(0)
np.random.seed(0)

In [9]:
count = dataset.groupby('user_id').problem_id.count()
users = count[count >= N_FILES].index
users = np.random.choice(users, N_USERS, replace=False)

In [10]:
parts = [dataset[dataset.user_id == user].sample(n=N_FILES, replace=False) for user in users]
dataset = pd.concat(parts).reset_index(drop=True)

# Create new user ids
user_id_to_new_id = build_mapping_to_ids(dataset.user_id)
dataset.user_id = dataset.user_id.apply(lambda it: user_id_to_new_id[it])

In [11]:
dataset.head()

Unnamed: 0,path,round_id,problem_id,user_id
0,data/codejam/32013/24480/Sputnik.java,32013,24480,29
1,data/codejam/32001/24440/Sputnik.java,32001,24440,29
2,data/codejam/635102/694485/Sputnik.java,635102,694485,29
3,data/codejam/635102/706485/Sputnik.java,635102,706485,29
4,data/codejam/635102/698485/Sputnik.java,635102,698485,29


# Build dataset

In [12]:
from features import *
from sklearn.feature_selection import mutual_info_regression

In [13]:
samples = calculate_features_for_files(dataset.path.values)

In [14]:
X = build_dataset(samples)
y = dataset.user_id.values

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

Number of samples: 900
Number of features: 5265


### Select the best 1500 features according to mutual information

In [None]:
mi = mutual_info_regression(np.nan_to_num(X), y, random_state=0)
mi /= np.max(mi)

In [None]:
mi_indices = np.argsort(mi)
features_indices = mi_indices[-1500:]
features = X.columns[features_indices].values
X = X[features]

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

### Select top 1500 popular features

In [15]:
nan_count = X.isna().sum(axis=0)
indices = np.argsort(nan_count.values)
features = nan_count[indices][:1500].index
X = X[features]

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

Number of samples: 900
Number of features: 1500


In [16]:
X.head()

Unnamed: 0,whiteSpaceRatio,ASTNodeBigramsTF_StatementExpression_MethodInvocation,ln(numFunctions/length),ln(numKeywords/length),ln(numLiterals/length),ln(numSpaces/length),ASTNodeBigramsTF_MethodDeclaration_FormalParameter,ln(numTabs/length),ln(numTernary/length),ln(numTokens/length),...,WordUnigramTF_build,WordUnigramTF_wanted,WordUnigramTF_prln,WordUnigramTF_maxX,WordUnigramTF_LOOP,WordUnigramTF_probability,WordUnigramTF_iterate,WordUnigramTF_order,WordUnigramTF_maxY,WordUnigramTF_ys
0,0.386897,0.025157,-7.022868,-3.655572,-4.314818,0.080214,0.006289,0.154189,-inf,-2.490269,...,,,,,,,,,,
1,0.367949,0.02649,-6.279459,-3.50687,-4.139393,0.114339,0.013245,0.113402,-6.972606,-2.603158,...,,,,,,,,,,
2,0.491986,0.01087,-7.018402,-3.463054,-3.552666,0.11131,0.003623,0.163832,-inf,-2.242083,...,,,,,,,,,,
3,0.422472,0.02439,-7.836765,-4.075565,-4.946393,0.07109,0.003484,0.17654,-inf,-2.736898,...,,,,,,,,,,
4,0.390135,0.002506,-7.34601,-3.338677,-3.357026,0.120645,0.002506,0.118065,-7.34601,-2.296154,...,,,,,,,,,,


# Classification

In [17]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

In [19]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

for index, (train_index, valid_index) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    model = CatBoostClassifier(
        iterations=500, 
        learning_rate=0.2,
        rsm=0.01,
        depth=3,
        bootstrap_type='Bernoulli',
        subsample=0.7,
        loss_function='MultiClass'
    )
    
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), plot=False, verbose=False)
    
    y_pred = model.predict(X_train).squeeze()
    train_acc = np.average(y_train == y_pred)

    y_pred = model.predict(X_valid).squeeze()
    valid_acc = np.average(y_valid == y_pred)

    print(f'Validation #{index + 1}')
    print(f'Train accuracy: {train_acc:.2f}')
    print(f'Valid accuracy: {valid_acc:.2f}\n')

Validation #1
Train accuracy: 1.00
Valid accuracy: 0.93

Validation #2
Train accuracy: 1.00
Valid accuracy: 0.93

Validation #3
Train accuracy: 1.00
Valid accuracy: 0.97

