In [21]:
import json
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from pathlib import Path
from collections import defaultdict
from features.utils import build_mapping_to_ids

warnings.filterwarnings('ignore')

# Data

### Get all problems

In [2]:
with open('data/metadata.json', 'r') as file:
    metadata = json.load(file)

In [3]:
dataset_path = Path('data/codejam/')
files = list(dataset_path.rglob('*.java'))

In [4]:
usernames = set(it.stem for it in files)
username_to_id = build_mapping_to_ids(usernames)

In [5]:
dataset = pd.DataFrame({
    'path': files,
    'round_id': [int(it.parts[2]) for it in files],
    'problem_id': [int(it.parts[3]) for it in files],
    'user_id': [username_to_id[it.stem] for it in files]
})

In [2]:
dataset = pd.read_csv("data/java_test_dataset_code.csv")

In [7]:
dataset.head()

Unnamed: 0,repo_id,url,language,fork_count,stargazer_count,focal_class_identifier,focal_class_superclass,focal_class_interfaces,focal_class_fields,focal_class_methods,...,test_case_return,test_case_body,test_case_signature,test_case_full_signature,test_case_class_method_signature,test_case_testcase,test_case_constructor,test_case_invocations,focal_class_code,test_class_code
0,58314354,https://github.com/bytefish/JavaElasticSearchE...,Java,11,0,IgnoreMissingValuesConverter,,implements ITypeConverter<Float>,[{'original_string': 'private List<String> mis...,[{'identifier': 'IgnoreMissingValuesConverter'...,...,void,@Test\n public void returns_null_if_value_i...,void returns_null_if_value_is_missing(),@Test public void returns_null_if_value_is_mis...,IgnoreMissingValuesConverterTest.returns_null_...,True,False,"['assertEquals', 'convert', 'assertEquals', 'c...",// Copyright (c) Philipp Wagner. All rights re...,// Copyright (c) Philipp Wagner. All rights re...
1,58314354,https://github.com/bytefish/JavaElasticSearchE...,Java,11,0,DateUtilities,,,[],"[{'identifier': 'from', 'parameters': '(LocalD...",...,void,@Test\n public void generated_date_has_utc_...,void generated_date_has_utc_offset_when_none_i...,@Test public void generated_date_has_utc_offse...,DateUtilitiesTest.generated_date_has_utc_offse...,True,False,"['of', 'of', 'atTime', 'atOffset', 'from', 'as...",// Copyright (c) Philipp Wagner. All rights re...,// Copyright (c) Philipp Wagner. All rights re...
2,58314354,https://github.com/bytefish/JavaElasticSearchE...,Java,11,0,DateUtilities,,,[],"[{'identifier': 'from', 'parameters': '(LocalD...",...,void,@Test\n public void generated_date_has_give...,void generated_date_has_given_offset_when_offs...,@Test public void generated_date_has_given_off...,DateUtilitiesTest.generated_date_has_given_off...,True,False,"['of', 'of', 'ofHours', 'atTime', 'atOffset', ...",// Copyright (c) Philipp Wagner. All rights re...,// Copyright (c) Philipp Wagner. All rights re...
3,58314354,https://github.com/bytefish/JavaElasticSearchE...,Java,11,0,DateUtilities,,,[],"[{'identifier': 'from', 'parameters': '(LocalD...",...,void,@Test\n public void generated_date_has_give...,void generated_date_has_given_timezone_when_gi...,@Test public void generated_date_has_given_tim...,DateUtilitiesTest.generated_date_has_given_tim...,True,False,"['of', 'of', 'atTime', 'ofHours', 'atOffset', ...",// Copyright (c) Philipp Wagner. All rights re...,// Copyright (c) Philipp Wagner. All rights re...
4,58314354,https://github.com/bytefish/JavaElasticSearchE...,Java,11,0,LocalWeatherDataConverter,,,[],"[{'identifier': 'convert', 'parameters': '(csv...",...,void,@Test\n public void testConvert() throws Ex...,void testConvert(),@Test public void testConvert(),LocalWeatherDataConverterTest.testConvert(),True,False,"['setWban', 'setDate', 'of', 'setTime', 'of', ...",// Copyright (c) Philipp Wagner. All rights re...,// Copyright (c) Philipp Wagner. All rights re...


# Build dataset

In [10]:
from features import *
from sklearn.feature_selection import mutual_info_regression

In [None]:
codes = dataset['focal_class_code'].values  # Assuming 'code_column' is the name of your column with the code
samples = calculate_features_for_files(codes)

In [13]:
# samples = calculate_features_for_files(dataset.path.values)

In [14]:
X = build_dataset(samples)
y = dataset.repo_id.values

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

Number of samples: 900
Number of features: 5265


### Select the best 1500 features according to mutual information

In [None]:
mi = mutual_info_regression(np.nan_to_num(X), y, random_state=0)
mi /= np.max(mi)

In [None]:
mi_indices = np.argsort(mi)
features_indices = mi_indices[-1500:]
features = X.columns[features_indices].values
X = X[features]

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

### Select top 1500 popular features

In [15]:
nan_count = X.isna().sum(axis=0)
indices = np.argsort(nan_count.values)
features = nan_count[indices][:1500].index
X = X[features]

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

Number of samples: 900
Number of features: 1500


In [16]:
X.head()

Unnamed: 0,whiteSpaceRatio,ASTNodeBigramsTF_StatementExpression_MethodInvocation,ln(numFunctions/length),ln(numKeywords/length),ln(numLiterals/length),ln(numSpaces/length),ASTNodeBigramsTF_MethodDeclaration_FormalParameter,ln(numTabs/length),ln(numTernary/length),ln(numTokens/length),...,WordUnigramTF_build,WordUnigramTF_wanted,WordUnigramTF_prln,WordUnigramTF_maxX,WordUnigramTF_LOOP,WordUnigramTF_probability,WordUnigramTF_iterate,WordUnigramTF_order,WordUnigramTF_maxY,WordUnigramTF_ys
0,0.386897,0.025157,-7.022868,-3.655572,-4.314818,0.080214,0.006289,0.154189,-inf,-2.490269,...,,,,,,,,,,
1,0.367949,0.02649,-6.279459,-3.50687,-4.139393,0.114339,0.013245,0.113402,-6.972606,-2.603158,...,,,,,,,,,,
2,0.491986,0.01087,-7.018402,-3.463054,-3.552666,0.11131,0.003623,0.163832,-inf,-2.242083,...,,,,,,,,,,
3,0.422472,0.02439,-7.836765,-4.075565,-4.946393,0.07109,0.003484,0.17654,-inf,-2.736898,...,,,,,,,,,,
4,0.390135,0.002506,-7.34601,-3.338677,-3.357026,0.120645,0.002506,0.118065,-7.34601,-2.296154,...,,,,,,,,,,


# Classification

In [17]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

In [19]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

for index, (train_index, valid_index) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    model = CatBoostClassifier(
        iterations=500, 
        learning_rate=0.2,
        rsm=0.01,
        depth=3,
        bootstrap_type='Bernoulli',
        subsample=0.7,
        loss_function='MultiClass'
    )
    
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), plot=False, verbose=False)
    
    y_pred = model.predict(X_train).squeeze()
    train_acc = np.average(y_train == y_pred)

    y_pred = model.predict(X_valid).squeeze()
    valid_acc = np.average(y_valid == y_pred)

    print(f'Validation #{index + 1}')
    print(f'Train accuracy: {train_acc:.2f}')
    print(f'Valid accuracy: {valid_acc:.2f}\n')

Validation #1
Train accuracy: 1.00
Valid accuracy: 0.93

Validation #2
Train accuracy: 1.00
Valid accuracy: 0.93

Validation #3
Train accuracy: 1.00
Valid accuracy: 0.97

