In [None]:
# import pandas as pd

# # Path to your original CSV file
# input_csv_path = 'data/java_test_dataset_code.csv'

# # Path where you want to save the new CSV file
# output_csv_path = 'data/java_2000_dataset_code.csv'

# # Specify the number of rows to skip if you want to start from a specific row other than the first
# rows_to_skip = 0  # Change this as needed

# # Specify the chunk size
# chunk_size = 2000

# # Use nrows to specify the number of rows to read starting from rows_to_skip
# df_chunk = pd.read_csv(input_csv_path, skiprows=range(1, rows_to_skip + 1), nrows=chunk_size)

# # Save the chunk to a new CSV file
# df_chunk.to_csv(output_csv_path, index=False)

# print(f'Chunk of {chunk_size} rows saved to {output_csv_path}.')

In [1]:
import json
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from pathlib import Path
from collections import defaultdict
from features.utils import build_mapping_to_ids

warnings.filterwarnings('ignore')

# Data

### Get all problems

In [2]:
dataset = pd.read_csv("data/java_2000_dataset_code.csv")

In [3]:
# Drop rows where null values exist in 'focal_class_code' or 'test_class_code'
dataset.dropna(subset=['focal_class_code', 'test_class_code'], inplace=True)

In [4]:
dataset.head()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1929 entries, 0 to 1999
Data columns (total 40 columns):
repo_id                                1929 non-null int64
url                                    1929 non-null object
language                               1569 non-null object
fork_count                             1929 non-null int64
stargazer_count                        1929 non-null int64
focal_class_identifier                 1929 non-null object
focal_class_superclass                 633 non-null object
focal_class_interfaces                 885 non-null object
focal_class_fields                     1929 non-null object
focal_class_methods                    1929 non-null object
focal_class_file                       1929 non-null object
focal_method_identifier                1929 non-null object
focal_method_parameters                1929 non-null object
focal_method_modifiers                 1841 non-null object
focal_method_return                    1900 non-null objec

In [5]:
dataset.isnull().sum()

repo_id                                   0
url                                       0
language                                360
fork_count                                0
stargazer_count                           0
focal_class_identifier                    0
focal_class_superclass                 1296
focal_class_interfaces                 1044
focal_class_fields                        0
focal_class_methods                       0
focal_class_file                          0
focal_method_identifier                   0
focal_method_parameters                   0
focal_method_modifiers                   88
focal_method_return                      29
focal_method_body                         0
focal_method_signature                    0
focal_method_full_signature               0
focal_method_class_method_signature       0
focal_method_testcase                     0
focal_method_constructor                  0
focal_method_invocations                  0
test_class_identifier           

# Build dataset

In [47]:
from features import *
from sklearn.feature_selection import mutual_info_regression

In [53]:
# codes = dataset['focal_class_code'].values  # Assuming 'code_column' is the name of your column with the code

codes_with_ids = [{'repo_id': row['repo_id'], 'code': row['focal_class_code']} for index, row in dataset.iterrows()]

samples = calculate_features_for_files(codes_with_ids)

In [None]:
samples

### Minor EDA for samples

In [55]:
xdf = pd.DataFrame(samples) 

In [56]:
xdf.shape

(1929, 14290)

In [58]:
column_name = 'repo_id'
# Pop the column out of the DataFrame
desired_column = xdf.pop(column_name)
# Reinsert it at the beginning of the DataFrame
xdf.insert(0, column_name, desired_column)

In [59]:
xdf.head()

Unnamed: 0,repo_id,WordUnigramTF_csv,WordUnigramTF_converter,WordUnigramTF_de,WordUnigramTF_bytefish,WordUnigramTF_jtinycsvparser,WordUnigramTF_typeconverter,WordUnigramTF_ITypeConverter,WordUnigramTF_utils,WordUnigramTF_StringUtils,...,WordUnigramTF_getOnlyThreadPoolUtilization,WordUnigramTF_sampleCommandUtilization,WordUnigramTF_sampleThreadPoolUtilization,WordUnigramTF_commandUtilizationPerKey,WordUnigramTF_threadPoolUtilizationPerKey,WordUnigramTF_hystrixUtilization,WordUnigramTF_getCommandUtilizationMap,WordUnigramTF_getThreadPoolUtilizationMap,WordUnigramTF_doThrow,WordUnigramTF_RollingCollapserBatchSizeDistributionStream
0,58314354.0,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.032787,0.016393,0.032787,...,,,,,,,,,,
1,58314354.0,,,,,,,,0.016129,,...,,,,,,,,,,
2,58314354.0,,,,,,,,0.016129,,...,,,,,,,,,,
3,58314354.0,,,,,,,,0.016129,,...,,,,,,,,,,
4,58314354.0,0.029703,0.009901,,,,,,0.009901,,...,,,,,,,,,,


In [46]:
columns_to_check = [
    "WordUnigramTF",
    "In(numkeywords/length)",
    "In(numTernary/length)",
    "In(numTokens/length)",
    "In(numComments/length)",
    "In(numLiterals/length)",
    "In(numKeywords/length)",
    "In(numFunctions/length)",
    "In(numMacros/length)",
    "nestingDepth",
    "branchingFactor",
    "avgParams",
    "stdDevNumParams",
    "avgLineLength",
    "stdDevLineLength",
    "In(numTabs/length)",
    "In(numSpaces/length)",
    "In(numEmptyLines/length)",
    "whiteSpaceRatio",
    "newLineBeforeOpenBrace",
    "tabsLeadLines",
    "MaxDepthASTNode",
    "ASTNodeBigramsTF",
    "ASTNodeTypesTF",
    "ASTNodeTypesTFIDF",
    "ASTNodeTypeAvgDep",
    "cppKeywords",
    "CodeInASTLeavesTF",
    "CodeInASTLeavesTFIDF",
    "CodeInASTLeavesAvgDep"
]

# Function to clean column names
def clean_column_name(name):
    return name.replace(" ", "").replace("In", "ln").lower()

# Clean DataFrame column names
xdf.columns = [clean_column_name(name) for name in xdf.columns]

# Check each column
for col in columns_to_check:
    cleaned_col = clean_column_name(col)
    if cleaned_col in xdf.columns:
        print(f"Column '{col}' exists in the DataFrame.")
    else:
        print(f"Column '{col}' does NOT exist in the DataFrame.")

Column 'WordUnigramTF' does NOT exist in the DataFrame.
Column 'In(numkeywords/length)' exists in the DataFrame.
Column 'In(numTernary/length)' exists in the DataFrame.
Column 'In(numTokens/length)' exists in the DataFrame.
Column 'In(numComments/length)' does NOT exist in the DataFrame.
Column 'In(numLiterals/length)' exists in the DataFrame.
Column 'In(numKeywords/length)' exists in the DataFrame.
Column 'In(numFunctions/length)' exists in the DataFrame.
Column 'In(numMacros/length)' does NOT exist in the DataFrame.
Column 'nestingDepth' does NOT exist in the DataFrame.
Column 'branchingFactor' does NOT exist in the DataFrame.
Column 'avgParams' exists in the DataFrame.
Column 'stdDevNumParams' exists in the DataFrame.
Column 'avgLineLength' exists in the DataFrame.
Column 'stdDevLineLength' exists in the DataFrame.
Column 'In(numTabs/length)' exists in the DataFrame.
Column 'In(numSpaces/length)' exists in the DataFrame.
Column 'In(numEmptyLines/length)' exists in the DataFrame.
Col

In [44]:
# Extract all column names
# Assuming df is your DataFrame
column_names = xdf.columns.tolist()

# Specify the file path where you want to save the column names
file_path = 'column_names.txt'

# Write column names to the text file
with open(file_path, 'w') as f:
    for column_name in column_names:
        f.write(column_name + '\n')

print("Column names have been saved to", file_path)

Column names have been saved to column_names.txt


### Samples dictionary cleaning
Replacing all "General error" messages with np.nan

In [30]:
from typing import List, Dict

def preprocess_samples(samples: List[Dict]) -> List[Dict]:
    for sample in samples:
        if 'error' in sample:
            # Option 1: Remove the sample entirely
            # samples.remove(sample)
            
            # Option 2: Replace error entries with NaN or another placeholder
            for key in sample:
                sample[key] = np.nan  # or a specific placeholder value
    return samples

In [None]:
# Preprocess the samples to handle errors
samples = preprocess_samples(samples)

In [43]:
X = build_dataset(samples)
y = dataset.repo_id.values

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

Number of samples: 1929
Number of features: 14289


### Select the best 1500 features according to mutual information

In [32]:
mi = mutual_info_regression(np.nan_to_num(X), y, random_state=0)
mi /= np.max(mi)

- 5:26 min for a 2000 rows file
- Estimation: 80,000 rows -> 210 min / 625,000 rows -> 1,640 min

In [33]:
mi_indices = np.argsort(mi)
features_indices = mi_indices[-1500:]
features = X.columns[features_indices].values
X = X[features]

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

Number of samples: 1929
Number of features: 1500


### Select top 1500 popular features

In [34]:
nan_count = X.isna().sum(axis=0)
indices = np.argsort(nan_count.values)
features = nan_count[indices][:1500].index
X = X[features]

print(f'Number of samples: {X.shape[0]}')
print(f'Number of features: {X.shape[1]}')

Number of samples: 1929
Number of features: 1500


In [35]:
X.head()

Unnamed: 0,whiteSpaceRatio,stdDevNumParams,avgParams,ASTNodeTypesTF_CompilationUnit,ASTNodeTypesTF_ClassDeclaration,javaKeywords_class,ASTNodeBigramsTF_CompilationUnit_ClassDeclaration,ASTNodeTypesTF_MethodDeclaration,javaKeywords_public,MaxDepthASTNode,...,WordUnigramTF_isWritable,WordUnigramTF_IFirewallService,WordUnigramTF_latAll,WordUnigramTF_ChannelOption,WordUnigramTF_updateAssign,WordUnigramTF_lookup3,WordUnigramTF_asLong,WordUnigramTF_getClientID,WordUnigramTF_upperOk,WordUnigramTF_getDryBulbCelsius
0,0.290123,0.5,0.5,0.014925,0.014925,0.08,0.015152,0.029851,0.2,10.0,...,,,,,,,,,,
1,0.211688,0.471405,2.333333,0.019608,0.019608,0.055556,0.02,0.058824,0.222222,7.0,...,,,,,,,,,,
2,0.211688,0.471405,2.333333,0.019608,0.019608,0.055556,0.02,0.058824,0.222222,7.0,...,,,,,,,,,,
3,0.211688,0.471405,2.333333,0.019608,0.019608,0.055556,0.02,0.058824,0.222222,7.0,...,,,,,,,,,,
4,0.181017,0.5,1.5,0.01,0.01,0.066667,0.010101,0.02,0.2,9.0,...,,,,,,,,,,0.009901


In [38]:
X.isnull().all(axis=1).sum() # Rows with all columns null

15

# Classification

In [39]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

In [41]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

for index, (train_index, valid_index) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    
    y_train = y_train.astype(str)
    y_valid = y_valid.astype(str)

    model = CatBoostClassifier(
        iterations=500, 
        learning_rate=0.2,
        rsm=0.01,
        depth=3,
        bootstrap_type='Bernoulli',
        subsample=0.7,
        loss_function='MultiClass'
    )
    
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), plot=False, verbose=False)
    
    y_pred = model.predict(X_train).squeeze()
    train_acc = np.average(y_train == y_pred)

    y_pred = model.predict(X_valid).squeeze()
    valid_acc = np.average(y_valid == y_pred)

    print(f'Validation #{index + 1}')
    print(f'Train accuracy: {train_acc:.2f}')
    print(f'Valid accuracy: {valid_acc:.2f}\n')

Validation #1
Train accuracy: 0.00
Valid accuracy: 0.00

Validation #2
Train accuracy: 0.00
Valid accuracy: 0.00

Validation #3
Train accuracy: 0.00
Valid accuracy: 0.00

