In [1]:
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
val = pd.read_csv('codes/sample_train.csv')
test = pd.read_csv('codes/test.csv')

In [3]:
class BaselineModel():
    def __init__(self, threshold=0.5):
        super(BaselineModel, self).__init__()
        self.threshold = threshold
        self.vocabulary = set()

    def get_vectorizer(self):
        return CountVectorizer(vocabulary=list(self.vocabulary))

    def fit(self, code):
        temp_vectorizer = CountVectorizer()
        temp_vectorizer.fit(code)
        self.vocabulary.update(temp_vectorizer.get_feature_names_out())
        self.vectorizer = self.get_vectorizer()

    def predict_proba(self, code1, code2):
        code1_vecs = self.vectorizer.transform(code1)
        code2_vecs = self.vectorizer.transform(code2)

        preds = []
        for code1_vec,code2_vec in tqdm(zip(code1_vecs,code2_vecs)):
            preds.append(cosine_similarity(code1_vec, code2_vec))
        preds = np.reshape(preds, len(preds))
        print('Done')

        return preds

    def predict(self, code1, code2):
        preds = self.predict_proba(code1, code2)
        preds = np.where(preds>self.threshold, 1, 0)

        return preds

In [4]:
train_code_paths = glob.glob('codes/train_code/*/*.cpp')

In [5]:
def read_cpp_code(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [6]:
model = BaselineModel(threshold=0.5)

In [7]:
for path in tqdm(train_code_paths):
    code = read_cpp_code(path)
    model.fit([code])

100%|█████████████████████████████████████████████████████████████████████████| 250000/250000 [35:01<00:00, 118.98it/s]


In [8]:
len(model.vocabulary)

113727

In [9]:
def get_accuracy(gt, preds):
    return (gt==preds).mean()

In [10]:
val_preds = model.predict(val['code1'], val['code2'])

20000it [00:16, 1202.83it/s]

Done





In [11]:
print(get_accuracy(val['similar'].values, val_preds))

0.60125


In [12]:
preds = model.predict(test['code1'], test['code2'])

595000it [08:37, 1150.31it/s]


Done
