# 概要

- データ読み込み
- ノイズ除去

---

- embedding : 単語 リスト作成
- index作成 {var: \[repo, link, star, fork, \] }

In [110]:
def read_var_data(path):
    var_data = open(path, 'r').readlines()
    var_data = [line.replace('\n', '') for line in var_data]
    url, repo_name,  star, fork = var_data[0].split(',')
    var_list = var_data[1:-1]
    var_set = set(var_list)
    return {
        'url': url,
        'repo_name': repo_name,
        'star': star,
        'fork': fork,
        'var_set': var_set
    }

In [13]:
var_data = open('variables.data', 'r').readlines()
var_data = [line.replace('\n', '') for line in var_data]

In [109]:
url, repo_name,  star, fork = var_data[0].split(',')
url, repo_name,  star, fork

('https://github.com/python/cpython', ' cpython', ' 26274', ' 11291')

In [19]:
var_list = var_data[1:-1]

In [21]:
len(var_list)

141442

In [22]:
val_set = set(var_list)

In [132]:
len(val_set)

36381

In [111]:
var_data = read_var_data('../data/repos/variables.data')

# ノイズ除去
- 英字, _以外のモノが含まれていたら除外
- 大文字があったら除外

In [200]:
import re
def is_var(text):
    # pattern = '[^a-zA-Z_]'
    pattern = '[^a-z_]+'
    search_result = re.search(pattern, text)
    return not search_result

In [201]:
var_set = [var for var in val_set if is_var(var)]

In [202]:
len(var_set)

16621

In [204]:
# var_set

# 埋め込みリスト生成

In [68]:
import fasttext
model = fasttext.load_model('../model/wiki-news-300d-1M-subword.bin')




In [181]:
import numpy as np
def get_word_list_vector(word_list, model):
    # 文ベクトルの取得
    assert type(word_list) == list
    return np.sum([model.get_word_vector(word) for word in word_list], axis=0)

In [90]:
import re

In [108]:
def parse_var(var):
    # もしa, A混在していたら大文字でsplit
    if re.search('[A-Z]', var) and re.search('[a-z]', var):
        p_list = re.findall('[A-Z][^A-Z]*', var) 
    else:
        p_list = [var]
    
    p_list = [v.split('_') for v in p_list]
        
    # lower
    _p_list = []
    for v_list in p_list:
        _p_list += [v.lower() for v in v_list]
    return _p_list

In [113]:
def parse_var_emb_list(var_set):
    var_parsed_list = [parse_var(var) for var in var_set]
    var_pw_list = list(zip(var_parsed_list, var_set))
    var_pw_emb_list = []
    for pv, v in var_pw_list:
        var_pw_emb_list.append({
            'vector': get_word_list_vector(pv, model),
            'var': v,
            'parsed_var_list': pv,
        })
    return var_pw_emb_list

In [None]:
# 保存
import pickle
with open('../data/parsed_var_list.pickle', mode='wb') as f:
    pickle.dump(var_pw_emb_list , f)

# まとめてやる

In [116]:
import glob

In [206]:
%%time
# 埋め込み辞書作成
# TODO: repoを横断して集計
var_pw_emb_list = parse_var_emb_list(var_set)

CPU times: user 425 ms, sys: 17.8 ms, total: 443 ms
Wall time: 444 ms


In [207]:
_dir = './repos/*'
var_path_list = glob.glob(_dir)

In [209]:
def read_parse_var_info_data(_dir):
    # ディレクトリ内を網羅でやる
    var_path_list = glob.glob(_dir)
    
    # embedding 類似度検索用
    all_var_pw_emb_list = []
    # 単語情報取得用
    word_repo_dict = {}
    for path in var_path_list:
        var_data = read_var_data(path)
        var_set = [var for var in var_data['var_set'] if is_var(var)]
        all_var_pw_emb_list += parse_var_emb_list(var_set)
        for var in var_set:
            word_repo_dict[var] = {k: v for k,v in var_data.items() if k != 'var_set'}
            
    return all_var_pw_emb_list, word_repo_dict

In [210]:
all_var_pw_emb_list, word_repo_dict = read_parse_var_info_data('./repos/*')

In [226]:
# list(word_repo_dict.items())[:10]

In [227]:
# 保存
import pickle
save = lambda obj, path: pickle.dump(obj, open(path, mode='wb'))
save(all_var_pw_emb_list, '../data/all_var_pw_emb_list.pickle')
save(word_repo_dict, '../data/word_repo_dict.pickle')

# 検索

In [212]:
import numpy as np

def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def search_by_query(query, model):
    query_vector = get_word_list_vector(query, model)
    for var_dict in all_var_pw_emb_list:
        var_dict['cos_sim'] = cos_sim(query_vector, var_dict['vector'])
    sorted_sim_var_list = sorted(all_var_pw_emb_list, key=lambda x:  x['cos_sim'], reverse=True)
    return sorted_sim_var_list

In [216]:
# query = 'num of product'.split()
# query_vector = get_word_list_vector(query, model)
# %%time
# for var_dict in all_var_pw_emb_list:
#     var_dict['cos_sim'] = cos_sim(query_vector, var_dict['vector'])
# sorted_sim_var_list = sorted(all_var_pw_emb_list, key=lambda x:  x['cos_sim'], reverse=True)

In [218]:
import pandas as pd

In [225]:
query =  'componen'.split()
sorted_sim_var_list = search_by_query(query, model)
pd.DataFrame(sorted_sim_var_list)

  after removing the cwd from sys.path.


Unnamed: 0,cos_sim,parsed_var_list,var,vector
0,,[],,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1.000000,[product],product,"[-0.0028900737, 0.0009795268, 0.0039173015, 0...."
2,0.771931,"[product, name]",product_name,"[0.0056655696, -0.015280259, -0.007267527, 0.0..."
3,0.667101,"[actual, output, content]",actual_output_content,"[0.03152828, 0.008219921, 0.009493078, 0.03468..."
4,0.665752,"[test, package, name]",test_package_name,"[-0.006639906, -0.014026473, 0.019891784, 0.06..."
5,0.655208,"[imports, group]",imports_group,"[0.027722739, 0.015731692, 0.02038173, -0.0034..."
6,0.651239,"[current, package]",current_package,"[-0.0072340174, 0.008042887, 0.007218374, 0.02..."
7,0.648472,"[current, process]",current_process,"[-0.028167255, -0.0065146815, 0.0126821585, 0...."
8,0.648472,"[, current, process]",_current_process,"[-0.028167255, -0.0065146815, 0.0126821585, 0...."
9,0.647383,"[test, package, over, module]",test_package_over_module,"[-0.017164292, -0.02351828, 0.020381952, 0.016..."
