# 概要

- データ読み込み
- ノイズ除去

---

- embedding : 単語 リスト作成
- index作成 {var: \[repo, link, star, fork, \] }

In [241]:
def read_var_data(path):
    var_data = open(path, 'r').readlines()
    var_data = [line.replace('\n', '') for line in var_data]
    url, repo_name,  star, fork = var_data[0].split(',')
    var_list = var_data[1:-1]
    var_set = set(var_list)
    return {
        'url': url,
        'repo_name': repo_name,
        'star': star,
        'fork': fork,
        'var_set': var_set
    }

In [243]:
# var_data = open('variables.data', 'r').readlines()
# var_data = [line.replace('\n', '') for line in var_data]

In [109]:
url, repo_name,  star, fork = var_data[0].split(',')
url, repo_name,  star, fork

('https://github.com/python/cpython', ' cpython', ' 26274', ' 11291')

In [19]:
var_list = var_data[1:-1]

In [21]:
len(var_list)

141442

In [22]:
val_set = set(var_list)

In [132]:
len(val_set)

36381

In [111]:
var_data = read_var_data('../data/repos/variables.data')

# ノイズ除去
- 英字, _以外のモノが含まれていたら除外
- 大文字があったら除外

In [245]:
import re
def is_var(text):
    # pattern = '[^a-zA-Z_]'
    pattern = '[^a-z_]+'
    search_result = re.search(pattern, text)
    return not search_result and text != ''

In [201]:
var_set = [var for var in val_set if is_var(var)]

In [202]:
len(var_set)

16621

In [204]:
# var_set

# 埋め込みリスト生成

In [68]:
import fasttext
model = fasttext.load_model('../model/wiki-news-300d-1M-subword.bin')




In [181]:
import numpy as np
def get_word_list_vector(word_list, model):
    # 文ベクトルの取得
    assert type(word_list) == list
    return np.sum([model.get_word_vector(word) for word in word_list], axis=0)

In [90]:
import re

In [108]:
def parse_var(var):
    # もしa, A混在していたら大文字でsplit
    if re.search('[A-Z]', var) and re.search('[a-z]', var):
        p_list = re.findall('[A-Z][^A-Z]*', var) 
    else:
        p_list = [var]
    
    p_list = [v.split('_') for v in p_list]
        
    # lower
    _p_list = []
    for v_list in p_list:
        _p_list += [v.lower() for v in v_list]
    return _p_list

In [113]:
def parse_var_emb_list(var_set):
    var_parsed_list = [parse_var(var) for var in var_set]
    var_pw_list = list(zip(var_parsed_list, var_set))
    var_pw_emb_list = []
    for pv, v in var_pw_list:
        var_pw_emb_list.append({
            'vector': get_word_list_vector(pv, model),
            'var': v,
            'parsed_var_list': pv,
        })
    return var_pw_emb_list

In [None]:
# 保存
import pickle
with open('../data/parsed_var_list.pickle', mode='wb') as f:
    pickle.dump(var_pw_emb_list , f)

# まとめてやる

In [248]:
import glob

In [206]:
%%time
# 埋め込み辞書作成
# TODO: repoを横断して集計
var_pw_emb_list = parse_var_emb_list(var_set)

CPU times: user 425 ms, sys: 17.8 ms, total: 443 ms
Wall time: 444 ms


In [250]:
_dir = './repos/*'
var_path_list = glob.glob(_dir)
var_path_list

['./repos/ZeroNet.data',
 './repos/fastText.data',
 './repos/python-fire.data',
 './repos/fairseq.data',
 './repos/cpython.data',
 './repos/typeshed.data',
 './repos/mypy.data',
 './repos/peps.data',
 './repos/pythondotorg.data',
 './repos/Real-Time-Voice-Cloning.data']

In [251]:
def read_parse_var_info_data(_dir):
    # ディレクトリ内を網羅でやる
    var_path_list = glob.glob(_dir)
    
    # embedding 類似度検索用
    all_var_pw_emb_list = []
    # 単語情報取得用
    word_repo_dict = {}
    for path in var_path_list:
        var_data = read_var_data(path)
        var_set = [var for var in var_data['var_set'] if is_var(var)]
        all_var_pw_emb_list += parse_var_emb_list(var_set)
        for var in var_set:
            word_repo_dict[var] = {k: v for k,v in var_data.items() if k != 'var_set'}
            
    return all_var_pw_emb_list, word_repo_dict

In [252]:
all_var_pw_emb_list, word_repo_dict = read_parse_var_info_data('./repos/*')

In [253]:
list(word_repo_dict.items())[:10]

[('relative_path_old',
  {'url': 'https://github.com/HelloZeroNet/ZeroNet',
   'repo_name': 'HelloZeroNet/ZeroNet',
   'star': '14491',
   'fork': '1865'}),
 ('andshake',
  {'url': 'https://github.com/HelloZeroNet/ZeroNet',
   'repo_name': 'HelloZeroNet/ZeroNet',
   'star': '14491',
   'fork': '1865'}),
 ('four',
  {'url': 'https://github.com/python/cpython',
   'repo_name': 'python/cpython',
   'star': '26274',
   'fork': '11291'}),
 ('ppend',
  {'url': 'https://github.com/python/cpython',
   'repo_name': 'python/cpython',
   'star': '26274',
   'fork': '11291'}),
 ('values',
  {'url': 'https://github.com/CorentinJ/Real-Time-Voice-Cloning',
   'repo_name': 'CorentinJ/Real-Time-Voice-Cloning',
   'star': '5390',
   'fork': '617'}),
 ('uff_pos',
  {'url': 'https://github.com/HelloZeroNet/ZeroNet',
   'repo_name': 'HelloZeroNet/ZeroNet',
   'star': '14491',
   'fork': '1865'}),
 ('pool',
  {'url': 'https://github.com/python/mypy',
   'repo_name': 'python/mypy',
   'star': '6622',
   'for

In [254]:
# 保存
import pickle
save = lambda obj, path: pickle.dump(obj, open(path, mode='wb'))
save(all_var_pw_emb_list, '../data/all_var_pw_emb_list.pickle')
save(word_repo_dict, '../data/word_repo_dict.pickle')

In [255]:
all_var_pw_emb_list[:5]

[{'vector': array([-0.00083656, -0.00046498, -0.01354802,  0.0091898 , -0.0760837 ,
         -0.01978865, -0.00455011, -0.18995216,  0.01090875, -0.07132135,
         -0.04185575, -0.09107251, -0.04749709,  0.01476773, -0.04089591,
          0.00729425,  0.17280568, -0.04284291,  0.16238046, -0.03819598,
          0.03815838,  0.03456533, -0.07557213,  0.09801693, -0.00764228,
          0.04304401,  0.01669951,  0.03586654,  0.11235207,  0.02254212,
         -0.00791374,  0.03661019, -0.01299069, -0.08596382, -0.04253822,
         -0.00361962,  0.06174072, -0.01352644, -0.01449645,  0.01269828,
          0.01735241, -0.20955132, -0.02894094, -0.03170837, -0.06764726,
          0.00769568, -0.0131916 , -0.02506624, -0.02881981, -0.00467964,
          0.00196381,  0.00707478, -0.05154604,  0.01916509, -0.04507638,
         -0.05280991,  0.02857426,  0.03045962, -0.10022493, -0.00961345,
          0.0551477 , -0.04039474,  0.26623362, -0.00501489,  0.10437815,
         -0.01367633,  0.012

# 検索

In [256]:
import numpy as np

def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def search_by_query(query, model):
    query_vector = get_word_list_vector(query, model)
    for var_dict in all_var_pw_emb_list:
        var_dict['cos_sim'] = cos_sim(query_vector, var_dict['vector'])
    sorted_sim_var_list = sorted(all_var_pw_emb_list, key=lambda x:  x['cos_sim'], reverse=True)
    return sorted_sim_var_list

In [216]:
# query = 'num of product'.split()
# query_vector = get_word_list_vector(query, model)
# %%time
# for var_dict in all_var_pw_emb_list:
#     var_dict['cos_sim'] = cos_sim(query_vector, var_dict['vector'])
# sorted_sim_var_list = sorted(all_var_pw_emb_list, key=lambda x:  x['cos_sim'], reverse=True)

In [218]:
import pandas as pd

In [257]:
query =  'number product'.split()
sorted_sim_var_list = search_by_query(query, model)
pd.DataFrame(sorted_sim_var_list)

  after removing the cwd from sys.path.


Unnamed: 0,cos_sim,parsed_var_list,var,vector
0,0.869931,[number],number,"[-0.0046659643, 0.001378062, 0.0030083028, -0...."
1,0.847055,"[version, number]",version_number,"[0.011273102, 0.01726185, 0.01168346, -0.01914..."
2,0.841794,"[article, number]",article_number,"[0.000337685, -0.009974821, 0.018035369, -0.01..."
3,0.823203,"[line, number]",line_number,"[-0.014219443, -0.0013533442, 0.045415442, -0...."
4,0.816579,[product],product,"[-0.0028900737, 0.0009795268, 0.0039173015, 0...."
5,0.813929,"[number, funcs]",number_funcs,"[0.015324566, -0.016607199, 0.013569811, -0.03..."
6,0.808882,"[base, version, number]",base_version_number,"[0.0056491904, 0.008593014, 0.0053828023, -0.0..."
7,0.795157,"[shlib, version, number]",shlib_version_number,"[0.0095664235, -0.017093195, 0.006096334, -0.0..."
8,0.790834,"[line, number, colors]",line_number_colors,"[0.00055183005, 0.008995685, 0.06563376, -0.01..."
9,0.786364,"[negative, number, matcher]",negative_number_matcher,"[0.03865565, 0.038182076, 0.020715414, -0.0328..."
