In [10]:
%matplotlib inline
import matplotlib.pyplot as plt

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import json
from tqdm import tqdm as tq
import pickle

In [11]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim import models

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Shared Porter stemmer instance used by lemmatize_stemming below.
stemmer = PorterStemmer()
def lemmatize_stemming(text):
    """Reduce a single word to a stemmed verb lemma.

    The word is first lemmatized with WordNet, treating it as a verb
    (``pos='v'``), and the resulting lemma is then passed through the
    module-level Porter ``stemmer``.

    Args:
        text: The word to normalize.

    Returns:
        The stemmed lemma as returned by ``stemmer.stem``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)
def preprocess(text):
    """Tokenize ``text`` and return stemmed lemmas of its informative tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    if it is not a gensim stopword and is longer than 3 characters; each kept
    token is normalized with ``lemmatize_stemming``.

    Args:
        text: Raw text to tokenize.

    Returns:
        A list of normalized tokens, in their original order.
    """
    # The length comparison must bind to len(token) alone. The earlier form
    # `(not-stopword and len(token)) > 3` compared the result of `and` with 3
    # and only worked because `False > 3` happens to be False.
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
    ]

In [13]:
def camel_case_split(identifier):
    """Split an identifier at its camelCase boundaries.

    A boundary is recognized between a lowercase letter and a following
    uppercase letter, and between an uppercase letter and a following
    uppercase+lowercase pair (so a run of capitals stays together as one
    piece, e.g. "HTTPResponse" -> ["HTTP", "Response"]).

    Args:
        identifier: The string to split.

    Returns:
        A list of the pieces in order; an empty string yields an empty list.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    pieces = []
    for hit in re.finditer(boundary_pattern, identifier):
        pieces.append(hit.group(0))
    return pieces
def unicodeToAscii(s):
    """Strip combining accent marks from ``s``.

    The string is decomposed with Unicode NFD normalization and every
    character in category ``Mn`` (nonspacing mark) is dropped. Characters
    that have no canonical decomposition are left untouched, so the result
    is not guaranteed to be pure ASCII.

    Args:
        s: The text to process.

    Returns:
        The text with nonspacing marks removed.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)
def normalizeString(s):
    """Reduce ``s`` to letters, ``!`` and ``?`` separated by single spaces.

    Steps, in order:
      1. Trim surrounding whitespace and strip accent marks via
         ``unicodeToAscii``. Case is preserved (no lowercasing here).
      2. Insert a space before each ``.``, ``!`` or ``?``.
      3. Collapse every run of characters other than ASCII letters, ``!``
         and ``?`` into one space. Periods are not in the kept set, so they
         are removed by this step.

    Args:
        s: The raw text.

    Returns:
        The cleaned string; it may start or end with a space.
    """
    without_accents = unicodeToAscii(s.strip())
    punctuation_spaced = re.sub(r"([.!?])", r" \1", without_accents)
    return re.sub(r"[^a-zA-Z!?]+", r" ", punctuation_spaced)

In [14]:
def parse_header(header):
    """Normalize a header string into a lowercase, space-joined token string.

    The text is cleaned with ``normalizeString``, split on single spaces,
    each word is further split at camelCase boundaries, every resulting
    piece is passed through ``lemmatize_stemming``, and the pieces are
    joined with spaces and lowercased. Duplicate tokens are kept.

    Args:
        header: The raw text to normalize.

    Returns:
        The normalized string.
    """
    cleaned = normalizeString(header)
    # Empty words (from leading/repeated spaces) contribute no pieces.
    pieces = [
        part
        for word in cleaned.split(' ')
        for part in camel_case_split(word)
    ]
    normalized = [lemmatize_stemming(piece) for piece in pieces]
    return ' '.join(normalized).lower()

In [15]:
def get_pairs(filename, min_tokens=3, max_tokens=350):
    """Load a JSON-lines file and build normalized [simple, comment, ...] pairs.

    Each non-blank line of ``filename`` must be a JSON object with the keys
    ``"simple"``, ``"comment"``, ``"coherence"`` and ``"code"``. The
    ``simple`` and ``comment`` fields are normalized with ``parse_header``;
    a record is kept only when both normalized strings contain between
    ``min_tokens`` and ``max_tokens`` whitespace-separated tokens (inclusive).

    Args:
        filename: Path to the UTF-8 encoded JSON-lines file.
        min_tokens: Minimum token count required of both strings. The
            default of 3 matches the previous hard-coded ``> 2`` check.
        max_tokens: Maximum token count allowed for both strings. The
            default of 350 matches the previous hard-coded ``< 351`` check.

    Returns:
        A list of ``[simple, comment, coherence, index, code]`` lists, where
        ``coherence`` is a float and ``index`` is a 1-based counter over the
        kept records only.

    Raises:
        KeyError: If a record lacks one of the required keys.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(filename, encoding='utf-8') as handle:
        lines = handle.read().strip().split('\n')
    print("reading ... ")

    pairs = []
    index = 1
    for line in tq(lines):
        # An empty file (or stray blank line) would otherwise crash json.loads.
        if not line.strip():
            continue
        # Decode each line once instead of once per field.
        record = json.loads(line)
        simple = parse_header(record["simple"])
        comment = parse_header(record["comment"])
        coherence = float(record["coherence"])
        code = record["code"]

        simple_len = len(simple.split())
        comment_len = len(comment.split())
        if (min_tokens <= simple_len <= max_tokens
                and min_tokens <= comment_len <= max_tokens):
            pairs.append([simple, comment, coherence, index, code])
            index += 1
    return pairs

In [19]:
# Parse "ast_test_nobug.json" into a list of [simple, comment, coherence, index, code]
# entries (presumably the bug-free test examples, judging by the filename — confirm).
test_pairs_nobug = get_pairs("ast_test_nobug.json")

  0%|          | 0/2314 [00:00<?, ?it/s]

reading ... 


100%|██████████| 2314/2314 [00:06<00:00, 343.53it/s]


In [20]:
# Parse "ast_test_bug.json" into a list of [simple, comment, coherence, index, code]
# entries (presumably the buggy test examples, judging by the filename — confirm).
test_pairs_bug = get_pairs("ast_test_bug.json")

100%|██████████| 27/27 [00:00<00:00, 322.11it/s]

reading ... 





In [23]:
# Serialize the parsed no-bug pairs to "test_pairs_nobug.pkl" with pickle.
# NOTE(review): pickle files should only be loaded back from trusted locations,
# since unpickling can execute arbitrary code.
with open("test_pairs_nobug.pkl", "wb") as f:
    pickle.dump(test_pairs_nobug, f)

In [24]:
# Serialize the parsed bug pairs to "test_pairs_bug.pkl" with pickle.
# NOTE(review): pickle files should only be loaded back from trusted locations,
# since unpickling can execute arbitrary code.
with open("test_pairs_bug.pkl", "wb") as f:
    pickle.dump(test_pairs_bug, f)