In [1]:
import pandas as pd
from pyxdameraulevenshtein import damerau_levenshtein_distance as distance
from lookout.style.typos.analyzer import IdTyposAnalyzer

In [2]:
df = pd.read_csv("typos_dataset.csv.gz", header=None)
print("Number of samples is", df.shape[0])

Number of samples is 2471


In [3]:
df.columns = "id,correct_id,file,line,commit_hash,repo".split(",")
df.head()

Unnamed: 0,id,correct_id,file,line,commit_hash,repo
0,INERNET_TEST_URL,INTERNET_TEST_URL,run.py,264,74c662301e30c1a590dde50dbba5c17bab21c2ed,topless/gae-init
1,emplyeeList,employeeList,Websites/RuffCo-website/App_Code/DBHandlers/DB...,15,37c9210c421793d81cdb74f3c722ef817b2e9797,claytonr1/TeamMefford
2,includeKeword,includeKeyword,dcm4che-core/src/main/java/org/dcm4che3/io/XSL...,93,d99055a9ceb5a210b9591993f7d32ba7c1aa9626,dcm4che/dcm4che
3,typeAheadMinimumCharacterCount,typeAheadMinimumCharaceterCount,Frameworks/D2W/ERModernDirectToWeb/Sources/er/...,375,9ec14ca30d5792567bf73c945f38aa1028fed5c6,fbarthez/wonder
4,versionAvailabe,versionAvailable,src/ch/jbead/JBeadFrame.java,1187,eddb65c3e083334339a09d59fea78c3a105ba8c4,damianbrunold/jbead


In [4]:
deduplicated_df = df.drop_duplicates(subset=['id', 'correct_id'], keep="first")
print("Number of samples after deduplication", deduplicated_df.shape[0])

Number of samples after deduplication 2155


In [5]:
splitter = IdTyposAnalyzer.create_token_parser()

def check_line(line):
    identifier = line.id
    correct_id = line.correct_id
    tokens = list(splitter.split(identifier))
    corr_tokens = list(splitter.split(correct_id))
    if len(tokens) != len(corr_tokens):
        return "Number of subtokens is different"
    if not len(tokens):
        return "Identifier without alphabetic characters"
    res = []
    for t, ct in zip(tokens, corr_tokens):
        if distance(t, ct) > 2:
            res.append((t, ct))
    if res:
        return "Suspicious tokens %s" % res
    return ""

In [6]:
deduplicated_df["check"] = deduplicated_df.apply(check_line, axis=1)
diff_tokens = deduplicated_df[deduplicated_df["check"].str.startswith("Number")]
print("Number of samples with different number of tokens ", diff_tokens.shape[0])
suspicious_tokens = deduplicated_df[deduplicated_df["check"].str.startswith("Suspicious")]
print("Number of samples with big Demerau-Levenshtein distance", suspicious_tokens.shape[0])
no_alpha_tokens = deduplicated_df[deduplicated_df["check"].str.startswith("Identifier")]
print("Number of samples without alphabetic characters", no_alpha_tokens.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Number of samples with different number of tokens  203
Number of samples with big Demerau-Levenshtein distance 93
Number of samples without alphabetic characters 3


In [7]:
diff_tokens[["id", "correct_id", "check"]].head()

Unnamed: 0,id,correct_id,check
30,Zend_Filter_Htmlentities,Zend_Filter_HtmlEntities,Number of subtokens is different
58,getServerURl,getServerURL,Number of subtokens is different
65,startswith,startsWith,Number of subtokens is different
67,expected_to,to,Number of subtokens is different
68,createnewAppInstance,createNewAppInstance,Number of subtokens is different


In [8]:
suspicious_tokens[["id", "correct_id", "check"]].head()

Unnamed: 0,id,correct_id,check
36,should_be_two_failures,should_be_three_failures,"Suspicious tokens [('two', 'three')]"
71,longentropysel,entropysel,"Suspicious tokens [('longentropysel', 'entropy..."
80,LocalizedNameImpl,LocalizedURIImpl,"Suspicious tokens [('name', 'urii')]"
81,LocalizedNameMarshaller,LocalizedURIMarshaller,"Suspicious tokens [('name', 'urim')]"
82,LocalizedNameUnmarshaller,LocalizedURIUnmarshaller,"Suspicious tokens [('name', 'uriu')]"


In [9]:
no_alpha_tokens[["id", "correct_id", "check"]].head()

Unnamed: 0,id,correct_id,check
416,_72,_54,Identifier without alphabetic characters
417,_37,_10,Identifier without alphabetic characters
418,_51,_72,Identifier without alphabetic characters


In [10]:
to_save = deduplicated_df[deduplicated_df["check"] == ""]["id,correct_id,file,line,commit_hash,repo".split(",")]
print("Number of samples after deduplication and filtering", to_save.shape[0])
to_save.to_csv("typos_dataset_filtered_damerau_levenshtein.csv.gz", index=False, header=False, compression="gzip")

Number of samples after deduplication and filtering 1856
