# Description
In this notebook, we parse each name into a dependency tree and try to identify which term is the modifier and which is the modified one. We then compute the percentage of these modifier relations that use the MPM structure.

# Read data
To save time, we load only a sample of the data.

In [1]:
import json
import sqlite3

import pandas as pd

name_table = "NameTable"
# Rows drawn per author location; the original author measured roughly
# 10 seconds to load 100k rows.
SAMPLE_SIZE_PER_LOCATION = 100_000
SAMPLE_LOCATIONS = ("China", "USA")

# Left open on purpose: the abbreviation map is read through the same
# connection in a later cell.
conn = sqlite3.connect('../ZipfLawAnalysis/data.db')

# One shuffled, size-limited subquery per location, so that ORDER BY RANDOM()
# and LIMIT apply to each location separately rather than to the combined rows.
# NOTE: RANDOM() is not seeded here, so every run draws a different sample and
# the counts reported further down can differ between runs.
location_sample_sql = f"""SELECT *
FROM (
    SELECT *
    FROM {name_table}
    WHERE authorLocation = ?
    ORDER BY RANDOM()
    LIMIT ?
)"""
query = "\nUNION ALL\n".join(location_sample_sql for _ in SAMPLE_LOCATIONS) + ";"
# Flattened (location, limit) pairs, in the same order as the placeholders.
query_params = [
    value
    for location in SAMPLE_LOCATIONS
    for value in (location, SAMPLE_SIZE_PER_LOCATION)
]
df = pd.read_sql_query(query, conn, params=query_params)

# `terms` comes back as a JSON string per row; decode it into a Python list.
df['terms'] = df['terms'].apply(json.loads)
df

Unnamed: 0,id,name,nameType,nameScope,projectSize,authorName,authorProficiency,authorLocation,terms,namingConvention
0,4120668,sphere_params,variable,FunctionScope,12269,yoursmengle,>100,China,"[sphere, params]",Snake
1,2966661,test_default_load_files,function,GlobalScope,63230,chenxingqiang,>100,China,"[test, default, load, files]",Snake
2,1947457,avg_kl_loss,variable,FunctionScope,346856,dyllanwli,50..100,China,"[avg, kl, loss]",Snake
3,3040385,all_zero_samples,variable,FunctionScope,58646,chenxingqiang,>100,China,"[all, zero, samples]",Snake
4,1623071,max,variable,FunctionScope,71173,wr786,50..100,China,[max],Unknown
...,...,...,...,...,...,...,...,...,...,...
199995,5853414,uwline,variable,FunctionScope,18573,brycepg,50..100,USA,[uwline],Unknown
199996,7259206,compare,variable,FunctionScope,36024,yask123,>100,USA,[compare],Unknown
199997,4896029,hits,variable,FunctionScope,21484,scottgs,<50,USA,[hits],Unknown
199998,8030290,waiter,variable,FunctionScope,66767,itamaro,>100,USA,[waiter],Unknown


For now, abbreviations are expanded on the fly. There are many open problems with abbreviations, which I am putting off for now. We assume that the abbreviation map maps each abbreviated term to its correct expansion.

In [2]:
# Read the whole abbreviation table through the already-open connection.
abbrev_table = "AbbreviationMap"
query = f"SELECT * FROM {abbrev_table}"
df_abbrev_map = pd.read_sql_query(query, conn)

# English word list; the original note describes it as ENABLE
# (Enhanced North American Benchmark Lexicon). Every line of the file
# becomes one entry of the lookup set.
with open('../ZipfLawAnalysis/SavedFiles/atebits.txt', 'r') as file:
    words = file.read().splitlines()
english_dictionary = set(words)

# Abbreviated term -> expanded meaning.
abbrev_map = dict(zip(df_abbrev_map['term'], df_abbrev_map['abbrev_meaning']))

# Keep only entries whose meaning is not the '-1' marker.
# Original notes: single-letter abbreviations are given up because the
# prediction confidence is too low, and about 277 abbreviations that ChatGPT
# could not recognise are dropped as well. Presumably both groups are the
# ones stored as '-1' upstream -- TODO confirm; no length-based filter exists here.
# TODO (from the original author): about 20k keys differ only by
# capitalization; merging them is not done in this cell.
filtered_abbrev_map = {
    abbreviation: meaning
    for abbreviation, meaning in abbrev_map.items()
    if meaning != '-1'
}

def lookup_terms(term):
    """Return True when `term`, lower-cased, is an entry of `english_dictionary`."""
    lowered = term.lower()
    return lowered in english_dictionary

def map_terms_to_actual_terms(terms):
    """Replace each term by its entry in `filtered_abbrev_map`, if it has one.

    A term that is missing from the map (for example a plain dictionary word,
    or an abbreviation whose meaning was filtered out) is kept unchanged.
    The lookup uses the term exactly as given, so it is case-sensitive.

    Returns a new list with the same length and order as `terms`.
    """
    actual_terms = []
    for term in terms:
        actual_terms.append(filtered_abbrev_map.get(term, term))
    return actual_terms

# Expand abbreviations term by term.
df['actual_terms'] = df['terms'].map(map_terms_to_actual_terms)

# Lower-cased, underscore-joined form of the raw terms.
df['standarized_name'] = df['terms'].map('_'.join).str.lower()

# Same form for the expanded terms. An expansion may consist of several
# words, so spaces are turned into underscores as well.
temp = df['actual_terms'].map('_'.join).str.lower().str.replace(" ", "_")
df['atual_standarized_name'] = temp

# Defensive copy kept from the original, which added it so that the next
# assignment writes into a real frame rather than a possible view.
df = df.copy()
# Re-derive actual_terms from the underscore form so that multi-word
# expansions end up as separate terms (no embedded spaces).
df['actual_terms'] = df['atual_standarized_name'].str.split('_')

In [3]:
# Display the frame with the added actual_terms, standarized_name and atual_standarized_name columns.
df

Unnamed: 0,id,name,nameType,nameScope,projectSize,authorName,authorProficiency,authorLocation,terms,namingConvention,actual_terms,standarized_name,atual_standarized_name
0,4120668,sphere_params,variable,FunctionScope,12269,yoursmengle,>100,China,"[sphere, params]",Snake,"[sphere, parameters]",sphere_params,sphere_parameters
1,2966661,test_default_load_files,function,GlobalScope,63230,chenxingqiang,>100,China,"[test, default, load, files]",Snake,"[test, default, load, files]",test_default_load_files,test_default_load_files
2,1947457,avg_kl_loss,variable,FunctionScope,346856,dyllanwli,50..100,China,"[avg, kl, loss]",Snake,"[average, kullback-leibler, loss]",avg_kl_loss,average_kullback-leibler_loss
3,3040385,all_zero_samples,variable,FunctionScope,58646,chenxingqiang,>100,China,"[all, zero, samples]",Snake,"[all, zero, samples]",all_zero_samples,all_zero_samples
4,1623071,max,variable,FunctionScope,71173,wr786,50..100,China,[max],Unknown,[max],max,max
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,5853414,uwline,variable,FunctionScope,18573,brycepg,50..100,USA,[uwline],Unknown,"[universal, windows, line]",uwline,universal_windows_line
199996,7259206,compare,variable,FunctionScope,36024,yask123,>100,USA,[compare],Unknown,[compare],compare,compare
199997,4896029,hits,variable,FunctionScope,21484,scottgs,<50,USA,[hits],Unknown,[hits],hits,hits
199998,8030290,waiter,variable,FunctionScope,66767,itamaro,>100,USA,[waiter],Unknown,[waiter],waiter,waiter


# Dependency Parsing
## Example

In [4]:
# One-time setup, only needed when spaCy or the model below is not installed:
# %pip install spacy
# !python -m spacy download en_core_web_sm
import spacy

# Load the English pipeline; `nlp` is what the dependency-parsing cells call.
nlp = spacy.load('en_core_web_sm')


In [17]:

# Example: parse one sentence to show what the dependency parser returns.
doc = nlp("the dog is eating hot dog.")

# Print one row per token, each field padded to width 10:
#   token text | token index | dependency label | head token text | head token index
for token in doc:
    print(f'{token.text:{10}}{token.i:{10}} {token.dep_:{10}} {token.head.text:{10}} {token.head.i:{10}}')

the                0 det        dog                 1
dog                1 nsubj      eating              3
is                 2 aux        eating              3
eating             3 ROOT       eating              3
hot                4 amod       dog                 5
dog                5 dobj       eating              3
.                  6 punct      eating              3


In [18]:
from spacy import displacy

# Draw the dependency arcs of the example sentence parsed above.
# `page` is an on/off flag: the string "true" used previously only worked
# because any non-empty string is truthy (the string "false" would have
# behaved the same way), so pass a real boolean.
displacy.render(doc, style="dep", page=True)

## Apply the parser to names

In [7]:
# Keep only names whose expanded term list has more than two entries, and
# join those terms with spaces so the parser sees them as a phrase.
# NOTE(review): this threshold leaves out two-term names such as
# `evaluate_data`, which the count_MPM docstring uses as an example --
# confirm that `> 2` (rather than `>= 2`) is intended.
df_phrase = df.loc[df["actual_terms"].str.len().gt(2)].copy()
df_phrase["phrase"] = df_phrase["actual_terms"].map(" ".join)
df_phrase

Unnamed: 0,id,name,nameType,nameScope,projectSize,authorName,authorProficiency,authorLocation,terms,namingConvention,actual_terms,standarized_name,atual_standarized_name,phrase
1,2966661,test_default_load_files,function,GlobalScope,63230,chenxingqiang,>100,China,"[test, default, load, files]",Snake,"[test, default, load, files]",test_default_load_files,test_default_load_files,test default load files
2,1947457,avg_kl_loss,variable,FunctionScope,346856,dyllanwli,50..100,China,"[avg, kl, loss]",Snake,"[average, kullback-leibler, loss]",avg_kl_loss,average_kullback-leibler_loss,average kullback-leibler loss
3,3040385,all_zero_samples,variable,FunctionScope,58646,chenxingqiang,>100,China,"[all, zero, samples]",Snake,"[all, zero, samples]",all_zero_samples,all_zero_samples,all zero samples
7,2117835,parse_content_disposition,function,GlobalScope,11480,asapsonter,50..100,China,"[parse, content, disposition]",Snake,"[parse, content, disposition]",parse_content_disposition,parse_content_disposition,parse content disposition
8,2035907,_is_platform_dependent,function,FunctionScope,35987,asapsonter,50..100,China,"[is, platform, dependent]",Snake,"[is, platform, dependent]",is_platform_dependent,is_platform_dependent,is platform dependent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199983,5944068,idna_info,variable,FunctionScope,7142,hang10z,50..100,USA,"[idna, info]",Snake,"[internationalized, domain, names, in, applica...",idna_info,internationalized_domain_names_in_applications...,internationalized domain names in applications...
199985,6260123,test_wide_repr_wide_columns,function,FunctionScope,53314,GautamGottipati,50..100,USA,"[test, wide, repr, wide, columns]",Snake,"[test, wide, representation, wide, columns]",test_wide_repr_wide_columns,test_wide_representation_wide_columns,test wide representation wide columns
199992,7595055,fcompiler_class,variable,GlobalScope,20010,kdschlosser,>100,USA,"[fcompiler, class]",Snake,"[fortran, compiler, class]",fcompiler_class,fortran_compiler_class,fortran compiler class
199995,5853414,uwline,variable,FunctionScope,18573,brycepg,50..100,USA,[uwline],Unknown,"[universal, windows, line]",uwline,universal_windows_line,universal windows line


In [8]:
# Dependency labels (upper-cased) that count as a modifier relation.
modifiers = {'AMOD', 'ADVCL', 'ACL', 'ADVMOD', 'APPOS', 'COMPOUND', 'META', 'NEG', 'NPMOD', 'POSS', 'PREP', 'RELCL', 'POBJ'}


def count_MPM(phrase):
    '''
    Count the modifier relations in `phrase`, split by where the modifier sits.

    The phrase is parsed with `nlp`. Every token whose dependency label
    (upper-cased) is in `modifiers` is counted once:
      * as MPM when the token comes before its head,
      * as non-MPM otherwise.
    Tokens with any other label are ignored.

    Both numbers are needed because
    1. some phrases have no MPM structure at all: evaluate_data
    2. some phrases mix MPM and non-MPM relations: starting_time_of_frame

    Returns a tuple (num_mpm, num_not_mpm); intended for use with df.apply().
    '''
    before_head = after_head = 0
    for token in nlp(phrase):
        if token.dep_.upper() not in modifiers:
            continue
        if token.i < token.head.i:
            before_head += 1
        else:
            after_head += 1
    return before_head, after_head

# Parse every phrase; each result is a (num_mpm, num_not_mpm) tuple.
temp = df_phrase["phrase"].map(count_MPM)
# Renumber the rows 0..n-1 so they line up with the freshly built count frame,
# whose default index is also 0..n-1.
df_phrase = df_phrase.reset_index(drop=True)
df_phrase[["num_MPM", "num_not_MPM"]] = pd.DataFrame(
    temp.to_list(),
    columns=["num_MPM", "num_not_MPM"],
)

In [9]:
# get the chinese subset and USA subset
df_china_phrase = df_phrase[df_phrase['authorLocation'] == 'China']
df_usa_phrase = df_phrase[df_phrase['authorLocation'] == 'USA']

for x, df_x_phrase in [('China', df_china_phrase), ('USA',df_usa_phrase)]:
    num_mpm = df_x_phrase['num_MPM'].sum()
    num_non_mpm = df_x_phrase['num_not_MPM'].sum()
    total_sample = num_mpm + num_non_mpm
    percentage = num_mpm/total_sample
    print(x)
    print(f"MPM: {num_mpm: <15}non_mpm: {num_non_mpm: <15} total_sample:{total_sample:<15} percentage:{percentage: <15} ")

China
MPM: 46616          non_mpm: 9111            total_sample:55727           percentage:0.8365065408150447 
USA
MPM: 54182          non_mpm: 10010           total_sample:64192           percentage:0.8440615653040877 


The result does not look great. How about the raw terms? (This cell repeats the analysis above and overwrites its results.)

In [10]:
# Repeat the MPM analysis on the raw terms (before abbreviation expansion).
# `modifiers` and `count_MPM` are the ones defined in the earlier cell that
# analysed `actual_terms`; this cell used to carry a verbatim second copy of
# both, which silently shadowed the first definitions. They are reused here
# so there is a single definition to maintain.
# NOTE: this overwrites df_phrase, df_china_phrase and df_usa_phrase from the
# previous analysis.
df_phrase = df[df["terms"].str.len() > 2].copy()
df_phrase["phrase"] = df_phrase["terms"].apply(" ".join)

# Each result is a (num_mpm, num_not_mpm) tuple.
temp = df_phrase["phrase"].apply(count_MPM)
# Renumber rows 0..n-1 so they line up with the count frame built below,
# whose default index is also 0..n-1.
df_phrase = df_phrase.reset_index(drop=True)
df_phrase[["num_MPM", "num_not_MPM"]] = pd.DataFrame(temp.to_list(), columns=["num_MPM", "num_not_MPM"])

# get the Chinese subset and the USA subset
df_china_phrase = df_phrase[df_phrase['authorLocation'] == 'China']
df_usa_phrase = df_phrase[df_phrase['authorLocation'] == 'USA']

# For each location: total MPM / non-MPM relation counts and the MPM share.
for x, df_x_phrase in [('China', df_china_phrase), ('USA',df_usa_phrase)]:
    num_mpm = df_x_phrase['num_MPM'].sum()
    num_non_mpm = df_x_phrase['num_not_MPM'].sum()
    total_sample = num_mpm + num_non_mpm
    percentage = num_mpm/total_sample
    print(x)
    print(f"MPM: {num_mpm: <15}non_mpm: {num_non_mpm: <15} total_sample:{total_sample:<15} percentage:{percentage: <15} ")

China
MPM: 27597          non_mpm: 5727            total_sample:33324           percentage:0.8281418797263234 
USA
MPM: 34239          non_mpm: 5322            total_sample:39561           percentage:0.8654735724577235 
