# Description
In this file we will examine the percentage of plural and non plural. Also the precentage of usage of tenses. 

# Read data
In order to save time, we will only load some part of the data. 

In [1]:
import pandas as pd
import sqlite3
name_table = "NameTable"
conn = sqlite3.connect('../ZipfLawAnalysis/data.db')
query = f"""SELECT *
FROM (
    SELECT *
    FROM {name_table}
    WHERE authorLocation = 'China'
    ORDER BY RANDOM()
    LIMIT 100000
) AS Chinese_sample
UNION ALL
SELECT *
FROM (
    SELECT *
    FROM {name_table}
    WHERE authorLocation = 'USA'
    ORDER BY RANDOM()
    LIMIT 100000
) AS USA_sample;"""
# takes 10 second to run 100k
df = pd.read_sql_query(query, conn)
import json
df['terms'] = df.terms.apply(json.loads)


In [2]:
abbrev_table = "AbbreviationMap"
query = f"SELECT * FROM {abbrev_table}"
df_abbrev_map = pd.read_sql_query(query, conn)

# I will use a better dictionary: ENABLE (Enhanced North American Benchmark Lexicon)
with open('../ZipfLawAnalysis/SavedFiles/atebits.txt', 'r') as file:
    words = file.read().splitlines()
english_dictionary =  set(words)

# the dictionary that maps abbreviation back to original words
abbrev_map = dict(zip(df_abbrev_map['term'], df_abbrev_map['abbrev_meaning']))
# because the confidence of preicting single letter is too low, I would give up all the single letters
# also there are ones that ChatGPT cannot recognize, generally too wierd ones, so I will get rid of those too. (277 of them)
# also there are about 20k duplicates due to capitalization, here we will combine them together first. ???
filtered_abbrev_map = {k: v for k, v in abbrev_map.items() if v != '-1'}

# function that checks if it's a real word
def lookup_terms(term):
    return term.lower() in english_dictionary

def map_terms_to_actual_terms(terms):
    # if it's dictionary word, it will not be in the dictionary, or it might be something that GPT cannot guess. 
    # either way, the original terms will be in the list. Else, the translated terms will be in the list.
    return [filtered_abbrev_map.get(term, term) for term in terms]

df['actual_terms'] = df['terms'].apply(map_terms_to_actual_terms)   

temp = df['terms'].apply('_'.join).str.lower()
df['standarized_name'] = temp

temp = df['actual_terms'].apply('_'.join).str.lower().str.replace(" ", "_")
df['atual_standarized_name'] = temp

# we use atual_standarized_name to define actual_terms so that we can get rid of the space
# sometimes pd will return view of df not the actual df, depends on the RAM
df = df.copy()
df['actual_terms'] = df['atual_standarized_name'].apply(lambda x: x.split('_'))

# phrase is basically the atual_standarized_name but connect by space
df["phrase"] = df["actual_terms"].apply(" ".join)
df

Unnamed: 0,id,name,nameType,nameScope,projectSize,authorName,authorProficiency,authorLocation,terms,namingConvention,actual_terms,standarized_name,atual_standarized_name,phrase
0,1455146,test_shadowing_for_tuple_1,function,FunctionScope,71103,wr786,50..100,China,"[test, shadowing, for, tuple]",Snake,"[test, shadowing, for, tuple]",test_shadowing_for_tuple,test_shadowing_for_tuple,test shadowing for tuple
1,3334420,test_constant_strategy_regressor,function,GlobalScope,105670,zhongmicai,>100,China,"[test, constant, strategy, regressor]",Snake,"[test, constant, strategy, regressor]",test_constant_strategy_regressor,test_constant_strategy_regressor,test constant strategy regressor
2,3953784,get_statement_hint_text,function,FunctionScope,11199,chenlongzhen,>100,China,"[get, statement, hint, text]",Snake,"[get, statement, hint, text]",get_statement_hint_text,get_statement_hint_text,get statement hint text
3,3113064,_set_interrupt,function,FunctionScope,59359,holynova,>100,China,"[set, interrupt]",Snake,"[set, interrupt]",set_interrupt,set_interrupt,set interrupt
4,36587,tempdir,function,FunctionScope,91720,brightmart,<50,China,[tempdir],Unknown,"[temporary, directory]",tempdir,temporary_directory,temporary directory
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,5550380,end,variable,FunctionScope,14462,vivekaxl,50..100,USA,[end],Unknown,[end],end,end,end
199996,5593790,__radd__,function,FunctionScope,14462,vivekaxl,50..100,USA,[radd],Snake,"[right, add]",radd,right_add,right add
199997,6758081,culprit,variable,GlobalScope,7510,asweigart,>100,USA,[culprit],Unknown,[culprit],culprit,culprit,culprit
199998,4237553,prob,variable,FunctionScope,3239,dqwang122,<50,USA,[prob],Unknown,[prob],prob,prob,prob


# Plural and Tense Detection
We are using the princeton wordnet instead of the NLTK wordnet? Let's use the NLTK one for now
But we are suppose to use the NLTK for pos tagging (We could also use spacy as in MPM.ipynb)

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')


In [4]:
def pos_tagging(sentence):
    # Tokenize the sentence into words
    words = word_tokenize(sentence)
    # Perform POS tagging on the tokenized words
    pos_tags = pos_tag(words)
    return pos_tags


df['pos_tag'] = df["phrase"].apply(pos_tagging)



In [5]:
from nltk.corpus import wordnet
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def count_plural_and_tense(list_word_pos):
    plural_count, tense_count = 0, 0
    num_n, num_v = 0, 0
    for word, pos in list_word_pos:
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        singular = wordnet.morphy(word, pos=wordnet_pos)
        if singular is not None and singular != word:
            # if it's a noun, then it's plural, if it's a verb, then it's different tense
            if wordnet_pos == wordnet.NOUN:
                plural_count += 1
                # print("noun", word, singular) # for debug
            if wordnet_pos == wordnet.VERB:
                tense_count += 1
                # print('verb', word, singular) # for debug
        if wordnet_pos == wordnet.NOUN:
            num_n += 1
        if wordnet_pos == wordnet.VERB:
            num_v += 1

    return plural_count, num_n, tense_count, num_v
        
temp = df['pos_tag'].apply(count_plural_and_tense)


In [6]:
df[["num_plural", "total_n", "num_tense", "total_v"]] = pd.DataFrame(temp.to_list(), columns=["num_plural", "total_n", "num_tense", "total_v"]).copy()

In [7]:
# get the chinese subset and USA subset
df_china = df[df['authorLocation'] == 'China']
df_usa = df[df['authorLocation'] == 'USA']

for x, df_x in [('China', df_china), ('USA',df_usa)]:
    num_plural = df_x['num_plural'].sum()
    num_tense = df_x['num_tense'].sum()
    total_n = df_x['total_n'].sum()
    total_v = df_x['total_v'].sum()
    percentage_plural = num_plural/total_n
    percentage_tense = num_tense/total_v
    print(x)
    print(f"num_plural: {num_plural: <15} total_n: {total_n:<15} percentage_plural: {percentage_plural: <15}")
    print(f"num_tense: {num_tense:<15} total_v: {total_v:<15} percentage:{percentage_tense: <15} ")



China
num_plural: 14671           total_n: 166769          percentage_plural: 0.08797198520108654
num_tense: 7487            total_v: 17161           percentage:0.4362799370666045 
USA
num_plural: 15120           total_n: 171566          percentage_plural: 0.08812934963804017
num_tense: 8333            total_v: 17876           percentage:0.46615573953904677 


Okay.... Seems pretty bad, lets see the original terms....

In [8]:
df["phrase"] = df["terms"].apply(" ".join)
df['pos_tag'] = df["phrase"].apply(pos_tagging)
temp = df['pos_tag'].apply(count_plural_and_tense)
df[["num_plural", "total_n", "num_tense", "total_v"]] = pd.DataFrame(temp.to_list(), columns=["num_plural", "total_n", "num_tense", "total_v"]).copy()# get the chinese subset and USA subset
df_china = df[df['authorLocation'] == 'China']
df_usa = df[df['authorLocation'] == 'USA']

for x, df_x in [('China', df_china), ('USA',df_usa)]:
    num_plural = df_x['num_plural'].sum()
    num_tense = df_x['num_tense'].sum()
    total_n = df_x['total_n'].sum()
    total_v = df_x['total_v'].sum()
    percentage_plural = num_plural/total_n
    percentage_tense = num_tense/total_v
    print(x)
    print(f"num_plural: {num_plural: <15} total_n: {total_n:<15} percentage_plural: {percentage_plural: <15}")
    print(f"num_tense: {num_tense:<15} total_v: {total_v:<15} percentage:{percentage_tense: <15} ")


China
num_plural: 10617           total_n: 152583          percentage_plural: 0.0695818013802324
num_tense: 4833            total_v: 12366           percentage:0.39082969432314413 
USA
num_plural: 10144           total_n: 156170          percentage_plural: 0.064954856886726
num_tense: 4990            total_v: 11657           percentage:0.42806897143347344 
