## Description
This file is used to check the distribution of words used in each project, and then compare the results in various ways. 

## Read Files
First we read the results of previous work, in which we collected Python files from the internet, parsed them to get all the variable and function names (and their scope, which might be useful later), and then split those names into terms (in a primitive way). 

In [1]:
import pandas as pd
import sqlite3
name_table = "NameTable"
# name_table is a constant defined on the line above, not user input, so
# formatting it into the SQL text is acceptable here.
query = f"SELECT * FROM {name_table}"
# Close the connection as soon as the table has been read (even if the query
# raises) so the kernel does not keep a handle on data.db open.
conn = sqlite3.connect('data.db')
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()
import json
# The "terms" column is stored as JSON text; decode every cell into a Python object.
df['terms'] = df['terms'].apply(json.loads)
df

KeyboardInterrupt: 

In [None]:
import nltk
# nltk.download('words')
# nltk.download('brown')

## Average Length of Names

Maybe we should save this back to dataframe

### Length by Letter

In [None]:
import re
def count_length_by_letter(name):
    """Return how many characters of ``name`` remain after removing 0-9 and underscores."""
    # Strip the digits and underscores first; whatever is left is what gets counted.
    return len(re.sub(r'[0-9_]', '', name))

# Two length measures per name: characters (digits/underscores excluded) and parsed terms.
df["lengthByLetter"] = df['name'].apply(count_length_by_letter)
df['lengthByWord'] = df['terms'].apply(len)
# Keep only names that have at least one counted character and at least one term.
df = df.loc[df['lengthByLetter'].gt(0) & df['lengthByWord'].gt(0)]

In [None]:
# Print the mean letter-length of names for each author location, heading first.
for heading, location in (
    ("Average Length (by letter) of Chinese-speaking Programmers are: ", 'China'),
    ("Average Length (by letter) of English-speaking Programmers are: ", 'USA'),
):
    print(heading)
    print(df.loc[df['authorLocation'] == location, 'lengthByLetter'].mean())

Let's add the Brown corpus data. For each length k, the frequency is (number of words with length k) / (total number of words). 

In [None]:
from nltk.corpus import brown
# Token list of the Brown corpus and its size (the normalizing constant).
brown_words = brown.words()
total_words = len(brown_words)
# Relative frequency of every distinct token.
dic_brown_word_freq = {
    word: freq / total_words
    for word, freq in nltk.FreqDist(brown_words).items()
}
# One row per token: the token in "word", its relative frequency in "proportion".
df_brown_word_freq = pd.DataFrame.from_dict(dic_brown_word_freq, orient='index').reset_index()
df_brown_word_freq.columns = ['word', 'proportion']
# Sum the token proportions per token length to get the length distribution.
df_brown_word_freq['lengthByLetter'] = df_brown_word_freq['word'].apply(len)
df_brown_letter_len_freq = (
    df_brown_word_freq
    .groupby('lengthByLetter')['proportion']
    .sum()
)

In [None]:

# Share of names at each letter-length, per author location, ordered by length.
df_chinese_letter_len_freq = (
    df.loc[df['authorLocation'] == 'China', 'lengthByLetter']
    .value_counts(normalize=True)
    .sort_index()
)
df_english_letter_len_freq = (
    df.loc[df['authorLocation'] == 'USA', 'lengthByLetter']
    .value_counts(normalize=True)
    .sort_index()
)


In [None]:
import matplotlib.pyplot as plt
# Put the three distributions on one shared length index before plotting.
# Separate Series.plot(kind='bar') calls draw each series at positions
# 0..n-1 of its own index, so series with different sets of lengths would be
# drawn against the wrong x positions when overlaid.
letter_len_freq = pd.concat(
    {
        'Brown': df_brown_letter_len_freq,
        'China': df_chinese_letter_len_freq,
        'USA': df_english_letter_len_freq,
    },
    axis=1,
).sort_index().fillna(0)
ax = letter_len_freq.plot(kind='bar', color=['green', 'blue', 'red'])
# Adding labels and title
ax.set_xlabel('Name length (letters)')
ax.set_ylabel('Relative Frequency')
ax.set_title('Normalized Frequency of Name Length (by letter)')
ax.legend()
# Show at most ~7 x labels. Bar axes carry a fixed list of tick labels that is
# matched to ticks by order, so thinning is done by picking every k-th
# position together with its own label instead of swapping in a new locator.
tick_step = max(1, -(-len(letter_len_freq) // 7))  # ceiling division
ax.set_xticks(list(range(0, len(letter_len_freq), tick_step)))
ax.set_xticklabels(letter_len_freq.index[::tick_step])

# Showing the plot
plt.show()

The cdf version

In [None]:
import matplotlib.pyplot as plt
# Align China and USA on a shared length index first (a length missing from
# one group contributes 0), then accumulate. Plotting the two series with
# separate bar calls would place bars by position, not by length value.
# The Brown series is not included in this overlay.
letter_len_cdf = pd.concat(
    {
        'China': df_chinese_letter_len_freq,
        'USA': df_english_letter_len_freq,
    },
    axis=1,
).sort_index().fillna(0).cumsum()
ax = letter_len_cdf.plot(kind='bar', color=['blue', 'red'])
# Adding labels and title
ax.set_xlabel('Name length (letters)')
ax.set_ylabel('Cumulative Relative Frequency')
ax.set_title('Cumulative Distribution of Name Length (by letter)')
ax.legend()
# Show at most ~7 x labels: keep every k-th bar position with its own label,
# because bar tick labels are a fixed list matched to ticks by order.
tick_step = max(1, -(-len(letter_len_cdf) // 7))  # ceiling division
ax.set_xticks(list(range(0, len(letter_len_cdf), tick_step)))
ax.set_xticklabels(letter_len_cdf.index[::tick_step])

# Showing the plot
plt.show()

Plot them separately

In [None]:
import matplotlib.pyplot as plt
fig, axs = plt.subplots(3)

# One panel per distribution, each with its own plotting options.
panels = [
    (df_brown_letter_len_freq, {'color': 'green', 'label': 'Brown'}),
    (df_chinese_letter_len_freq, {'color': 'blue', 'label': 'China'}),
    (df_english_letter_len_freq, {'color': 'red', 'label': 'USA', 'alpha': 0.7}),
]

for ax, (length_freq, style) in zip(axs, panels):
    length_freq.plot(kind='bar', ax=ax, **style)
    # show legend
    ax.legend()
    # Show at most ~7 x labels. Bar tick labels are a fixed list matched to
    # ticks by order, so keep every k-th bar position with its own label
    # rather than replacing the locator (which would attach the first few
    # labels to the wrong bars).
    tick_step = max(1, -(-len(length_freq) // 7))  # ceiling division
    ax.set_xticks(list(range(0, len(length_freq), tick_step)))
    ax.set_xticklabels(length_freq.index[::tick_step])


# Showing the plot
plt.show()

In [None]:
# Store the CDF under its own name. Overwriting df_brown_letter_len_freq with
# its cumulative sum would make this cell change its result on every re-run
# and would make any later `.cumsum()` on that variable accumulate twice.
df_brown_letter_len_cdf = df_brown_letter_len_freq.cumsum()
df_brown_letter_len_cdf

Let's see the CDF version

In [None]:
import matplotlib.pyplot as plt
fig, axs = plt.subplots(3)

# One panel per cumulative distribution, each with its own plotting options.
# NOTE(review): this assumes each *_letter_len_freq variable still holds
# per-length frequencies (not values that were already accumulated) - verify.
panels = [
    (df_brown_letter_len_freq.cumsum(), {'color': 'green', 'label': 'Brown'}),
    (df_chinese_letter_len_freq.cumsum(), {'color': 'blue', 'label': 'China'}),
    (df_english_letter_len_freq.cumsum(), {'color': 'red', 'label': 'USA', 'alpha': 0.7}),
]

for ax, (length_cdf, style) in zip(axs, panels):
    length_cdf.plot(kind='bar', ax=ax, **style)
    # show legend
    ax.legend()
    # Show at most ~7 x labels. Bar tick labels are a fixed list matched to
    # ticks by order, so keep every k-th bar position with its own label
    # rather than replacing the locator.
    tick_step = max(1, -(-len(length_cdf) // 7))  # ceiling division
    ax.set_xticks(list(range(0, len(length_cdf), tick_step)))
    ax.set_xticklabels(length_cdf.index[::tick_step])


# Showing the plot
plt.show()



### Length by word

In [None]:
# Print the mean number of terms per name for each author location, heading first.
for heading, location in (
    ("Average Length (by word) of Chinese-speaking Programmers are: ", 'China'),
    ("Average Length (by word) of English-speaking Programmers are: ", 'USA'),
):
    print(heading)
    print(df.loc[df['authorLocation'] == location, 'lengthByWord'].mean())

In [None]:
# Share of names at each word-length (number of terms), per author location, ordered by length.
df_chinese_word_len_freq = (
    df.loc[df['authorLocation'] == 'China', 'lengthByWord']
    .value_counts(normalize=True)
    .sort_index()
)
df_english_word_len_freq = (
    df.loc[df['authorLocation'] == 'USA', 'lengthByWord']
    .value_counts(normalize=True)
    .sort_index()
)


In [None]:
# Align both groups on a shared word-length index before plotting: separate
# bar calls place bars by position within each series, so a length present in
# only one group would shift every later bar of the other group.
word_len_freq = pd.concat(
    {
        'China': df_chinese_word_len_freq,
        'USA': df_english_word_len_freq,
    },
    axis=1,
).sort_index().fillna(0)
ax = word_len_freq.plot(kind='bar', color=['blue', 'red'])
# Adding labels and title
ax.set_xlabel('Name length (words)')
ax.set_ylabel('Relative Frequency')
ax.set_title('Normalized Frequency of Name Length (by word)')
ax.legend()
# Showing the plot
plt.show()

the cdf

In [None]:
# Align both groups on a shared word-length index (a length missing from one
# group contributes 0) and only then accumulate, so both CDFs are drawn
# against the same x positions.
word_len_cdf = pd.concat(
    {
        'China': df_chinese_word_len_freq,
        'USA': df_english_word_len_freq,
    },
    axis=1,
).sort_index().fillna(0).cumsum()
ax = word_len_cdf.plot(kind='bar', color=['blue', 'red'])
# Adding labels and title
ax.set_xlabel('Name length (words)')
ax.set_ylabel('Cumulative Relative Frequency')
ax.set_title('Cumulative Distribution of Name Length (by word)')
ax.legend()
# Showing the plot
plt.show()

### Length by letter X by word

Here we are trying to recreate Nitsan's Figure 3

In [None]:
def plot_letterXword(df_language):
    """Show a stacked bar chart of name counts per letter-length, split by word-length.

    ``df_language`` must contain the columns "lengthByLetter" and "lengthByWord".
    """
    # Count how many names fall into every (letter-length, word-length) pair.
    pair_counts = (
        df_language
        .groupby(['lengthByLetter', 'lengthByWord'])
        .size()
        .reset_index(name='count')
    )
    # Rows are letter-lengths, columns are word-lengths; absent pairs count as 0.
    stacked = pair_counts.pivot(
        index='lengthByLetter', columns='lengthByWord', values='count'
    ).fillna(0)
    ax = stacked.plot(kind='bar', stacked=True)
    ax.set_xlabel('Length by Letter')
    ax.set_ylabel('Count')
    ax.set_title('Stacked Bar Plot of Length by Letter with Length by Word')
    ax.legend(title='Length by Word')
    plt.show()

# Rows whose authorLocation is 'China'.
df_chinese = df[df['authorLocation'] == 'China']

# Stacked letter-length / word-length count plot for that subset.
plot_letterXword(df_chinese)

In [None]:
# Rows whose authorLocation is 'USA', plotted the same way as the 'China' subset.
df_english = df[df['authorLocation'] == 'USA']
plot_letterXword(df_english)


## Most Common Terms

Now let's see the number of occurrences for each word. I mean, let's first see the top 40. 

In [None]:
# One entry per term occurrence in names from rows with authorLocation 'China'.
flattened_words_chinese = df.loc[df['authorLocation'] == 'China', 'terms'].explode()
# Number of occurrences of each distinct term, most frequent first.
word_counts_chinese = flattened_words_chinese.value_counts()

In [None]:
# Display the 40 most frequent terms for the 'China' subset.
word_counts_chinese.head(40)

In [None]:
# One entry per term occurrence in names from rows with authorLocation 'USA'.
flattened_words_english = df.loc[df['authorLocation'] == 'USA', 'terms'].explode()
# Number of occurrences of each distinct term, most frequent first.
word_counts_english = flattened_words_english.value_counts()

In [None]:
# Display the 40 most frequent terms for the 'USA' subset.
word_counts_english.head(40)

todo: single letter names - states - are they the same ones? Need to wait until we have the full data

In [None]:
# Build two-column frames ("terms", "frequency") from the term counts.
# The column names are set explicitly instead of renaming a "count" column:
# the name value_counts() gives its result differs between pandas versions,
# and a rename that matches nothing would silently leave the wrong headers.
df_chinese_word_freq = word_counts_chinese.rename_axis('terms').reset_index(name='frequency')
# normalize word frequency so the column sums to 1
df_chinese_word_freq['frequency'] = df_chinese_word_freq['frequency'] / df_chinese_word_freq['frequency'].sum()

df_english_word_freq = word_counts_english.rename_axis('terms').reset_index(name='frequency')
df_english_word_freq['frequency'] = df_english_word_freq['frequency'] / df_english_word_freq['frequency'].sum()


## Zipf's Law

### Browns

Let's generate the reference word frequencies: I will use the Brown corpus from NLTK. We could also try COCA, which is what Nitsan used.

In [None]:
from nltk.corpus import brown
# Token list of the Brown corpus and its size (the normalizing constant).
brown_words = brown.words()
total_words = len(brown_words)
# Relative frequency of every distinct token.
dic_brown_word_freq = {
    word: freq / total_words
    for word, freq in nltk.FreqDist(brown_words).items()
}
# One row per token, using the same "terms"/"frequency" column names as the
# other frequency tables in this notebook.
df_brown_word_freq = pd.DataFrame.from_dict(dic_brown_word_freq, orient='index').reset_index()
df_brown_word_freq.columns = ['terms', 'frequency']
df_brown_word_freq

### Rank by Full name
In the most common full terms section, we have ranked all the terms. But we also need the rank of all the full names. 
Now we need the frequency for names: We have parsed a name into terms, so we will define a name as lower cased terms concatenated by underscore. And then we rank the names by frequency. 

In [None]:
# Work on an explicit copy: df may be a filtered view of an earlier frame, and
# adding a column to such a view is unreliable.
df = df.copy()
# Standardized name = the name's terms joined by underscores, lower-cased.
df['standarized_name'] = df['terms'].apply('_'.join).str.lower()

Now we are going to take the standardized names and compute their frequency by nationality. Notice that we have several frequency tables right now: 
1. df_XXX_letter_len_freq: frequency of name length, counted in letters
2. df_XXX_word_len_freq: frequency of name length, counted in words
3. df_XXX_word_freq: frequency of each word (split all names into words and count each word)
4. df_XXX_std_name_freq: frequency of each full name (lower-cased words joined by underscores)

In [None]:
# Relative frequency of every standardized name, per author location, sorted by name.
# The output columns are named explicitly ("terms", "frequency") instead of
# renaming a "proportion" column: the name value_counts(normalize=True) gives
# its result differs between pandas versions, and a rename that matches
# nothing would silently leave the wrong headers.
series_chinese_std_name_freq = df[df['authorLocation'] == 'China']['standarized_name'].value_counts(normalize=True).sort_index()
df_chinese_std_name_freq = series_chinese_std_name_freq.rename_axis('terms').reset_index(name='frequency')

series_english_std_name_freq = df[df['authorLocation'] == 'USA']['standarized_name'].value_counts(normalize=True).sort_index()
df_english_std_name_freq = series_english_std_name_freq.rename_axis('terms').reset_index(name='frequency')

### Zipf law plot 1
Let's see the Brown frequency plot: it seems to follow Zipf's Law

In [None]:
import matplotlib.pyplot as plt
def print_zipfs_law(df_word_freq, label):
    """Add one rank-versus-frequency curve, on log-log axes, to the current figure.

    ``df_word_freq`` must have a "frequency" column; a "rank" column is written
    onto it as a side effect. ``label`` is the legend entry for the curve.
    """
    # Highest frequency gets the lowest rank.
    df_word_freq['rank'] = df_word_freq['frequency'].rank(ascending=False)
    # Several words can share a rank, so collapse each rank to a single point.
    per_rank = df_word_freq.groupby('rank')['frequency'].mean().reset_index()

    plt.plot(per_rank['rank'], per_rank['frequency'], label=label)

    # Both axes are logarithmic.
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('Rank of Word')
    plt.ylabel('Frequency')
    plt.title('Word Frequency Distribution')
    plt.legend()
    plt.grid(True)
# Draw all five curves onto the same figure, in this order.
for freq_table, curve_label in (
    (df_brown_word_freq, 'Brown'),
    (df_chinese_word_freq, 'China - Word'),
    (df_english_word_freq, 'USA - Word'),
    (df_chinese_std_name_freq, 'China - Name'),
    (df_english_std_name_freq, 'USA - Name'),
):
    print_zipfs_law(freq_table, label=curve_label)


todo: Try to the log of 2?

### Rank bags and length by letter

We know that the length of a word is related to its rank: typically, the longer a word is, the less likely it is to occur. Let's check whether this also holds for names in code, for each language group. 

In [None]:
def plot_rank_bag_to_letter_len(df_word_freq):
    """Draw a box plot of term length (in characters) for each frequency-rank bag.

    ``df_word_freq`` must have the columns "terms" (the strings to measure;
    they may be single words or whole names) and "frequency". The columns
    "rank", "lengthByLetter" and "rankBag" are written onto the passed frame.
    """
    df_word_freq['rank'] = df_word_freq['frequency'].rank(ascending=False)
    df_word_freq['lengthByLetter'] = df_word_freq['terms'].apply(len)
    # map rank to bins (left-closed intervals). The last edge is infinite so
    # the '10k+' bag is really open-ended: with a finite cap, every rank at or
    # beyond the cap gets no bag and silently disappears from the plot.
    bins = [0, 10, 100, 1000, 10000, float('inf')]
    labels = ['1-10', '10-100', '100-1k', '1k-10k', '10k+']
    df_word_freq['rankBag'] = pd.cut(df_word_freq['rank'], bins=bins, labels=labels, right=False)
    # plot the boxplot for each bag: we don't show fliers
    df_word_freq.boxplot(column='lengthByLetter', by='rankBag', grid=False, patch_artist=True, showfliers=False)
    # Adding titles and labels
    plt.title('Box Plot of Name Length by Rank Bags')
    # Clear the automatic "grouped by" super-title that boxplot(by=...) adds.
    plt.suptitle('')
    plt.xlabel('Rank Bags')
    plt.ylabel('Name Length')
    plt.xticks(rotation=45)
    plt.show()
# Box plot of token length per rank bag for the Brown corpus frequencies.
plot_rank_bag_to_letter_len(df_brown_word_freq)

In [None]:
# Same plot for the term frequencies of the 'China' subset. When the notebook is
# run top to bottom, this call uses the box-plot definition of the function.
plot_rank_bag_to_letter_len(df_chinese_word_freq)


In [None]:
# Same plot for the term frequencies of the 'USA' subset (box-plot version when run top to bottom).
plot_rank_bag_to_letter_len(df_english_word_freq)

In [None]:
# Same plot for whole standardized names of the 'China' subset; here the measured
# length includes the underscores that join the terms.
plot_rank_bag_to_letter_len(df_chinese_std_name_freq)

In [None]:
# Same plot for whole standardized names of the 'USA' subset; here the measured
# length includes the underscores that join the terms.
plot_rank_bag_to_letter_len(df_english_std_name_freq)

Let's see the percentage: In Nitsan's paper, it is the rank of "whole name" vs. the number of words in a name. I don't understand this approach, so let's ask Dror this weekend. 
For now let's do percentage of length by letter according to rank. 

In [None]:
def plot_rank_bag_to_letter_len(df_word_freq):
    """Draw, per frequency-rank bag, the share of terms at each length (in characters).

    ``df_word_freq`` must have the columns "terms" and "frequency". The columns
    "rank", "lengthByLetter" and "rankBag" are written onto the passed frame.

    Note: this function has the same name as the box-plot function defined
    earlier in this notebook, so from this cell on the name refers to this
    stacked-share version.
    """
    df_word_freq['rank'] = df_word_freq['frequency'].rank(ascending=False)
    df_word_freq['lengthByLetter'] = df_word_freq['terms'].apply(len)
    # map rank to bins (left-closed intervals). The last edge is infinite so
    # the '10k+' bag is really open-ended: with a finite cap, every rank at or
    # beyond the cap gets no bag and silently disappears from the plot.
    bins = [0, 10, 100, 1000, 10000, float('inf')]
    labels = ['1-10', '10-100', '100-1k', '1k-10k', '10k+']
    df_word_freq['rankBag'] = pd.cut(df_word_freq['rank'], bins=bins, labels=labels, right=False)
    # calculate for each rankBag, what is the number of each length number
    pivot_df = df_word_freq.pivot_table(index='rankBag', columns='lengthByLetter', aggfunc='size', fill_value=0)
    # turn the counts into per-bag shares (each row is divided by its own total)
    pivot_df = pivot_df.div(pivot_df.sum(axis=1), axis=0)
    # one stacked bar per rank bag, one segment per length
    pivot_df.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='viridis')
    # Adding titles and labels
    plt.title('Percentage of Name Length (by letter) by Rank Bags')
    plt.xlabel('Rank Bags')
    # The y axis shows the share of terms in the bag, not a length.
    plt.ylabel('Share of Names')
    plt.xticks(rotation=45)
    plt.legend(title='Length', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()
    
# Per-bag share of each token length for the Brown corpus frequencies.
plot_rank_bag_to_letter_len(df_brown_word_freq)

In [None]:
# Same plot for the term frequencies of the 'China' subset. When the notebook is
# run top to bottom, this call uses the stacked-share definition of the function.
plot_rank_bag_to_letter_len(df_chinese_word_freq)


In [None]:
# Same plot for the term frequencies of the 'USA' subset (stacked-share version when run top to bottom).
plot_rank_bag_to_letter_len(df_english_word_freq)


In [None]:
# Same plot for whole standardized names of the 'China' subset; here the measured
# length includes the underscores that join the terms.
plot_rank_bag_to_letter_len(df_chinese_std_name_freq)


In [None]:
# Same plot for whole standardized names of the 'USA' subset; here the measured
# length includes the underscores that join the terms.
plot_rank_bag_to_letter_len(df_english_std_name_freq)


### Rank bags and length by word

In [None]:
def plot_rank_bag_to_word_len(df_word_freq):
    """Draw, per frequency-rank bag, the share of names at each length counted in words.

    ``df_word_freq`` must have the columns "terms" and "frequency", where each
    "terms" value is a standardized name: words separated by single
    underscores. The columns "rank", "lengthByWord" and "rankBag" are written
    onto the passed frame.
    """
    df_word_freq['rank'] = df_word_freq['frequency'].rank(ascending=False)
    # words are separated by one underscore each, so words = underscores + 1
    df_word_freq['lengthByWord'] = df_word_freq['terms'].apply(lambda s: s.count('_') + 1)
    # map rank to bins (left-closed intervals). The last edge is infinite so
    # the '10k+' bag is really open-ended: with a finite cap, every rank at or
    # beyond the cap gets no bag and silently disappears from the plot.
    bins = [0, 10, 100, 1000, 10000, float('inf')]
    labels = ['1-10', '10-100', '100-1k', '1k-10k', '10k+']
    df_word_freq['rankBag'] = pd.cut(df_word_freq['rank'], bins=bins, labels=labels, right=False)
    # calculate for each rankBag, what is the number of each length number
    pivot_df = df_word_freq.pivot_table(index='rankBag', columns='lengthByWord', aggfunc='size', fill_value=0)
    # turn the counts into per-bag shares (each row is divided by its own total)
    pivot_df = pivot_df.div(pivot_df.sum(axis=1), axis=0)
    # one stacked bar per rank bag, one segment per word count
    pivot_df.plot(kind='bar', stacked=True, figsize=(10, 6))
    # Adding titles and labels; this plot measures length in words, not letters
    plt.title('Percentage of Name Length (by word) by Rank Bags')
    plt.xlabel('Rank Bags')
    # The y axis shows the share of names in the bag, not a length.
    plt.ylabel('Share of Names')
    plt.xticks(rotation=45)
    plt.legend(title='Length', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

In [None]:
# Per-bag share of each word count for standardized names of the 'China' subset.
plot_rank_bag_to_word_len(df_chinese_std_name_freq)

In [None]:
# Per-bag share of each word count for standardized names of the 'USA' subset.
plot_rank_bag_to_word_len(df_english_std_name_freq)

## Vocabulary

### Number of Real Words

Let's check if the words are real words

In [None]:
from nltk.corpus import words

# Lower-case every dictionary entry. The lookup in this notebook lower-cases
# the term before testing membership, so any capitalised dictionary entry
# could otherwise never match.
english_dictionary = {word.lower() for word in words.words()}


In [None]:
def lookup_terms(term):
    """Return True when the lower-cased ``term`` is a member of ``english_dictionary``."""
    normalized = term.lower()
    return normalized in english_dictionary

def percentage_of_real_word(df_word_frequency):
    """Return the percentage of rows whose "terms" value passes ``lookup_terms``.

    ``df_word_frequency`` must have a "terms" column. A boolean "real_word"
    column is written onto it as a side effect. Every row counts once; the
    result is not weighted by any other column.
    """
    is_real = df_word_frequency['terms'].apply(lookup_terms)
    df_word_frequency['real_word'] = is_real
    return df_word_frequency['real_word'].mean() * 100


In [None]:
# Print the dictionary-word percentage for each frequency table, heading first.
for caption, freq_table in (
    ("Real Word Percentage in Chinese Projects:", df_chinese_word_freq),
    ("Real Word Percentage in English Projects:", df_english_word_freq),
    ("Real Word Percentage in Brown:", df_brown_word_freq),
):
    print(caption)
    print(percentage_of_real_word(freq_table))


### Size of Vocabulary

In [None]:
# Vocabulary size = number of distinct terms among all term occurrences of a subset.
for caption, term_occurrences in (
    ("The size of vocabulary of Chinese-speaking projects is", flattened_words_chinese),
    ("The size of vocabulary of English-speaking projects is", flattened_words_english),
):
    print(caption)
    print(term_occurrences.nunique())
