# Generating suggestions for writing source code in C# language based on NLP.


## N-Gram approach

#### This notebook was created and adapted to generate source-code suggestions, using ideas and code from the notebook by "Maheshkar, Saurav" as a reference, available at: 
https://www.kaggle.com/sauravmaheshkar/auto-completion-using-n-gram-models

### Import libraries

In [None]:
import os
import ntpath   
from chardet import detect
import nltk
import re
import h5py
import numpy as np
from toolz import unique
import pickle
import pandas as pd
import csv

# Fetch the 'punkt' resource through NLTK's downloader. Presumably this is the
# tokenizer data that nltk.word_tokenize (used in preprocess_code_to_tokens)
# relies on -- TODO confirm the resource name for the installed NLTK version.
nltk.download('punkt')

### Generic functions

In [2]:
def export_list_to_data_file(data, file_name):
    """
    Description: Function to serialize an object with pickle and write it to a file.
    :param data: Object to store (any picklable value),
    :param file_name: Path of the file to create or overwrite.

    :return: Void.
    """

    with open(file_name, mode='wb') as output_file:
        pickle.dump(data, output_file)

In [3]:
def load_from_data_file(file_name):
    """
    Description: Function to read back an object previously stored with pickle.
    Warning: unpickling can execute arbitrary code, so only load files written
    by this notebook or by another trusted source.
    :param file_name: Path of the pickle file to read.

    :return: Type(list): Object stored in the file (this notebook stores lists and dictionaries).
    """

    with open(file_name, mode='rb') as input_file:
        return pickle.load(input_file)

In [4]:
def print_info(title, message = None, new_line = False):
    """
    Description: Function to print a title framed by two separator lines, optionally followed by a message.
    :param title: Text printed between the separator lines,
    :param message: Optional text printed after the frame (skipped when empty or None),
    :param new_line: When True, a line break is printed before the frame.

    :return: Void.
    """

    separator = "####################################"

    if new_line:
        print('\n')

    for line in (separator, title, separator):
        print(line)

    if message:
        print("%s\n" % (message))

In [5]:
def get_sequence_of_numbers_from_string(str):
    """
    Description: Function to collect every run of consecutive digits found in a string.
    :param str: String to scan for digit runs.

    :return: Type(list): Digit runs (as strings) in order of appearance; empty when there are none.
    """

    return [match.group(0) for match in re.finditer(r'[0-9]+', str)]

In [6]:
def replace_sequence_of_numbers_for_mask(str_to_replace, 
                                         array_sequence_numbers_to_search, 
                                         mask_to_replace):
    """
    Description: Function to replace sequence of numbers for specific mask.
    Each sequence is replaced once, at its first remaining occurrence, in the
    order given. Both the sequence and the mask are treated as literal text.
    :param str_to_replace: String to replace sequence of numbers,
    :param array_sequence_numbers_to_search: Sequence numbers to search for,
    :param mask_to_replace: Mask to replace each sequence.

    :return: Type(String): String with sequence of numbers replaced by mask.
    """

    for number_sequence in array_sequence_numbers_to_search:
        # Literal replacement: a regex substitution would interpret backslashes
        # in the mask as template escapes and metacharacters in the sequence
        # as pattern syntax.
        str_to_replace = str_to_replace.replace(str(number_sequence), mask_to_replace, 1)

    return str_to_replace

In [7]:
def get_encoding_type(file_path):
    """
    Description: Function to guess the text encoding of a file from its raw bytes.
    The whole file is read into memory and handed to chardet's detect().
    :param file_path: Path of the file to inspect.

    :return: Type(String): Value of the 'encoding' entry in the detection result.
    """

    with open(file_path, mode='rb') as binary_file:
        detection = detect(binary_file.read())

    return detection['encoding']

In [8]:
def change_enconding(source_file, enconding):
    """
    Description: Function to re-encode a text file in place.
    The file is decoded with the encoding reported by get_encoding_type and
    written to a temporary sibling file ("123" + original file name) using the
    requested encoding; the temporary file then replaces the original.
    Decode/encode failures are reported on screen, the temporary file is
    removed and the original file is left untouched.
    :param source_file: File path to change encoding,
    :param enconding: Name of the encoding to write source_file with.

    :return: Void.
    """

    from_codec = get_encoding_type(source_file)

    # Prefix only the final path component. Replacing the base name across the
    # whole path would also rewrite a directory that contains the same text.
    base_name = ntpath.basename(source_file)
    target_file = source_file[:len(source_file) - len(base_name)] + "123%s" % (base_name)

    try:
        with open(source_file, 'r', encoding=from_codec) as f, \
                open(target_file, 'w', encoding=enconding) as e:
            e.write(f.read())
    except (UnicodeDecodeError, UnicodeEncodeError) as error:
        # Do not leave a partially written temporary file behind.
        if os.path.exists(target_file):
            os.remove(target_file)

        if isinstance(error, UnicodeDecodeError):
            print("Decode error for file: '%s'" % (source_file))
        else:
            print("Encode error for file: '%s'" % (source_file))
        return

    # Single-step replacement; also overwrites an existing destination on Windows.
    os.replace(target_file, source_file)

In [9]:
def flatten_list(list_to_flatten):
    """
    Description: Function to remove one level of nesting from a list of lists.
    :param list_to_flatten: List whose items are themselves iterable.

    :return: Type(List): Items of every child list, concatenated in order.
    """

    flat_list = []

    for child_list in list_to_flatten:
        flat_list.extend(child_list)

    return flat_list

In [10]:
def remove_duplicate_items_from_list(list_to_remove_duplicates):
    """
    Description: Function to remove duplicate items from given list of lists.
    The first occurrence of each item is kept and the original order is preserved.
    :param list_to_remove_duplicates: List of lists (inner items must be hashable).

    :return: Type(List): List without duplicates.
    """

    # dict keys keep insertion order, which gives an order-preserving de-dup
    # without any helper outside the standard library.
    unique_items = dict.fromkeys(map(tuple, list_to_remove_duplicates))

    return [list(item) for item in unique_items]

In [11]:
def load_csv_to_test_previous_tokens_list(csv_file_path):
    """
    Description: Function to load data from csv file to previous tokens list.
    The file is read without a header row using ";" as delimiter. For every
    row, the string cells are joined with a single space; non-string cells
    (such as the NaN pandas uses for empty cells) are skipped.
    :param csv_file_path: Csv file path with data for previous tokens.

    :return: Type(List): List with previous tokens, one string per csv row.
    """

    dataframe_tokens_test = pd.read_csv(csv_file_path, delimiter=";", header=None)

    return [
        " ".join(cell for cell in row if isinstance(cell, str))
        for row in dataframe_tokens_test.itertuples(index=False, name=None)
    ]

In [12]:
def save_suggestions_to_csv_file(csv_file_path, all_suggestions):
    """
    Description: Function to write suggestions to a ";"-delimited csv file.
    One row is written per (previous tokens, suggestion) pair.
    :param csv_file_path: Csv file path to create or overwrite,
    :param all_suggestions: List of pairs (previous tokens, list of suggestions).

    :return: Void.
    """

    rows = [
        [entry[0], suggested]
        for entry in all_suggestions
        for suggested in entry[1]
    ]

    with open(csv_file_path, 'w', newline='') as csv_file:
        csv.writer(csv_file, delimiter=";").writerows(rows)

### C# repository functions

In [13]:
def get_all_c_sharp_complete_file_names_for_each_class(root_directory):
    """
    Description: Function to walk a directory tree and collect the path of every file ending in ".cs".
    :param root_directory: Directory where the search starts (sub-directories included).

    :return: Type(List): Paths (directory joined with file name) of all ".cs" files found.
    """

    C_SHARP_CLASS_FILE_EXTENSION = ".cs"

    return [
        os.path.join(current_directory, entry)
        for current_directory, _, entries in os.walk(root_directory)
        for entry in entries
        if entry.endswith(C_SHARP_CLASS_FILE_EXTENSION)
    ]

In [14]:
def get_content_for_each_file(complete_name_of_files):
    """
    Description: Function to get content of each source code file.
    Files are read as UTF-8 and bytes that cannot be decoded are dropped
    (errors='ignore'). If reading still raises a Unicode error, the file is
    re-encoded to UTF-8 on disk through change_enconding; its content is not
    added to the corpus in this pass.
    :param complete_name_of_files: List with name of each file downloaded from repository.

    :return: Type(List): Corpus with all C# source code, one string per file read.
    """

    c_sharp_code_corpus = []

    for file_name in complete_name_of_files:
        try:
            with open(file_name, "r", encoding="utf8", errors='ignore') as physical_file:
                c_sharp_code_corpus.append(physical_file.read())
        except UnicodeError:
            # change_enconding requires the target encoding; use the same one
            # this function reads with.
            change_enconding(file_name, "utf8")

    return c_sharp_code_corpus

### Pre-processing functions

In [15]:
def preprocess_code_to_tokens(source_code):
    """
    Description: Function to split source code into lines and tokenize each non-empty line.
    Lines are stripped of surrounding whitespace, empty lines are discarded,
    and every run of digits inside a token is replaced by "|mask_number|".
    :param source_code: Source code text to pre-process.

    :return: Type(List): One list of tokens per non-empty line.
    """

    MASK_NUMBERS = "|mask_number|"

    stripped_lines = (raw_line.strip() for raw_line in source_code.split('\n'))

    tokens = []

    for piece_of_code in stripped_lines:
        if len(piece_of_code) == 0:
            continue

        tokens.append([
            replace_sequence_of_numbers_for_mask(
                word,
                get_sequence_of_numbers_from_string(word),
                MASK_NUMBERS)
            for word in nltk.word_tokenize(piece_of_code)
        ])

    return tokens

In [16]:
def tokenize_all_files(c_sharp_code_corpus, first_x_corpus = 0):
    """
    Description: Function to run preprocess_code_to_tokens over the corpus.
    :param c_sharp_code_corpus: Complete list of C# corpus (one source text per file).
    :param first_x_corpus: When greater than 0, only the first X files are tokenized. Default: 0 - Tokenize all files.

    :return: Type(List): One entry per file, each holding that file's tokenized lines.
    """

    if first_x_corpus > 0:
        selected_corpus = c_sharp_code_corpus[:first_x_corpus]
    else:
        selected_corpus = c_sharp_code_corpus[:]

    return [preprocess_code_to_tokens(source_text) for source_text in selected_corpus]

### N-gram functions.

In [17]:
def count_the_words_for_code(code_tokens):
    """
    Description: Function to count how many times each token occurs.
    :param code_tokens: Tokens nested three levels deep (outer list -> inner lists -> tokens),
    as produced by tokenize_all_files.

    :return: Type(Dictionary): Dictionary with words count { Key - "Word", Value = Count }.
    """

    code_counts = {}

    for file_tokens in code_tokens:
        for line_tokens in file_tokens:
            for word in line_tokens:
                code_counts[word] = code_counts.get(word, 0) + 1

    return code_counts

In [18]:
def handling_out_of_code_vocabulary(tokens, count_threshold):
    """
    Description: Function to build the closed vocabulary: the words (pieces of code)
    that occur at least count_threshold times in the tokens.
    :param tokens: List of tokens (same nesting accepted by count_the_words_for_code).
    :param count_threshold: Minimum number of occurrences for a word to enter the vocabulary.

    :return: Type(List): Closed vocabulary.
    """

    words_count = count_the_words_for_code(tokens)

    return [word for word, count in words_count.items() if count >= count_threshold]

In [19]:
def unknown_tokenize(tokens, vocabulary, unknown_token = "<unk>"):
    """
    Description: Function to replace every word missing from the vocabulary by the unknown token.
    Each outer item of tokens (a list of token lists) is also flattened into a single list of words.
    :param tokens: List of tokens,
    :param vocabulary: Vocabulary of code,
    :param unknown_token: Replacement for out-of-vocabulary words. Default: <unk>

    :return: Type(List): One flat list of words per outer item of tokens.
    """

    known_words = set(vocabulary)

    return [
        [word if word in known_words else unknown_token
         for line_tokens in sentence
         for word in line_tokens]
        for sentence in tokens
    ]

In [20]:
def processing_vocabulary_and_unknown(tokens, count_threshold):
    """
    Description: Function to build the closed vocabulary and rewrite the tokens with unknown markers.
    Works on a shallow copy of tokens.
    :param tokens: List of tokens,
    :param count_threshold: Minimum number of occurrences for a word to be kept instead of being marked unknown.

    :return: Type(List, List): Tokens list and Vocabulary list.
    """

    tokens_aux = tokens[:]

    vocabulary = handling_out_of_code_vocabulary(tokens_aux, count_threshold)

    return unknown_tokenize(tokens_aux, vocabulary), vocabulary

In [21]:
def count_n_grams(tokens, ngrams_number, start_token_delimiter = "<s>", end_token_delimiter = "<e>"):
    """
    Description: Function to count n-grams.
    Every sentence is padded with ngrams_number start delimiters and one end
    delimiter before a window of ngrams_number tokens is slid over it.
    :param tokens: List of tokens (one list of words per sentence),
    :param ngrams_number: Number of n-grams (window size),
    :param start_token_delimiter: Start token delimiter,
    :param end_token_delimiter: End token delimiter.

    :return: Type(Dictionary): Dictionary with n-grams { Key - tuple of ngrams_number words, Value = Count }.
    """

    n_grams = {}

    for sentence in tokens:
        padded_sentence = tuple(
            [start_token_delimiter] * ngrams_number + list(sentence) + [end_token_delimiter]
        )

        # Stop at the last full window; going further would store truncated
        # tuples (shorter than ngrams_number) among the n-grams.
        for i in range(len(padded_sentence) - ngrams_number + 1):
            n_gram = padded_sentence[i:i + ngrams_number]
            n_grams[n_gram] = n_grams.get(n_gram, 0) + 1

    return n_grams

In [22]:
def prob_for_n_gram(word, 
                    previous_n_gram, 
                    n_gram_counts, 
                    nplus1_gram_counts, 
                    vocabulary_size, 
                    k = 1.0):
    """
    Description: Function to estimate the probability of a word following an n-gram, with add-k smoothing:
    (count(previous_n_gram + word) + k) / (count(previous_n_gram) + k * vocabulary_size).
    :param word: Candidate next word,
    :param previous_n_gram: Sequence of words that precedes the candidate,
    :param n_gram_counts: Counts of n-grams with the same size as previous_n_gram,
    :param nplus1_gram_counts: Counts of n-grams one word longer than previous_n_gram,
    :param vocabulary_size: Vocabulary size,
    :param k: Smoothing constant.

    :return: Type(Float): Probability of a single word.
    """

    context = tuple(previous_n_gram)

    # Unseen n-grams count as zero.
    numerator = nplus1_gram_counts.get(context + (word,), 0) + k
    denominator = n_gram_counts.get(context, 0) + k * vocabulary_size

    return numerator / denominator

In [23]:
def probs(previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary, k=1.0):

    """
    Description: Function to compute the smoothed probability of every candidate next word.
    The candidates are the vocabulary words plus the "<e>" and "<unk>" markers.
    :param previous_n_gram: Previous n-gram(token) to calculate probability,
    :param n_gram_counts: N-gram list with same size of previous_n_gram,
    :param nplus1_gram_counts: N-grams list with one word more than n_gram_counts,
    :param vocabulary: Vocabulary (list of words),
    :param k: k constant to calculate probability with smoothing.

    :return: Type(Dictionary): {Word : Probability}.
    """
    context = tuple(previous_n_gram)

    candidates = vocabulary + ["<e>", "<unk>"]

    candidates_size = len(candidates)

    return {
        word: prob_for_n_gram(word,
                              context,
                              n_gram_counts,
                              nplus1_gram_counts,
                              candidates_size,
                              k=k)
        for word in candidates
    }

In [24]:
def auto_complete(previous_tokens, n_gram_counts, nplus1_gram_counts, vocabulary, k=1.0, start_with=None):
    """
    Description: Function to complete previous words.
    The n-gram size is read from one key of n_gram_counts and the last n
    previous tokens are used as context.
    :param previous_tokens: Previous tokens to calculate probability,
    :param n_gram_counts: N-gram list with same size of previous_n_gram,
    :param nplus1_gram_counts: N-grams list with one word more than n_gram_counts,
    :param vocabulary: Vocabulary,
    :param k: k constant to calculate probability with smoothing,
    :param start_with: When given, only words starting with this text are suggested.

    :return: Type(List): [(Word, Probability)] Up to 5 next tokens with the biggest probability, best first.
    :raises IndexError: When n_gram_counts is empty.
    """
    # Read a single key instead of copying every key of a potentially huge dictionary.
    try:
        n = len(next(iter(n_gram_counts)))
    except StopIteration:
        raise IndexError("n_gram_counts is empty; cannot infer the n-gram size") from None

    previous_n_gram = previous_tokens[-n:]

    probabilities = probs(previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary, k=k)

    if start_with:
        probabilities = {word: probability
                         for word, probability in probabilities.items()
                         if word.startswith(start_with)}

    top_5_values = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)[:5]

    return top_5_values


In [25]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    """
    Description: Function to get suggestions.
    With c previous tokens, the models at positions c - 1 and c of
    n_gram_counts_list are used, so position i is expected to hold the counts
    of (i + 1)-grams. No suggestion is produced when previous_tokens is empty
    or when the list has no model at position c.
    :param previous_tokens: Previous tokens (list of words),
    :param n_gram_counts_list: List of n-gram count dictionaries,
    :param vocabulary: Vocabulary,
    :param k: k constant to calculate probability with smoothing,
    :param start_with: Filter to start with token.

    :return: Type(List): Empty list, or a list with one item: [(Word, Probability)] for up to 5 next tokens.
    """
    suggestions = []

    count_words_previous_tokens = len(previous_tokens)

    # An empty context would index positions -1 and 0, pairing unrelated models.
    if count_words_previous_tokens < 1:
        return suggestions

    if (count_words_previous_tokens + 1) <= len(n_gram_counts_list):

        n_gram_counts = n_gram_counts_list[count_words_previous_tokens - 1]

        nplus1_gram_counts = n_gram_counts_list[count_words_previous_tokens]

        suggestion = auto_complete(previous_tokens, 
                                   n_gram_counts,
                                   nplus1_gram_counts, 
                                   vocabulary,
                                   k=k, 
                                   start_with=start_with)

        suggestions.append(suggestion)

    return suggestions

In [26]:
def get_suggestions_recursively(previous_tokens, 
                                n_gram_counts_list, 
                                vocabulary, 
                                suggestions_for_all_grams, 
                                quantity_ngrams_to_generate, 
                                k=1.0):
    """
    Description: Function to execute all possible suggestions in n-gram level based on first previous token informed.
    Observation: This function is recursive. Each suggested word is appended to a copy of the previous tokens
    and the function calls itself with that longer context, while the context has at most
    'quantity_ngrams_to_generate' + 1 tokens and suggestions are still being produced.
    :param previous_tokens: Previous tokens (list of words),
    :param n_gram_counts_list: List of n-gram count dictionaries,
    :param vocabulary: Vocabulary,
    :param suggestions_for_all_grams: Output list; one item [context size + 1, previous tokens, suggestions]
    is appended per call,
    :param quantity_ngrams_to_generate: Number of n-grams to generate,
    :param k: k constant to calculate probability with smoothing.

    :return: Void.
    """

    suggestions_per_ngram = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=k)

    suggestions_for_all_grams.append([len(previous_tokens) + 1, previous_tokens, suggestions_per_ngram])

    if len(suggestions_per_ngram) == 0:
        return

    for suggestion in suggestions_per_ngram[0]:

        previous_token_copied = previous_tokens + [suggestion[0]]

        if len(previous_token_copied) <= (quantity_ngrams_to_generate + 1):
            # The limit is passed on unchanged: raising it at every level would
            # keep this check permanently true.
            get_suggestions_recursively(previous_token_copied, 
                                        n_gram_counts_list, 
                                        vocabulary, 
                                        suggestions_for_all_grams,
                                        quantity_ngrams_to_generate,
                                        k=k)

In [27]:
def get_and_save_suggestions_from_csv_file(file_path_to_read_previous_tokens, file_path_to_save_suggestions):
    """
    Description: Function to read previous tokens from a csv file, get suggestions for each of them and
    save the suggested words into a result csv file.
    Observation: the models are taken from the notebook globals 'n_gram_counts_list' and 'vocabulary',
    which must exist before this function is called.
    :param file_path_to_read_previous_tokens: Csv file path with previous tokens to get suggestions from,
    :param file_path_to_save_suggestions: Csv file path to save suggestions.

    :return: Void.
    """

    all_suggestions = []

    for previous_tokens in load_csv_to_test_previous_tokens_list(file_path_to_read_previous_tokens):
        suggestions = get_suggestions(previous_tokens.split(' '), n_gram_counts_list, vocabulary)

        # Contexts without any suggestion are left out of the result file.
        if len(suggestions) == 0:
            continue

        suggested_words = [candidate[0] for candidate in suggestions[0]]

        all_suggestions.append((previous_tokens, suggested_words))

    save_suggestions_to_csv_file(file_path_to_save_suggestions, all_suggestions)

In [28]:
def display_5_combinations_top_most(n_grams_list):
    """
    Description: Function to display 5 words combinations that appeared the most in each n-gram for param 'n_grams_list'.
    For every dictionary in the list a title is printed and a dataframe with the columns
    'Words' (the n-gram words concatenated without separator) and 'Count' is displayed.
    :param n_grams_list: n-gram(s) list to show (one count dictionary per n-gram size).

    :return: Void.
    """

    # Iterate the argument itself rather than a notebook global.
    for ngram_size, n_gram_counts in enumerate(n_grams_list, start=1):
        n_grams_5_appeared_most = sorted(n_gram_counts.items(), key=lambda x: x[1], reverse=True)[:5]
        print_info("%s-gram(s):" % (ngram_size))

        data_for_dataframe = [[''.join(words), count_words]
                              for words, count_words in n_grams_5_appeared_most]

        df = pd.DataFrame(data_for_dataframe, columns = ['Words', 'Count'])
        display(df)
        print(" ")

In [29]:
def format_words_and_probability(df):
    """
    Description: Function to turn the rows of a suggestions dataframe into flat tuples ready to be displayed.
    Reads the columns 'previous_tokens' (list of words) and 'suggestions' (empty list, or a list whose
    first item holds (word, probability) pairs).
    :param df: Pandas dataframe to format.

    :return: Type(List) Formatted list with this configuration [(Previous Tokens, Suggestion, Probability)].
    """

    formated_results = []

    for _, row in df.iterrows():

        previous_tokens = ' '.join(row['previous_tokens'])
        row_suggestions = row['suggestions']

        if len(row_suggestions) > 0:
            formated_results.extend(
                (previous_tokens, previous_tokens + " " + suggestion[0], suggestion[1])
                for suggestion in row_suggestions[0]
            )

    return formated_results

### Main flow

#### Define constants

In [72]:
# Input csv files with the previous tokens used in the tests, one file per
# context size (1 to 4 words). Relative paths.
PATH_TO_TEST_FILE_1_WORD = 'previous_tokens_for_test_1_word.csv'
PATH_TO_TEST_FILE_2_WORD = 'previous_tokens_for_test_2_word.csv'
PATH_TO_TEST_FILE_3_WORD = 'previous_tokens_for_test_3_word.csv'
PATH_TO_TEST_FILE_4_WORD = 'previous_tokens_for_test_4_word.csv'

# Output csv files that receive the suggestions generated for each input file above.
PATH_TO_SAVE_SUGGESTIONS_1_WORD = 'suggestions_1_word.csv'
PATH_TO_SAVE_SUGGESTIONS_2_WORD = 'suggestions_2_word.csv'
PATH_TO_SAVE_SUGGESTIONS_3_WORD = 'suggestions_3_word.csv'
PATH_TO_SAVE_SUGGESTIONS_4_WORD = 'suggestions_4_word.csv'

#### Read and filter C# class files (.cs) from the root repository downloaded from: https://github.com/dotnet

In [26]:
# Define constants.
# Raw string: "\D" is not a valid escape sequence, so a plain literal triggers
# an invalid-escape warning on recent Python versions. The value is unchanged.
ROOT_DIRECTORY = r"D:\DsTCC"

# Get all file names.
complete_file_names = get_all_c_sharp_complete_file_names_for_each_class(ROOT_DIRECTORY)

# Print first 10 files.
print_info("First 10 files:")

for file_name in complete_file_names[:10]:
    print(ntpath.basename(file_name))

# Print total number of files.
print_info("Number of files for N-grams:", new_line=True)
print("%s files." % (len(complete_file_names)))

####################################
First 10 files:
####################################
AssemblyResolution.cs
AssemblyResolver.cs
BuildTask.cs
BuildTask.Desktop.cs
DisposeAction.cs
EnumerableExtensions.cs
EnumExtensions.cs
ArgumentEscaper.cs
Command.cs
CommandFactory.cs


####################################
Number of files for N-grams:
####################################
201706 files.


#### Get source code of each file.

In [29]:
# Read every collected file into memory (one string per file) and show the
# first one as a sanity check.
c_sharp_code_corpus = get_content_for_each_file(complete_file_names)
print_info("Source code of first C# file class:", c_sharp_code_corpus[0])

####################################
Source code of first C# file class:
####################################
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

#if NET472

using System;
using System.Collections.Generic;
using System.IO;
using System.Reflection;
using Microsoft.Build.Framework;
using Microsoft.Build.Utilities;

namespace Microsoft.DotNet
{
    internal static class AssemblyResolution
    {
        internal static TaskLoggingHelper Log;

        public static void Initialize()
        {
            AppDomain.CurrentDomain.AssemblyResolve += AssemblyResolve;
        }

        private static Assembly AssemblyResolve(object sender, ResolveEventArgs args)
        {
            var name = new AssemblyName(args.Name);

            if (!name.Name.Equals("System.Collections.Immutable", StringComparison.OrdinalIgnoreCase))
            {
                return null;
            }

            

#### Tokenize all files.

In [30]:
# first_x_corpus=0 means no truncation: every file of the corpus is tokenized.
tokens = tokenize_all_files(c_sharp_code_corpus, first_x_corpus=0)

#### Export tokens to backup file

In [31]:
# Persist the tokens (pickle format) as a backup of the tokenization step.
export_list_to_data_file(tokens, 'tokens_bkp.data')

#### Load tokens from backup file

In [26]:
# Restore the tokens from the backup file written by the export cell.
tokens = load_from_data_file('tokens_bkp.data')

In [27]:
# flatten_list removes the per-file level, so each printed item is the token
# list of one source line.
print_info("First 10 tokens:")
for token in flatten_list(tokens)[:10]:
    print(token) 

####################################
First 10 tokens:
####################################
['//', 'Licensed', 'to', 'the', '.NET', 'Foundation', 'under', 'one', 'or', 'more', 'agreements', '.']
['//', 'The', '.NET', 'Foundation', 'licenses', 'this', 'file', 'to', 'you', 'under', 'the', 'MIT', 'license', '.']
['#', 'if', 'NET|mask_number|']
['using', 'System', ';']
['using', 'System.Collections.Generic', ';']
['using', 'System.IO', ';']
['using', 'System.Reflection', ';']
['using', 'Microsoft.Build.Framework', ';']
['using', 'Microsoft.Build.Utilities', ';']
['namespace', 'Microsoft.DotNet']


#### Get tokens and vocabulary.

In [27]:
# Words seen fewer than min_freq times are left out of the vocabulary and
# replaced by "<unk>" in new_data_tokens.
min_freq = 3
new_data_tokens, vocabulary = processing_vocabulary_and_unknown(tokens, min_freq)

#### Get n-grams count.

In [37]:
# Largest n-gram size counted below; also passed as the limit to get_suggestions_recursively.
QUANTITY_NGRAMS_TO_GENERATE = 5

In [28]:
# Build one count dictionary per n-gram size, from 1 up to
# QUANTITY_NGRAMS_TO_GENERATE. Position i of the list holds the (i + 1)-gram
# counts, which is the layout get_suggestions indexes into.
n_gram_counts_list = []
for n in range(1, QUANTITY_NGRAMS_TO_GENERATE + 1):
    n_model_counts = count_n_grams(new_data_tokens, n)
    n_gram_counts_list.append(n_model_counts)

#### Display 5 words combinations that appeared the most in each n-gram

In [31]:
# Show, for each n-gram size, the 5 combinations with the highest counts.
display_5_combinations_top_most(n_gram_counts_list)

####################################
1-gram(s):
####################################


Unnamed: 0,Words,Count
0,),23203367
1,(,23202300
2,",",11792814
3,;,10033382
4,=,5635404


 
####################################
2-gram(s):
####################################


Unnamed: 0,Words,Count
0,((,9146342
1,);,5200301
2,;},2957248
3,)+,2927367
4,){,2854688


 
####################################
3-gram(s):
####################################


Unnamed: 0,Words,Count
0,(((,8644068
1,);},1520228
2,[|mask_number|],1325311
3,",|mask_number|x|mask_number|,",1293989
4,|mask_number|]),1059917


 
####################################
4-gram(s):
####################################


Unnamed: 0,Words,Count
0,((((,8554307
1,[|mask_number|]),1042245
2,"|mask_number|x|mask_number|,|mask_number|x|mas...",1020632
3,",|mask_number|x|mask_number|,|mask_number|x|ma...",1016741
4,--------,901374


 
####################################
5-gram(s):
####################################


Unnamed: 0,Words,Count
0,(((((,8489794
1,",|mask_number|x|mask_number|,|mask_number|x|ma...",1014999
2,----------,865768
3,*****,863235
4,"|mask_number|x|mask_number|,|mask_number|x|mas...",832296


 


#### Export ngrams x counts list, new data tokens and vocabulary to backup file

In [93]:
# Backup of the n-gram count dictionaries (pickle format).
export_list_to_data_file(n_gram_counts_list, 'n_gram_counts_list.data')

In [32]:
# Backup of the tokens after unknown-word replacement (pickle format).
export_list_to_data_file(new_data_tokens, 'new_data_tokens.data')

In [34]:
# Backup of the closed vocabulary (pickle format).
export_list_to_data_file(vocabulary, 'vocabulary.data')

#### Load ngrams x counts list, new data tokens and vocabulary from backup file

In [27]:
# Restore the n-gram count dictionaries from their backup file.
n_gram_counts_list = load_from_data_file('n_gram_counts_list.data')

In [None]:
# Restore the unknown-replaced tokens from their backup file.
new_data_tokens = load_from_data_file('new_data_tokens.data')

In [28]:
# Restore the closed vocabulary from its backup file.
vocabulary = load_from_data_file('vocabulary.data')

#### Get suggestions recursively.

In [38]:
# Output list filled in place by get_suggestions_recursively: one
# [ngram, previous_tokens, suggestions] item per context explored.
suggestions_for_all_grams = []

# First word to generate suggestion in 5 levels
previous_tokens = ["public"]

get_suggestions_recursively(previous_tokens, 
                            n_gram_counts_list, 
                            vocabulary, 
                            suggestions_for_all_grams, 
                            QUANTITY_NGRAMS_TO_GENERATE)
    
# One dataframe row per explored context; the column names are the ones read
# by format_words_and_probability and by the query in the formatting cell.
df = pd.DataFrame(suggestions_for_all_grams, columns=['ngram', 'previous_tokens', 'suggestions'])

display(df)

Unnamed: 0,ngram,previous_tokens,suggestions
0,2,[public],"[[(static, 0.13878612711693716), (void, 0.1022..."
1,3,"[public, static]","[[(void, 0.05360907345975815), (string, 0.0453..."
2,4,"[public, static, void]","[[(<unk>, 0.02085204659309168), (Main, 0.00996..."
3,5,"[public, static, void, <unk>]","[[((, 0.021744681053016246), (<, 0.00024488749..."
4,6,"[public, static, void, <unk>, (]",[]
...,...,...,...
776,6,"[public, int, <unk>, <, T]",[]
777,6,"[public, int, <unk>, <, ChannelType]",[]
778,6,"[public, int, <unk>, <, //]",[]
779,6,"[public, int, <unk>, <, Licensed]",[]


#### Format each n-gram level

##### N-grams suggested for the first token typed (public)

In [39]:
# Work on a copy so the dataframe produced by the previous cell stays intact.
df_formated = df.copy()

# For each n-gram level from 2 up to QUANTITY_NGRAMS_TO_GENERATE, show the 5
# suggestions with the highest probability.
for i in range(2, QUANTITY_NGRAMS_TO_GENERATE + 1):

    print_info("%s-gram(s):" % (i))
    
    # Rows whose context plus the suggested word form an i-gram.
    df_ngrams = df_formated.query("ngram==%s" % (i))
    
    formated_results = format_words_and_probability(df_ngrams)
    
    df_ngrams_formated = pd.DataFrame(formated_results, columns=['Previous_Tokens', 'Suggestion', 'Probability'])
    
    df_top_5 = df_ngrams_formated.sort_values(by='Probability',ascending=False).iloc[:5,:]
    # Render the probability column as a percentage with two decimals.
    df_top_5 = df_top_5.style.format({'Probability': "{:.2%}"})
    display(df_top_5)
    
    print(" ")

####################################
2-gram(s):
####################################


Unnamed: 0,Previous_Tokens,Suggestion,Probability
0,public,public static,13.88%
1,public,public void,10.22%
2,public,public class,5.02%
3,public,public override,4.91%
4,public,public int,4.00%


 
####################################
3-gram(s):
####################################


Unnamed: 0,Previous_Tokens,Suggestion,Probability
5,public void,public void,10.30%
20,public int,public int m|mask_number|,5.61%
0,public static,public static void,5.36%
1,public static,public static string,4.53%
10,public class,public class Class|mask_number|,4.42%


 
####################################
4-gram(s):
####################################


Unnamed: 0,Previous_Tokens,Suggestion,Probability
25,public void,public void (,12.10%
100,public int m|mask_number|,public int m|mask_number| =,5.98%
50,public class Class|mask_number|,public class Class|mask_number| {,4.90%
5,public static string,public static string Property,4.73%
0,public static void,public static void,2.09%


 
####################################
5-gram(s):
####################################


Unnamed: 0,Previous_Tokens,Suggestion,Probability
125,public void (,public void ( ),9.93%
500,public int m|mask_number| =,public int m|mask_number| = |mask_number|,5.98%
250,public class Class|mask_number| {,public class Class|mask_number| { public,4.82%
25,public static string Property,public static string Property =,4.80%
0,public static void,public static void (,2.17%


 


### Test with common tokens C#

#### 1 previous token

In [109]:
# Generate and save suggestions for the contexts listed in the 1-word test file.
get_and_save_suggestions_from_csv_file(PATH_TO_TEST_FILE_1_WORD, PATH_TO_SAVE_SUGGESTIONS_1_WORD)

#### 2 previous tokens

In [110]:
# Generate and save suggestions for the contexts listed in the 2-word test file.
get_and_save_suggestions_from_csv_file(PATH_TO_TEST_FILE_2_WORD, PATH_TO_SAVE_SUGGESTIONS_2_WORD)

#### 3 previous tokens

In [111]:
# Generate and save suggestions for the contexts listed in the 3-word test file.
get_and_save_suggestions_from_csv_file(PATH_TO_TEST_FILE_3_WORD, PATH_TO_SAVE_SUGGESTIONS_3_WORD)

#### 4 previous tokens

In [112]:
# Generate and save suggestions for the contexts listed in the 4-word test file.
get_and_save_suggestions_from_csv_file(PATH_TO_TEST_FILE_4_WORD, PATH_TO_SAVE_SUGGESTIONS_4_WORD)