# Generating suggestions for writing source code in C# language based on NLP.


## N-Gram approach

#### This notebook was adapted for the task of generating code suggestions, using ideas and code from the notebook by "Saurav Maheshkar", available at:
https://www.kaggle.com/sauravmaheshkar/auto-completion-using-n-gram-models

### Import libraries.

In [1]:
import os
import ntpath   
from chardet import detect
import nltk
import re
import h5py
import numpy as np
from toolz import unique

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Generic functions.

In [2]:
def print_info(title, message = None, new_line = False):
    """
    Description: Print a title framed by two separator bars, optionally followed by a message.
    :param title: Text shown between the two separator bars,
    :param message: Optional text printed after the banner; skipped when falsy,
    :param new_line: When true, a line break is printed before the banner.
    
    :return: void.
    """

    separator = "####################################"

    # print('\n') writes the newline character plus print's own line ending.
    if new_line:
        print('\n')

    for banner_line in (separator, title, separator):
        print(banner_line)

    if not message:
        return

    print("%s\n" % (message))

In [3]:
def get_sequence_of_numbers_from_string(str):
    """
    Description: Collect every run of consecutive decimal digits found in a string.
    :param str: Text to scan for digit runs.
    
    :return - Type(List): Digit runs as strings, in order of appearance.
    """

    # NOTE: the parameter name shadows the builtin `str`; it is kept because
    # it is part of the function's signature.
    digit_run_pattern = re.compile(r'[0-9]+')

    return digit_run_pattern.findall(str)

In [4]:
def replace_sequence_of_numbers_for_mask(str_to_replace,
                                         array_sequence_numbers_to_search,
                                         mask_to_replace):
    """
    Description: Function to replace sequence of numbers for specific mask.
    Each entry of the search list replaces only its first remaining occurrence,
    so the entries are consumed one by one, in the order given.
    :param str_to_replace: String to replace sequence of numbers,
    :param array_sequence_numbers_to_search: Sequence numbers to search for
    (each entry is converted with str() before searching),
    :param mask_to_replace: Mask to replace each sequence; inserted literally.
    
    :return - Type(String): String with sequence of numbers replaced by mask.
    """

    for number_sequence in array_sequence_numbers_to_search:
        # Literal replacement: neither the number nor the mask is interpreted
        # as a regular expression, so a mask containing backslashes is safe.
        str_to_replace = str_to_replace.replace(str(number_sequence), mask_to_replace, 1)

    return str_to_replace

In [5]:
def get_encoding_type(file):
    """
    Description: Guess the text encoding of a file from its raw bytes.
    :param file: Path of the file to inspect.
    
    :return - Type(String): Value stored under the 'encoding' key of the detection result.
    """

    # The whole file is read in binary mode so that no decoding happens here.
    with open(file, 'rb') as binary_handle:
        raw_bytes = binary_handle.read()

    detection_result = detect(raw_bytes)

    return detection_result['encoding']

In [6]:
def change_enconding(source_file, enconding = "utf-8"):
    """
    Description: Function to change enconding of file. The file is rewritten in place:
    its content is decoded with the detected encoding, written to a temporary sibling
    file using the requested encoding, and the temporary file then replaces the original.
    :param source_file: File to change enconding,
    :param enconding: Enconding to replace in source_file. Default: utf-8.
    
    :return: void. Decode/encode failures are reported on screen and leave the
    original file untouched.
    """

    from_codec = get_encoding_type(source_file)

    # Prefix only the trailing path component. ntpath.basename always returns
    # the tail of the path, so slicing it off cannot touch directory names that
    # happen to contain the same text.
    base_name = ntpath.basename(source_file)
    target_file = source_file[:len(source_file) - len(base_name)] + "123%s" % (base_name)

    try:
        with open(source_file,
                  'r',
                  encoding=from_codec) as f, open(target_file,
                                                  'w',
                                                  encoding=enconding) as e:
            e.write(f.read())

        # Swap the converted copy in with a single call instead of remove + rename.
        os.replace(target_file, source_file)

    except UnicodeDecodeError:
        print("Decode error for file: '%s'" % (source_file))
    except UnicodeEncodeError:
        print("Encode error for file: '%s'" % (source_file))
    finally:
        # After a successful replace the temporary file no longer exists; on any
        # failure this removes the partially written copy.
        if os.path.exists(target_file):
            os.remove(target_file)

In [7]:
def flatten_list(list_to_flatten):
    """
    Description: Remove one level of nesting from a list of iterables.
    :param list_to_flatten: Iterable whose elements are themselves iterable.
    
    :returns - Type(List): New list with the elements of every child, in order.
    """

    flat_items = []

    for inner_sequence in list_to_flatten:
        flat_items.extend(inner_sequence)

    return flat_items

In [8]:
def remove_duplicate_items_from_list(list_to_remove_duplicates):
    """
    Description: Drop repeated inner lists from a list of lists.
    :param list_to_remove_duplicates: List whose items are lists.
    
    :returns - Type(List): List of lists with repeated items removed.
    """

    # Inner lists are turned into tuples because they must be hashable for `unique`.
    items_as_tuples = (tuple(item) for item in list_to_remove_duplicates)

    # Convert the surviving tuples back to lists for the caller.
    return [list(distinct_item) for distinct_item in unique(items_as_tuples)]

### Read C# repository functions.

#### Filter C# class files from the root repository downloaded from: https://github.com/dotnet

In [9]:
def get_all_c_sharp_complete_file_names_for_each_class(root_directory):
    """
    Description: Walk a directory tree and collect the full path of every ".cs" file.
    :param root_directory: Directory where the walk starts.
    
    :return - Type(List): Full paths of the files whose name ends with ".cs".
    """

    C_SHARP_CLASS_FILE_EXTENSION = ".cs"

    # os.walk yields (directory, sub-directories, file names) for every level of the tree.
    return [
        os.path.join(current_directory, entry_name)
        for current_directory, _sub_directories, entry_names in os.walk(root_directory)
        for entry_name in entry_names
        if entry_name.endswith(C_SHARP_CLASS_FILE_EXTENSION)
    ]

In [10]:
def _read_utf8_text(file_name):
    """
    Description: Helper to read the whole content of a file decoded as UTF-8.
    :param file_name: Path of the file to read.
    
    :return - Type(String): File content.
    """

    with open(file_name, "r", encoding="utf8") as physical_file:
        return physical_file.read()


def get_content_for_each_file(complete_name_of_files):
    """
    Description: Function to get content of each source code file.
    A file that cannot be decoded as UTF-8 is converted to UTF-8 in place and read again;
    if it still cannot be decoded it is reported on screen and left out of the corpus.
    :param complete_name_of_files: List with name of each file downloaded from repository.
    
    :return - Type(List): Corpus with all C# source code.
    """

    c_sharp_code_corpus = []

    for file_name in complete_name_of_files:
        try:
            c_sharp_code_corpus.append(_read_utf8_text(file_name))
        except UnicodeDecodeError:
            # Only decoding failures are handled by re-encoding; other errors
            # (for example a missing file) are not hidden.
            change_enconding(file_name, "utf8")

            try:
                c_sharp_code_corpus.append(_read_utf8_text(file_name))
            except UnicodeDecodeError:
                print("Skipping file that could not be converted to UTF-8: '%s'" % (file_name))

    return c_sharp_code_corpus

### Pre-processing functions.

In [11]:
def preprocess_code_to_tokens(source_code):
    """
    Description: Split source code into lines, tokenize every non-empty line and
    replace each run of digits inside a token with a mask.
    :param source_code: Source code text to pre-process.
    
    :returns - Type(List): One list of tokens per non-empty line.
    """

    # Placeholder written in place of every digit run.
    MASK_NUMBERS = "|mask_number|"

    def mask_digit_runs(word):
        # Find the digit runs of this word and mask them one by one.
        return replace_sequence_of_numbers_for_mask(
                    word,
                    get_sequence_of_numbers_from_string(word),
                    MASK_NUMBERS)

    tokens = []

    for raw_line in source_code.split('\n'):
        code_line = raw_line.strip()

        # Lines that are empty after trimming carry no tokens.
        if len(code_line) == 0:
            continue

        words = nltk.word_tokenize(code_line)
        tokens.append([mask_digit_runs(word) for word in words])

    return tokens

In [12]:
def tokenize_all_files(c_sharp_code_corpus, first_x_corpus = 0):
    """
    Description: Run the pre-processing/tokenization step over a corpus of source files.
    :param c_sharp_code_corpus: List with the source code of each file.
    :param first_x_corpus: When greater than 0, only that many leading files are
    tokenized. Default: 0 - every file is tokenized.
    
    :returns - Type(List): One entry per processed file, each holding that file's tokens.
    """

    # A slice end of None takes the whole corpus.
    limit = first_x_corpus if first_x_corpus > 0 else None
    selected_sources = c_sharp_code_corpus[:limit]

    return [preprocess_code_to_tokens(source) for source in selected_sources]

### N-gram functions.

In [13]:
def count_the_words_for_code(code_tokens):
    """
    Description: Count how many times each word appears in a three-level nested
    token structure (outer list -> inner lists -> words).
    :param code_tokens: Nested tokens of all source repository.

    :returns - (Dictionary): Dictionary with words count { Key - "Word", Value = Count }.
    """

    code_counts = {}

    for file_tokens in code_tokens:
        for line_tokens in file_tokens:
            for word in line_tokens:
                # Missing words start from zero.
                code_counts[word] = code_counts.get(word, 0) + 1

    return code_counts

In [14]:
def handling_out_of_code_vocabulary(tokens, count_threshold):
    """
    Description: Build the closed vocabulary: the words whose number of occurrences
    in the tokens reaches the given threshold.
    :param tokens: List of tokens.
    :param count_threshold: Minimum count (inclusive) a word needs to enter the vocabulary.
    
    :returns - Type(List): Closed vocabulary.
    """

    occurrences_by_word = count_the_words_for_code(tokens)

    return [
        word
        for word, occurrences in occurrences_by_word.items()
        if occurrences >= count_threshold
    ]

In [15]:
def unknown_tokenize(tokens, vocabulary, unknown_token = "<unk>"):
    """
    Description: Replace every word that is not in the vocabulary with the unknown token.
    All inner lists of one outer entry are merged into a single flat list of words.
    :param tokens: Nested list of tokens (outer entries -> inner lists -> words),
    :param vocabulary: Vocabulary of code,
    :param unknown_token: Replacement for out-of-vocabulary words. Default: <unk>
    
    :returns - Type(List): One flat list of words per outer entry.
    """

    # A set gives constant-time membership checks.
    known_words = set(vocabulary)

    new_tokenized_sentences = []

    for sentence in tokens:
        new_tokenized_sentences.append([
            word if word in known_words else unknown_token
            for line_tokens in sentence
            for word in line_tokens
        ])

    return new_tokenized_sentences

In [16]:
def processing_vocabulary_and_unknown(tokens, count_threshold):
    """
    Description: Build the closed vocabulary from the tokens and then replace the
    out-of-vocabulary words of those same tokens with the unknown token.
    :param tokens: List of tokens,
    :param count_threshold: Minimum count a word needs to be kept in the vocabulary.
    
    :returns - Type(List, List): Tokens list and Vocabulary list.
    """

    # Work on a shallow copy of the outer list.
    tokens_snapshot = tokens[:]

    vocabulary = handling_out_of_code_vocabulary(tokens_snapshot, count_threshold)

    return unknown_tokenize(tokens_snapshot, vocabulary), vocabulary

In [17]:
def count_n_grams(tokens, ngrams_number, start_token_delimiter = "<s>", end_token_delimiter = "<e>"):
    """
    Description: Function to count n-grams.
    Every sentence is padded with `ngrams_number` start delimiters and one end delimiter
    before its n-grams are counted.
    :param tokens: List of sentences, each one a list of words,
    :param ngrams_number: Number of n-grams (size of each n-gram),
    :param start_token_delimiter: Start token delimiter,
    :param end_token_delimiter: End token delimiter.
    
    :returns - Type(Dictionary): Dictionary with n-grams { Key - n-gram tuple, Value = Count }.
    """

    n_grams = {}

    for sentence in tokens:
        padded_sentence = tuple([start_token_delimiter] * ngrams_number
                                + list(sentence)
                                + [end_token_delimiter])

        # Number of complete windows of size `ngrams_number`. Stopping here keeps
        # truncated tuples at the end of the sentence out of the counts.
        window_count = len(padded_sentence) - ngrams_number + 1

        for i in range(window_count):
            n_gram = padded_sentence[i:i + ngrams_number]
            n_grams[n_gram] = n_grams.get(n_gram, 0) + 1

    return n_grams

In [18]:
def prob_for_single_word(word, 
                         previous_n_gram, 
                         n_gram_counts, 
                         nplus1_gram_counts, 
                         vocabulary_size, 
                         k = 1.0):
    """
    Description: Estimate the probability of a word following a given n-gram, with
    the constant k added to the numerator and k * vocabulary_size to the denominator.
    :param word: Candidate next word,
    :param previous_n_gram: Sequence of words preceding the candidate,
    :param n_gram_counts: Dictionary with the count of each n-gram,
    :param nplus1_gram_counts: Dictionary with the count of each (n+1)-gram,
    :param vocabulary_size: Vocabulary size,
    :param k: Constant added to the counts.
    
    :returns - Type(Float): Probability of a single word.
    """

    # Dictionary keys are tuples, so the context is normalised to a tuple first.
    context = tuple(previous_n_gram)
    context_with_word = context + (word,)

    # Unseen n-grams count as zero.
    context_count = n_gram_counts.get(context, 0)
    context_with_word_count = nplus1_gram_counts.get(context_with_word, 0)

    denominator = context_count + k * vocabulary_size
    numerator = context_with_word_count + k

    return numerator / denominator

In [19]:
def probs(previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary, k=1.0) -> 'dict':
    
    """
    Description: Compute, for every candidate word, the probability of it following
    the given n-gram. Candidates are the vocabulary plus the "<e>" and "<unk>" tokens.
    :param previous_n_gram: Sequence of words preceding the candidate,
    :param n_gram_counts: Dictionary with the count of each n-gram,
    :param nplus1_gram_counts: Dictionary with the count of each (n+1)-gram,
    :param vocabulary: List with the vocabulary words,
    :param k: Constant forwarded to the single-word probability.
    
    :returns - Type(Dictionary): { Key - candidate word, Value = probability }.
    """

    context = tuple(previous_n_gram)

    # The end and unknown tokens are also valid continuations; a new list is
    # built so the caller's vocabulary is left untouched.
    candidates = vocabulary + ["<e>", "<unk>"]
    candidate_count = len(candidates)

    return {
        candidate: prob_for_single_word(candidate, context,
                                        n_gram_counts, nplus1_gram_counts,
                                        candidate_count, k=k)
        for candidate in candidates
    }

In [20]:
def auto_complete(previous_tokens, n_gram_counts, nplus1_gram_counts, vocabulary, k=1.0, start_with=None):
    """
    Description: Function to complete previous words.
    The candidate with the highest probability is returned; on ties the first
    candidate found wins.
    :param previous_tokens: Previous tokens,
    :param n_gram_counts: Dictionary with the count of each n-gram,
    :param nplus1_gram_counts: Dictionary with the count of each (n+1)-gram,
    :param vocabulary: Vocabulary,
    :param k: k constant to calculate,
    :param start_with: Filter to start with token. Default: None - no filter.
    
    :returns - Type(String, Float): Next token and probability. The token is None
    when no candidate has a probability greater than zero or passes the filter.
    :raises IndexError: When n_gram_counts is empty.
    """

    # The n-gram size is read from a single key. Peeking at the iterator avoids
    # copying every key of a large dictionary on each call.
    try:
        sample_n_gram = next(iter(n_gram_counts))
    except StopIteration:
        raise IndexError("n_gram_counts is empty; the n-gram size cannot be determined") from None

    n = len(sample_n_gram)

    # Most recent 'n' words.
    previous_n_gram = previous_tokens[-n:]

    # Probability for every candidate word.
    probabilities = probs(previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary, k=k)

    suggestion = None
    max_prob = 0

    for word, prob in probabilities.items():
        # Skip candidates that do not match the requested prefix.
        if start_with is not None and not word.startswith(start_with):
            continue

        if prob > max_prob:
            suggestion = word
            max_prob = prob

    return suggestion, max_prob

In [21]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    """
    Description: Ask for one completion per pair of consecutive n-gram models.
    :param previous_tokens: Previous tokens,
    :param n_gram_counts_list: List of n-gram count dictionaries; each entry is paired
    with the entry that follows it,
    :param vocabulary: Vocabulary,
    :param k: k constant to calculate,
    :param start_with: Filter to start with token.
    
    :returns - Type(List): One (next token, probability) result per pair of models.
    """

    # Each model is paired with its successor, so a list with fewer than two
    # models produces no suggestions.
    consecutive_models = zip(n_gram_counts_list, n_gram_counts_list[1:])

    return [
        auto_complete(previous_tokens, n_gram_counts,
                      nplus1_gram_counts, vocabulary,
                      k=k, start_with=start_with)
        for n_gram_counts, nplus1_gram_counts in consecutive_models
    ]

### Main flow

#### Read files

In [22]:
# Define constants.
# Raw string: the backslash in a Windows path must not be read as an escape sequence.
ROOT_DIRECTORY = r"D:\DsTCC"

# Get all file names.
complete_file_names = get_all_c_sharp_complete_file_names_for_each_class(ROOT_DIRECTORY)

# Print first 10 files (file name only, without the directory).
print_info("First 10 files:")

for file_name in complete_file_names[:10]:
    print(ntpath.basename(file_name))

# Print total number of files.
print_info("Number of files for N-grams:", new_line=True)
print("%s files." % (len(complete_file_names)))

####################################
First 10 files:
####################################
ProcessResult.cs
ProcessUtil.cs
Program.cs
RunTestsOptions.cs
TestRunner.cs
Program.cs
CreateFrameworkListFile.cs
DownloadFile.cs
FileUtilities.cs
GenerateGuid.cs


####################################
Number of files for N-grams:
####################################
57844 files.


#### Get source code of each file.

In [23]:
# Read the discovered files into memory; each corpus entry holds the text of one file.
c_sharp_code_corpus = get_content_for_each_file(complete_file_names)
# Show the first entry of the corpus under a banner as a quick sanity check.
print_info("Source code of first C# file class:", c_sharp_code_corpus[0])

####################################
Source code of first C# file class:
####################################
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace RunTests
{
    public class ProcessResult
    {
        public ProcessResult(string standardOutput, string standardError, int exitCode)
        {
            StandardOutput = standardOutput;
            StandardError = standardError;
            ExitCode = exitCode;
        }

        public string StandardOutput { get; }
        public string StandardError { get; }
        public int ExitCode { get; }
    }
}




#### Tokenize all files.

In [None]:
# first_x_corpus=0 tokenizes every file of the corpus.
tokens = tokenize_all_files(c_sharp_code_corpus, first_x_corpus=0)

print_info("First 50 tokens:")
# flatten_list removes only the per-file level, so each printed item is the
# token list of one source line rather than a single token.
for token in flatten_list(tokens)[:50]:
    print(token) 

#### Get tokens and vocabulary.

In [None]:
# Words that appear at least this many times stay in the vocabulary;
# the remaining words are replaced by the unknown token.
min_freq = 3
new_data_tokens, vocabulary = processing_vocabulary_and_unknown(tokens, min_freq)

#### Get n-grams count.

In [None]:
# Build the count dictionaries for n = 1 and n = 2, stored in that order.
n_gram_counts_list = []
for n in range(1, 3):
    n_model_counts = count_n_grams(new_data_tokens, n)
    n_gram_counts_list.append(n_model_counts)

#### Get suggestions.

In [None]:
# Context typed so far; the suggestion is the predicted next token.
previous_tokens = ["ReadFile", "("]
# One result is produced per pair of consecutive models in n_gram_counts_list.
suggestion = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

# NOTE(review): `display` is not defined in this file - presumably the notebook's built-in display helper; confirm.
display(suggestion)