# Generating suggestions for writing source code in C# language based on NLP.


## GPT-2 approach

#### This notebook was created and adapted for the work of generating suggestions using some ideas and codes as reference the notebook of the author "Mangeshkar, Saurav" available at: 
https://www.kaggle.com/sauravmaheshkar/auto-completion-using-n-gram-models

### Import libraries.

In [1]:
import os
import ntpath   
from chardet import detect
import nltk
import re
import h5py
import numpy as np
from toolz import unique
import pickle
import pandas as pd
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
from pathlib import Path
import codecs
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

#### Classes

In [2]:
class BPE_token(object):
    
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.normalizer = Sequence([
            NFKC()
        ])
        self.tokenizer.decoder = ByteLevelDecoder()
        
    def bpe_train(self, paths):
        trainer = BpeTrainer(show_progress=True, 
                             inital_alphabet=ByteLevel.alphabet(), 
                             special_tokens=["<s>",
                                             "<pad>",
                                             "</s>",
                                             "<unk>",
                                             "<mask>"
                                            ])
        self.tokenizer.train(paths, trainer)

    def save_tokenizer(self, location, prefix=None):
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)

### Generic functions.

In [3]:
def export_list_to_data_file(data, file_name):
    """
    Description: Function to export data into data file.
    :param data: Data to export,
    :param file_name: file name to export.
    :return: void.
    """

    with open(file_name, 'wb') as filehandle:
        pickle.dump(data, filehandle)

In [4]:
def load_from_data_file(file_name):
    """
    Description: Function to load data from file.
    :param file_name: file name to load data from.
    :return - Type(list): Data list.
    """
    
    data = []

    with open(file_name, 'rb') as filehandle:
        data = pickle.load(filehandle)

    return data

In [5]:
def print_info(title, message = None, new_line = False):
    """
    Description: Function to print info on screen
    :param title: Message title,
    :param message: Message to print,
    :param new_line: Indicates whether the first message will start with a line break or not.
    
    :return: void.
    """
    
    if new_line:
        print('\n')
    
    print("####################################")
    print(title)
    print("####################################")
    
    if message:
        print("%s\n" % (message))

In [6]:
def get_sequence_of_numbers_from_string(str):
    """
    Description: Function to extract all the sequence of numbers from the given string.
    :param str: String to extract sequence of numbers.
    
    :return - Type(Array): Array with sequence of numbers.
    """
    
    array_numbers = re.findall(r'[0-9]+', str)
    
    return array_numbers

In [7]:
def replace_sequence_of_numbers_for_mask(str_to_replace, 
                                         array_sequence_numbers_to_search, 
                                         mask_to_replace):
    """
    Description: Function to replace sequence of numbers for specific mask.
    :param str_to_replace: String to replace sequence of numbers,
    :param array_sequence_numbers_to_search: Sequence numbers to search for,
    :param mask_to_replace: Mask to replace each sequence.
    
    :return - Type(String): String with sequence of numbers replaced by mask.
    """
    
    for number_sequence in array_sequence_numbers_to_search:
        str_to_replace = re.sub(str(number_sequence), mask_to_replace, str_to_replace, 1)

    return str_to_replace

In [8]:
def get_encoding_type(file):
    """
    Description: Function to retrieve enconding type of file.
    :param file: File to get enconding.
    
    :return - Type(String): String with enconding type of file.
    """
        
    with open(file, 'rb') as f:
        rawdata = f.read()
    return detect(rawdata)['encoding']

In [9]:
def change_enconding(source_file, enconding):
    """
    Description: Function to change enconding of file.
    :param source_file: File to change enconding,
    :param enconding: Enconding to replace in source_file.
    
    :return: void.
    """
    
    from_codec = get_encoding_type(source_file)
    
    try: 
        target_file = source_file.replace(ntpath.basename(source_file), 
                                      "123%s" % (ntpath.basename(source_file))) 
        
        with open(source_file, 
                  'r', 
                  encoding=from_codec) as f, open(target_file, 
                                                  'w', 
                                                  encoding=enconding) as e:
                text = f.read()
                e.write(text)
                f.close()

        os.remove(source_file) 
        os.rename(target_file, source_file) 
        
    except UnicodeDecodeError:
        print("Decode error for file: '%s'" % (source_file))
    except UnicodeEncodeError:
        print("Encode error for file: '%s'" % (source_file))

In [10]:
def check_utf8_encode(file_name):
    try:
        content = codecs.open(file_name, encoding="utf-8", errors="strict").readlines()

        if content is not None:
            return True
    except UnicodeDecodeError:
        return False

    return False

In [11]:
def flatten_list(list_to_flatten):
    """
    Description: Function to flatten the given list.
    :param list_to_flatten: List to flatten.
    
    :returns - Type(List): Flat list.
    """   
    
    return [f for child_list in list_to_flatten for f in child_list]

In [12]:
def remove_duplicate_items_from_list(list_to_remove_duplicates):
    """
    Description: Function to remove duplicate itens from given list.
    :param list_to_remove_duplicates: List to remove duplicates.
    
    :returns - Type(List): List without duplicates.
    """  
    
    return list(map(list, unique(map(tuple, list_to_remove_duplicates))))

### Read C# repository functions.

#### Filter C# class files from root repository downladed from: https://github.com/dotnet

In [13]:
def get_all_c_sharp_complete_file_names_for_each_class(root_directory):
    """
    Description: Function to get all complete name of files with extension ".cs" (C# class).
    :param root_directory: Root directory of files.
    
    :return - Type(List): List with all file names of C# repository.
    """
    
    C_SHARP_CLASS_FILE_EXTENSION = ".cs"
    
    complete_name_of_files = []

    for root, dirs, files in os.walk(root_directory):
        for file in files:
            if file.endswith(C_SHARP_CLASS_FILE_EXTENSION):
                complete_name_of_files.append(os.path.join(root, file))
    
    return complete_name_of_files

### Pre-processing functions.

In [14]:
def preprocess_code_to_tokens(source_code):
    """
    Description: Function to make pre-processing in source code and tokenize words.
    :param source_code: Source code to pre-processing.
    
    :returns - Type(List): List of tokens.
    """
    
    MASK_NUMBERS = "|mask_number|"
    
    code_sentences = source_code.split('\n')
    
    code_sentences = [c.strip() for c in code_sentences]
    
    code_sentences = [c for c in code_sentences if len(c) > 0]
    
    tokens = []
    
    for piece_of_code  in code_sentences:
        token = nltk.word_tokenize(piece_of_code)

        for i in range(len(token)):
            token[i] = replace_sequence_of_numbers_for_mask(
                            token[i],
                            get_sequence_of_numbers_from_string(token[i]),
                            MASK_NUMBERS)
            
        tokens.append(token)
        
    return tokens

In [15]:
def tokenize_all_files(complete_file_names, path_to_save_tokens):
    """
    Description: Function to tokenize all files and save into specific folder.
    :param complete_file_names: All C# files list (Name of each file),
    :param path_to_save_tokens: Path to save generated tokens.
    
    :returns - Void.
    """
    
    tokenizer = BPE_token()

    tokenizer.bpe_train([c for c in complete_file_names if check_utf8_encode(c) == True])

    tokenizer.save_tokenizer(path_to_save_tokens)

### GPT-2 Functions

In [16]:
def create_gpt2_model(tokens_path):
    
    # loading tokenizer from the saved model path
    tokenizer = GPT2Tokenizer.from_pretrained(tokens_path)
    tokenizer.add_special_tokens({
      "eos_token": "</s>",
      "bos_token": "<s>",
      "unk_token": "<unk>",
      "pad_token": "<pad>",
      "mask_token": "<mask>"
    })

    # creating the configurations from which the model can be made
    config = GPT2Config(
      vocab_size=tokenizer.vocab_size,
      bos_token_id=tokenizer.bos_token_id,
      eos_token_id=tokenizer.eos_token_id
    )

    # creating the model
    model = TFGPT2LMHeadModel(config)
    
    return (model, tokenizer)

In [17]:
def create_string_list_tokens(complete_file_names, tokenizer):
    
    single_string = ''
    
    for filename in complete_file_names:
        with open(file_name, "r", encoding='utf-8') as f:
            if check_utf8_encode(file_name) == True:
                x = f.read()
                single_string += x + tokenizer.eos_token

    return tokenizer.encode(single_string)

In [18]:
def create_tf_dataset_for_gpt2_training(tokens_list):
    
    examples = []
    block_size = 100
    BATCH_SIZE = 12
    BUFFER_SIZE = 1000
    
    for i in range(0, len(tokens_list) - block_size + 1, block_size):
        examples.append(tokens_list[i:i + block_size])
    
    inputs, labels = [], []
    
    for ex in examples:
        inputs.append(ex[:-1])
        labels.append(ex[1:])
    
    dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
    
    return dataset

In [32]:
def config_model_to_gpt2(model):
    
    # defining our optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
    
    # definining our loss function
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    
    # defining our metric which we want to observe
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    
    # compiling the model
    model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
    return model

In [33]:
def train_model_to_gpt2(model, dataset, num_epoch):
    
    history = model.fit(dataset, epochs=num_epoch)
    return history

In [21]:
def get_suggestions(previous_tokens, model, num_return_sequences, tokenizer):
    
    # encoding the input text
    input_ids = tokenizer.encode(previous_tokens, return_tensors='tf')
    # getting out output
    beam_output = model.generate(
      input_ids,
      max_length = 50,
      num_beams = 5,
      temperature = 0.7,
      no_repeat_ngram_size = 2,
      num_return_sequences = num_return_sequences
    )
    
    return tokenizer.decode(beam_output[0])

In [22]:
def display_5_combinations_top_most(n_grams_list):
    """
    Description: Function to display 5 words combinations that appeared the most in each n-gram for param 'n_grams_list'.
    :param n_grams_list: n-gram(s) list to show.
    
    :return: void.
    """

    for i in range(0, len(n_gram_counts_list)):
        n_grams_5_appeared_most = sorted(n_gram_counts_list[i].items(), key=lambda x: x[1], reverse=True)[:5]
        print_info("%s-gram(s):" % (i + 1))
        data_for_dataframe = []

        for words_and_counts in n_grams_5_appeared_most:
            words_together_key = ''.join(words_and_counts[0])
            count_words_value = words_and_counts[1]

            each_line_dataframe = [words_together_key, count_words_value]

            data_for_dataframe.append(each_line_dataframe)

        df = pd.DataFrame(data_for_dataframe, columns = ['Words', 'Count'])
        display(df)
        print(" ")

### Main flow

#### Read files

In [34]:
# Define constants.
ROOT_DIRECTORY = "D:\DsTCC"

# Get all file names.
complete_file_names = get_all_c_sharp_complete_file_names_for_each_class(ROOT_DIRECTORY)

# Print first 10 files.
print_info("First 10 files:")

for file_name in complete_file_names[:10]:
    print(ntpath.basename(file_name))

# Print total number of files.
print_info("Number of files for N-grams:", new_line=True)
print("%s files." % (len(complete_file_names)))

####################################
First 10 files:
####################################
AssemblyResolution.cs
AssemblyResolver.cs
BuildTask.cs
BuildTask.Desktop.cs
DisposeAction.cs
EnumerableExtensions.cs
EnumExtensions.cs
ArgumentEscaper.cs
Command.cs
CommandFactory.cs


####################################
Number of files for N-grams:
####################################
201706 files.


#### Tokenize all files.

In [24]:
DIRECTORY_TO_SAVE_GENERATED_TOKENS = 'GPT2_Generated_Tokens'


In [None]:
tokens = tokenize_all_files(complete_file_names, DIRECTORY_TO_SAVE_GENERATED_TOKENS)

#### Create GPT-2 model and vocabulary for tokens.

In [25]:
(model, tokenizer) = create_gpt2_model(DIRECTORY_TO_SAVE_GENERATED_TOKENS)

file GPT2_Generated_Tokens\config.json not found


#### Create tokens list

In [95]:
tokens_list = create_string_list_tokens(complete_file_names, tokenizer)

print_into("First 50 tokens from list:")

for token in tokens_list[:50]:
    print("Position in vocabulary: %s" % (token))

NameError: name 'print_into' is not defined

In [26]:
tokens_list = load_from_data_file('tokens_list.data')

In [27]:
print_info("First 50 tokens from list:")

for token in tokens_list[:50]:
    print("Position in vocabulary: %s" % (token))

####################################
First 50 tokens from list:
####################################
Position in vocabulary: 237
Position in vocabulary: 1257
Position in vocabulary: 417
Position in vocabulary: 324
Position in vocabulary: 636
Position in vocabulary: 739
Position in vocabulary: 912
Position in vocabulary: 866
Position in vocabulary: 1136
Position in vocabulary: 743
Position in vocabulary: 1034
Position in vocabulary: 1371
Position in vocabulary: 18
Position in vocabulary: 176
Position in vocabulary: 237
Position in vocabulary: 846
Position in vocabulary: 636
Position in vocabulary: 739
Position in vocabulary: 912
Position in vocabulary: 1370
Position in vocabulary: 465
Position in vocabulary: 804
Position in vocabulary: 417
Position in vocabulary: 1260
Position in vocabulary: 866
Position in vocabulary: 324
Position in vocabulary: 1262
Position in vocabulary: 1160
Position in vocabulary: 18
Position in vocabulary: 176
Position in vocabulary: 176
Position in vocabulary: 2

In [97]:
export_list_to_data_file(tokens_list, 'tokens_list.data')

#### Create tensor flow dataset for training

In [29]:
dataset = create_tf_dataset_for_gpt2_training(tokens_list)

In [28]:
print(tf.__version__)

2.7.0


#### Configure model and compile

In [34]:
model = config_model_to_gpt2(model)

#### Train model

In [35]:
NUM_EPOCH = 10
history = train_model_to_gpt2(model, dataset, NUM_EPOCH)

Epoch 1/10
    11/118166 [..............................] - ETA: 209:15:11 - loss: 8.8360 - logits_loss: 8.8360 - logits_accuracy: 0.1139 - past_key_values_1_accuracy: 0.0024 - past_key_values_2_accuracy: 0.0023 - past_key_values_3_accuracy: 0.0019 - past_key_values_4_accuracy: 0.0015 - past_key_values_5_accuracy: 0.0015 - past_key_values_6_accuracy: 0.0023 - past_key_values_7_accuracy: 0.0022 - past_key_values_8_accuracy: 0.0018 - past_key_values_9_accuracy: 0.0011 - past_key_values_10_accuracy: 0.0015 - past_key_values_11_accuracy: 0.0026 - past_key_values_12_accuracy: 0.0019

KeyboardInterrupt: 

#### Get suggestions

In [None]:
previous_tokens = 'public'
num_return_sequences = 5
suggestion = get_suggestions(previous_tokens, model, num_return_sequences, tokenizer)

#### Export tokens to backup file

In [31]:
export_list_to_data_file(tokens, 'tokens_bkp.data')

#### Load tokens from backup file

In [27]:
tokens = load_from_data_file('tokens_bkp.data')

In [28]:
print_info("First 50 tokens:")
for token in flatten_list(tokens)[:50]:
    print(token) 

####################################
First 50 tokens:
####################################
['//', 'Licensed', 'to', 'the', '.NET', 'Foundation', 'under', 'one', 'or', 'more', 'agreements', '.']
['//', 'The', '.NET', 'Foundation', 'licenses', 'this', 'file', 'to', 'you', 'under', 'the', 'MIT', 'license', '.']
['#', 'if', 'NET|mask_number|']
['using', 'System', ';']
['using', 'System.Collections.Generic', ';']
['using', 'System.IO', ';']
['using', 'System.Reflection', ';']
['using', 'Microsoft.Build.Framework', ';']
['using', 'Microsoft.Build.Utilities', ';']
['namespace', 'Microsoft.DotNet']
['{']
['internal', 'static', 'class', 'AssemblyResolution']
['{']
['internal', 'static', 'TaskLoggingHelper', 'Log', ';']
['public', 'static', 'void', 'Initialize', '(', ')']
['{']
['AppDomain.CurrentDomain.AssemblyResolve', '+=', 'AssemblyResolve', ';']
['}']
['private', 'static', 'Assembly', 'AssemblyResolve', '(', 'object', 'sender', ',', 'ResolveEventArgs', 'args', ')']
['{']
['var', 'name', '='

In [4]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

# loading tokenizer from the saved model path
tokenizer = GPT2Tokenizer.from_pretrained('D:/Organizado/Acadêmicos/Usp/Pós Graduação/Monografia/Programas/Python/TCC.RenanMartins.Pece.IA.GPT2/tokens_bkp.data')
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)

# creating the model
model = TFGPT2LMHeadModel(config)

ValueError: Calling GPT2Tokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.

#### Get tokens and vocabulary.

In [27]:
min_freq = 3
new_data_tokens, vocabulary = processing_vocabulary_and_unknown(tokens, min_freq)

#### Get n-grams count.

In [28]:
QUANTITY_NGRAMS_TO_GENERATE = 5

n_gram_counts_list = []
for n in range(1, QUANTITY_NGRAMS_TO_GENERATE + 1):
    n_model_counts = count_n_grams(new_data_tokens, n)
    n_gram_counts_list.append(n_model_counts)

#### Display 5 words combinations that appeared the most in each n-gram

In [31]:
display_5_combinations_top_most(n_gram_counts_list)

####################################
1-gram(s):
####################################


Unnamed: 0,Words,Count
0,),23203367
1,(,23202300
2,",",11792814
3,;,10033382
4,=,5635404


 
####################################
2-gram(s):
####################################


Unnamed: 0,Words,Count
0,((,9146342
1,);,5200301
2,;},2957248
3,)+,2927367
4,){,2854688


 
####################################
3-gram(s):
####################################


Unnamed: 0,Words,Count
0,(((,8644068
1,);},1520228
2,[|mask_number|],1325311
3,",|mask_number|x|mask_number|,",1293989
4,|mask_number|]),1059917


 
####################################
4-gram(s):
####################################


Unnamed: 0,Words,Count
0,((((,8554307
1,[|mask_number|]),1042245
2,"|mask_number|x|mask_number|,|mask_number|x|mas...",1020632
3,",|mask_number|x|mask_number|,|mask_number|x|ma...",1016741
4,--------,901374


 
####################################
5-gram(s):
####################################


Unnamed: 0,Words,Count
0,(((((,8489794
1,",|mask_number|x|mask_number|,|mask_number|x|ma...",1014999
2,----------,865768
3,*****,863235
4,"|mask_number|x|mask_number|,|mask_number|x|mas...",832296


 


#### Export ngrams x counts list, new data tokens and vocabulary to backup file

In [93]:
export_list_to_data_file(n_gram_counts_list, 'n_gram_counts_list.data')

In [32]:
export_list_to_data_file(new_data_tokens, 'new_data_tokens.data')

In [34]:
export_list_to_data_file(vocabulary, 'vocabulary.data')

#### Load ngrams x counts list, new data tokens and vocabulary from backup file

In [25]:
n_gram_counts_list = load_from_data_file('n_gram_counts_list.data')

In [26]:
new_data_tokens = load_from_data_file('new_data_tokens.data')

In [27]:
vocabulary = load_from_data_file('vocabulary.data')

#### Get suggestions.

In [84]:
suggestions_for_all_grams = []
previous_tokens = ["public"]

get_suggestions_recursively(previous_tokens, 
                            n_gram_counts_list, 
                            vocabulary, 
                            suggestions_for_all_grams, 
                            QUANTITY_NGRAMS_TO_GENERATE)
    
df = pd.DataFrame(suggestions_for_all_grams, columns=['ngram', 'previous_tokens', 'suggestions'])

display(df)

Unnamed: 0,ngram,previous_tokens,suggestions
0,2,[public],"[[(static, 0.13878612711693716), (void, 0.1022..."
1,3,"[public, static]","[[(void, 0.05360907345975815), (string, 0.0453..."
2,4,"[public, static, void]","[[(<unk>, 0.02085204659309168), (Main, 0.00996..."
3,5,"[public, static, void, <unk>]",[]
4,5,"[public, static, void, Main]",[]
...,...,...,...
151,5,"[public, int, <unk>, {]",[]
152,5,"[public, int, <unk>, (]",[]
153,5,"[public, int, <unk>, ;]",[]
154,5,"[public, int, <unk>, =]",[]


#### Format each n-gram level

In [120]:
def format_words_and_probability(df):
    
    formated_results = []
    
    for index, row in df.iterrows():
        
        previous_tokens = ' '.join(row['previous_tokens'])
        
        if len(row['suggestions']) > 0:
            for suggestion in row['suggestions'][0]:
                formated_results.append((previous_tokens, previous_tokens + " " + suggestion[0], suggestion[1]))
      
    return formated_results

In [122]:
df_formated = df.copy()
df_ngrams = df_formated.query("ngram==5")
df_ngrams

Unnamed: 0,ngram,previous_tokens,suggestions
3,5,"[public, static, void, <unk>]",[]
4,5,"[public, static, void, Main]",[]
5,5,"[public, static, void, foo|mask_number|]",[]
6,5,"[public, static, void, DynamicCSharpRunTest]",[]
7,5,"[public, static, void, Write]",[]
...,...,...,...
151,5,"[public, int, <unk>, {]",[]
152,5,"[public, int, <unk>, (]",[]
153,5,"[public, int, <unk>, ;]",[]
154,5,"[public, int, <unk>, =]",[]


##### N-grams suggested

In [140]:
df_formated = df.copy()

for i in range(2, QUANTITY_NGRAMS_TO_GENERATE):

    print_info("%s-gram(s):" % (i))
    
    df_ngrams = df_formated.query("ngram==%s" % (i))
    
    formated_results = format_words_and_probability(df_ngrams)
    
    df_ngrams_formated = pd.DataFrame(formated_results, columns=['Previous_Tokens', 'Suggestion', 'Probability'])
    
    df_top_5 = df_ngrams_formated.sort_values(by='Probability',ascending=False).iloc[:5,:]
    df_top_5 = df_top_5.style.format({'Probability': "{:.2%}"})
    display(df_top_5)
    
    print(" ")

####################################
2-gram(s):
####################################


Unnamed: 0,Previous_Tokens,Suggestion,Probability
0,public,public static,13.88%
1,public,public void,10.22%
2,public,public class,5.02%
3,public,public override,4.91%
4,public,public int,4.00%


 
####################################
3-gram(s):
####################################


Unnamed: 0,Previous_Tokens,Suggestion,Probability
5,public void,public void,10.30%
20,public int,public int m|mask_number|,5.61%
0,public static,public static void,5.36%
1,public static,public static string,4.53%
10,public class,public class Class|mask_number|,4.42%


 
####################################
4-gram(s):
####################################


Unnamed: 0,Previous_Tokens,Suggestion,Probability
25,public void,public void (,12.10%
100,public int m|mask_number|,public int m|mask_number| =,5.98%
50,public class Class|mask_number|,public class Class|mask_number| {,4.90%
5,public static string,public static string Property,4.73%
0,public static void,public static void,2.09%


 
