In [3]:
import numpy as np
import pandas as pd 
import keras 
from keras.layers import *
from keras.models import *
from keras.activations import *
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import os
import itertools
import string
import tensorflow as tf
from itertools import product
import math
from utils import lex
from utils import yacc
from utils import cpp
import zipfile

# Here We Will Configure The Global Variables

In [45]:
# Path of the archive that holds the code samples (single source of truth for the location)
zip_name = "./data.zip"


# NOTE(review): `cols` is not referenced in the visible part of the notebook
cols = ["code", "block"]
# Open the archive through `zip_name` so that changing the path above really takes effect
archive = zipfile.ZipFile(zip_name, 'r')
# The first name is the name of the folder that contains the codes, so it is skipped
list_files = archive.namelist()[1:]
# Now we make a scanner for the C++ language
scanner = lex.lex(cpp)
# If anything was considered faulty at line i, we consider all the lines [i - range_n, i + range_n) to be faulty
range_n = 4
# Then we define the literals of the program
lits = cpp.literals
# Then we define the tokens
toks = list(cpp.tokens)
# We remove the white space token to add it back at the front
toks.remove("CPP_WS")
# The white space token goes first because we want it to have the value of zero; we'll use this later for padding lines of code
toks.insert(0, "CPP_WS")
# Tok 2 N : a dictionary from tokens (and literals) to their mapped integer value
tok2n = dict(zip(toks + list(lits), itertools.count()))
# N 2 Tok : a dictionary from integers to their mapped token (or literal)
n2tok = dict(zip(itertools.count(), toks + list(lits)))

# The maximum absolute value we allow for a constant in a code (larger constants are clipped to it)
max_v = 2147483647 - 1

# Weight table for the custom loss. get_loss_function reads its `weights` argument as
# weights[true_class, predicted_class] — presumably this table is what gets passed in (TODO confirm).
WEIGHTS_FOR_LOSS = np.array([[2,0.5],[0.1,0.1]])

# NOTE(review): not referenced in the visible part of the notebook
cons_per_line = 10


# Here We Will Make A Custom Loss Function

In [41]:
def get_loss_function(weights, rnn=True):
    '''
    Builds a categorical cross-entropy loss whose per-sample value is scaled by a
    weight picked from `weights`.

    weights : square table indexed as weights[true_class, predicted_class]
              (it must support len() and two-index lookup, e.g. a 2-D numpy array)
    rnn     : True  -> tensors are indexed as [:, :, class]
              False -> tensors are indexed as [:, class]
    returns : a function (y_true, y_pred) usable as a Keras loss
    '''
    def w_categorical_crossentropy_mine(y_true, y_pred):
        n_classes = len(weights)

        # Only the position of the class axis differs between the two layouts
        if rnn:
            best_score = K.reshape(
                K.max(y_pred, axis=2),
                (K.shape(y_pred)[0], K.shape(y_pred)[1], 1),
            )
            pick = lambda tensor, cls: tensor[:, :, cls]
        else:
            best_score = K.reshape(K.max(y_pred, axis=1), (K.shape(y_pred)[0], 1))
            pick = lambda tensor, cls: tensor[:, cls]

        # 1.0 where a class holds the highest score of its row, 0.0 elsewhere
        is_best = K.cast(K.equal(y_pred, best_score), tf.float32)
        truth = K.cast(y_true, tf.float32)

        # Sum weights[true, predicted] over every (predicted, true) pair that is active
        scale = K.zeros_like(pick(y_pred, 0))
        for pred_cls, true_cls in product(range(n_classes), range(n_classes)):
            scale += weights[true_cls, pred_cls] * pick(is_best, pred_cls) * pick(truth, true_cls)

        # The third positional argument is forwarded unchanged
        # (it is `from_logits` in the Keras 2 backend signature — TODO confirm for the installed version)
        return K.categorical_crossentropy(y_true, y_pred, True) * scale

    return w_categorical_crossentropy_mine

# Reading The Data From The Archive

In [42]:
def get_data(list_files, archive, window=None):
    '''
    Reads the listed members of `archive` and widens the fault labels.

    list_files : names of the archive members to read; each one is parsed as a
                 "`"-separated table
    archive    : object with an open(name) method returning a readable file
                 (e.g. zipfile.ZipFile)
    window     : row j is marked faulty when any originally faulty row lies in
                 [j - window, j + window); None (the default) uses the global range_n

    returns    : list with one DataFrame per non-empty member that could be read.
                 The last input column (0 = fine, 1 = faulty) is replaced by two
                 columns, "0s" (being right) and "1s" (being wrong); rows that
                 contain "#empty" or a missing value are dropped.

    Members that cannot be read are reported by printing their name and skipped.
    '''
    if window is None:
        window = range_n

    frames = []
    for name in list_files:
        try:
            # Close the member handle as soon as pandas has consumed it
            with archive.open(name) as handle:
                frame = pd.read_csv(handle, sep = "`")
            # The last parsed column is discarded
            # (presumably an artefact of a trailing separator — TODO confirm)
            frames.append(frame[frame.columns[:-1]])
        except Exception:
            print(name)
            continue

    resF = []
    for frame in frames:

        if frame.shape[0] == 0:
            continue

        a = frame.values
#         This changes the data from (Features, beingWrong) to (Features, beingRight, beingWrong)
        b = np.concatenate([a[:, :-1], a[:, -1:].astype(int) ^ 1, a[:, -1:]], axis = -1)

        for j in range(len(b)):
#             The lower bound is clamped at 0: a negative start would be read
#             relative to the END of the array and yield an empty or wrong window
            first = max(0, j - window)
#             The window is evaluated on the ORIGINAL labels (a), so marks do not cascade
            if np.sum(a[first : j + window, -1]) > 0:
                b[j, -1] = 1
                b[j, -2] = 0

        n_cols = b.shape[1]
        for x in range(len(b)):
            for y in range(1, n_cols):
                if y == 1:
#                     The second column is not used, so its content is overwritten
                    b[x, y] = "DATA DOES NOT MATTER"
                elif isinstance(b[x, y], str):
#                     Strings that do not look like a number become -3;
#                     strings that do are left untouched (still as strings)
                    try:
                        float(b[x, y].strip())
                    except Exception:
                        b[x, y] = -3

#         By 0s we mean the code being fine and by 1s the code being faulty.
#         b holds one more column than the input, so the names are: every input
#         column except the label, followed by the two new label columns.
        labelled = pd.DataFrame(b, columns=list(frame.columns)[:-1] + ["0s", "1s"])
        labelled = labelled.replace("#empty", np.nan)
        resF.append(labelled.dropna())

    print("data was read and changed")
    return resF
        
        
    
    
    

# Lexical Scanner Function

In [44]:
def _parse_cpp_number(text):
    '''
    Converts the text of a numeric constant into a Python number.

    Constants containing "x" are read as hexadecimal ints, everything else as a float.
    Trailing u / l suffixes are dropped, and so is a trailing f on non-hexadecimal
    constants. Raises ValueError when what is left is not a number.
    '''
    tv = text.lower()
    if "x" in tv:
#         "f" is a digit in hexadecimal, so only u / l can be suffixes here
        return int(tv.rstrip("ul"), base = 16)
    return float(tv.rstrip("ulf"))


def get_replacement(scanner, string_in):

    '''
    Tokenizes `string_in` with `scanner` and returns the numeric version of it.

    scanner   : lexer offering input(text) and token(); token() returns None at the
                end and otherwise an object with .type and .value
    string_in : the text to tokenize

    returns   : numpy array with one [token type number, value] row per token
                (shape (None, 2)); an empty array when there are no tokens or when
                the scanner rejects the input; None when a numeric constant cannot
                be parsed.

    The value of a token is:
        white space           -> 0 (makes padding easier)
        identifier            -> a number given in order of first appearance in this
                                 call, starting at len(lits) + 1
        string / # / ## / char -> -1 / -2 / -3 / -4
        numeric constant      -> its value clipped to [-max_v, max_v]
        literal character     -> ord() of the character

    A token type that is missing from tok2n is added to the global tok2n / n2tok.
    '''
    try :
        scanner.input(string_in)
    except Exception as e :
        print("Exception in using the lex", e)
        print(string_in)
#         The scanner still holds whatever it was fed before, so reading tokens now
#         could only return tokens of another input; report "no tokens" instead
        return np.array([])

#     id2n is the same idea as tok2n, but extended with the symbol table of this code only
    id2n = {lit: tok2n[lit] for lit in lits}
    n_id = len(lits) + 1

#     Token types whose value does not depend on the token text
    fixed_values = {
        "CPP_WS": 0,
        "CPP_STRING": -1,
        "CPP_POUND": -2,
        "CPP_DPOUND": -3,
        "CPP_CHAR": -4,
    }

    res = []
    token = scanner.token()

    while token is not None :

        t = token.type

#         If we have received a token and it is not something we need to use ord for
        if t in cpp.tokens :
            if t in fixed_values :
                v = fixed_values[t]
#             Receiving an ID from the code
            elif t == "CPP_ID" :
                name = token.value
#                 Checking if we need to add the id to the symbol table or not
                if name not in id2n :
                    id2n[name] = n_id
                    n_id += 1
                v = id2n[name]
            elif t in cpp.tokens[3:] :
                print("some thing went really wrong")
#                 Give the token the padding value so that v is always defined here
#                 (otherwise the value of the previous token would be reused)
                v = 0
#             Parsing the value of constant values
            else :
                try :
                    v = np.clip(_parse_cpp_number(token.value), - max_v, max_v)
                except Exception :
                    print("Couldn't scan this number", token)
                    return None
        else :
            v = ord(t)

        try :
            t = tok2n[t]
        except KeyError :
#             Unknown token type: register it (this changes the global dictionaries)
            n = len(id2n) + 1
            tok2n[t] = n
            n2tok[n] = t
            id2n[t] = n
            t = n

        res.append([t, v])
        token = scanner.token()

    return np.array(res)
        
    
    
    