# In [3]:  (notebook cell prompt kept as a comment so the file parses as Python)
import numpy as np
import math


class DataProcessor:
    """Turn a CSV file of collector transactions into padded sequences.

    The only method meant to be called from outside is
    ``Generate_X_Y_Vectors``; afterwards the results are available on the
    instance:

    * ``X`` - one entry per collector: a list of ``maxSequenceLength``
      transaction vectors (features followed by the one-hot FGC vector).
    * ``Y`` - one entry per collector: the one-hot FGC vector of that
      collector's last transaction.
    * ``collectors`` / ``sequencesLengths`` - collector key and number of
      transactions for each entry of ``X`` / ``Y``.
    """

    def __init__(self):
        # All state is created per instance. Keeping these lists as class
        # attributes would make every DataProcessor share (and keep appending
        # to) the very same list objects.

        # collectors tags (one key per sequence, in sequence order)
        self.collectors = []

        # columnsTags - names of columns (names in first row of the data file)
        self.columnsTags = []

        # dataMatrix as loaded from file (first row excluded)
        self.dataMatrix = []

        # one row per transaction: [[collector key], features, onehot FGC];
        # numerical columns are normalized, non-numerical ones are one-hot
        self.Onehot_Normalized_Matrix = []

        # max number of transactions made by one collector
        self.maxSequenceLength = 0
        # number of transactions made by each collector
        self.sequencesLengths = []

        # X holds the input sequences, Y the desired one-hot FGC per sequence
        self.X = []
        self.Y = []

    def Generate_X_Y_Vectors(self, path, count=0, FGC_Dic=0):
        """Load ``path`` and build ``self.X`` / ``self.Y`` from it.

        Args:
            path: path of the comma separated data file (first line holds
                the column names).
            count: number of data rows to load; 0 (default) loads them all.
            FGC_Dic: list of FGC tags defining the one-hot layout of the
                target; its last slot is used for unknown values. With 0
                (default) or None the tags are collected from the data and
                written to the file ``FGC_Dictionary``.

        Raises:
            ValueError: if a required column is missing from the header.
        """
        # Start from a clean slate so that calling this method again does not
        # pile the new file on top of the previous one.
        self.dataMatrix = []

        self.LoadFile(path, count)
        self.SortMatrix()
        self.Generate_OneHot_Normalized_Matrix(FGC_Dic)
        self.maxSequenceLength = self.getMaxSeqLen()
        self.generate_X_Y_vectors()

    def generate_X_Y_vectors(self):
        """Convert ``Onehot_Normalized_Matrix`` into ``self.X`` / ``self.Y``.

        Relies on ``sequencesLengths`` and ``maxSequenceLength`` having been
        filled by ``getMaxSeqLen``. Both outputs are rebuilt from scratch and
        the source matrix is left untouched.
        """
        self.X = []
        self.Y = []

        rows = self.Onehot_Normalized_Matrix
        if not rows:
            return

        # empty target for non-sequence transactions (last index = 1)
        emptyTarget = [0] * len(rows[0][2])
        if emptyTarget:
            emptyTarget[-1] = 1

        # empty transaction used for padding (zero features + empty target)
        emptyTransaction = [0] * len(rows[0][1]) + emptyTarget

        start = 0
        for length in self.sequencesLengths:
            last = start + length - 1

            if length == 1:
                # a single transaction is its own input, with an empty target
                sequence = [rows[start][1] + emptyTarget]
            else:
                # input: every transaction except the last one
                sequence = [rows[k][1] + rows[k][2]
                            for k in range(start, last)]

            # pad so that all sequences have the size of the longest one;
            # every padding step is its own list so rows are never aliased
            for _ in range(self.maxSequenceLength - len(sequence)):
                sequence.append(list(emptyTransaction))

            self.X.append(sequence)
            # output: FGC one-hot vector of the last transaction
            self.Y.append(list(rows[last][2]))

            start = last + 1

    def getMaxSeqLen(self):
        """Return the largest number of consecutive rows sharing a collector.

        Also rebuilds ``sequencesLengths`` (run length per collector) and
        ``collectors`` (key of each run). Returns 0 when there are no rows.
        """
        self.sequencesLengths = []
        self.collectors = []

        rows = self.Onehot_Normalized_Matrix
        if not rows:
            return 0

        runLength = 0
        lastPosition = len(rows) - 1
        for position, row in enumerate(rows):
            runLength += 1
            # a run ends at the last row or when the next key differs
            if position == lastPosition or rows[position + 1][0] != row[0]:
                self.sequencesLengths.append(runLength)
                self.collectors.append(row[0][0])
                runLength = 0

        return max(self.sequencesLengths)

    def Generate_OneHot_Normalized_Matrix(self, FGC_Dic):
        """Fill ``Onehot_Normalized_Matrix`` from ``dataMatrix``.

        Per column: COLLECTOR_KEY goes to slot 0 as is, FGC goes to slot 2 as
        a one-hot vector, LAST_DATE and target_2 are skipped, and every other
        column is appended to slot 1 - normalized when all of its values
        parse as floats, one-hot encoded otherwise.

        Args:
            FGC_Dic: list of FGC tags, or 0 / None to derive the tags from
                the data (they are then saved to ``FGC_Dictionary``).

        Raises:
            ValueError: if the COLLECTOR_KEY or FGC column is missing.
        """
        # initialize the matrix with three empty vectors per row
        self.init_Onehot_Normalized_Matrix()
        if not self.dataMatrix:
            return

        # column lookup is case-insensitive
        collectorIndex = self._requireColumnIndex('COLLECTOR_KEY')
        fgcIndex = self._requireColumnIndex('FGC')
        # optional columns; -1 (absent) never matches a real column index
        skippedIndexes = (self.getColumnIndex('LAST_DATE'),
                          self.getColumnIndex('target_2'))

        deriveTags = FGC_Dic is None or (
            isinstance(FGC_Dic, (int, float)) and FGC_Dic == 0)

        for columnIndex in range(len(self.columnsTags)):
            # do not add values of date or target
            if columnIndex in skippedIndexes:
                continue

            if columnIndex == collectorIndex:
                self.addColumnValuesAsIs(columnIndex, 0)

            elif columnIndex == fgcIndex:
                if deriveTags:
                    uniqueTags = self.getUniqueTags(columnIndex)
                    # extra last slot used for values missing from the list
                    uniqueTags.append(0)
                    # saved so an evaluation step can map indexes to names
                    self.saveTags(uniqueTags, 'FGC_Dictionary')
                else:
                    uniqueTags = FGC_Dic
                self.addColumnsValuesAsOneHot(columnIndex, 2, uniqueTags)

            else:
                try:
                    mean, std = self.get_Mean_STD(columnIndex)
                except ValueError:
                    # at least one value is not a number: one-hot encode
                    uniqueTags = self.getUniqueTags(columnIndex)
                    self.addColumnsValuesAsOneHot(columnIndex, 1, uniqueTags)
                else:
                    self.addColumnsValuesNormalized(columnIndex, 1, mean, std)

    def saveTags(self, tags, file):
        """Write one ``index<TAB>tag`` line per tag to ``file``."""
        with open(file, 'w') as writer:
            for position, tag in enumerate(tags):
                writer.write(str(position) + '\t' + str(tag) + '\n')

    def addColumnsValuesNormalized(self, columnIndex, vectorIndex, mean, std):
        """Append ``(value - mean) / (std + 1)`` of a column to every row.

        The ``+ 1`` keeps the division defined for constant columns (std 0).
        """
        scale = std + 1
        for source, target in zip(self.dataMatrix,
                                  self.Onehot_Normalized_Matrix):
            target[vectorIndex].append(
                (float(source[columnIndex]) - mean) / scale)

    def get_Mean_STD(self, columnIndex):
        """Return ``(mean, std)`` of a column, std being the population std.

        Raises:
            ValueError: if a value of the column cannot be parsed as float.
        """
        # Only the requested column is converted. The builtin float is used
        # because the ``np.float`` alias no longer exists in current NumPy.
        column = np.array([float(row[columnIndex]) for row in self.dataMatrix],
                          dtype=float)
        mean = column.mean()
        std = math.sqrt(column.var())
        return mean, std

    def addColumnsValuesAsOneHot(self, columnIndex, vectorIndex, uniqueTags):
        """Extend every row with the one-hot encoding of a column.

        Values that are not in ``uniqueTags`` light up the last position.
        """
        width = len(uniqueTags)
        fallback = width - 1

        # first position of each tag, built once instead of scanning the
        # tag list for every row
        positions = {}
        for position, tag in enumerate(uniqueTags):
            positions.setdefault(tag, position)

        for source, target in zip(self.dataMatrix,
                                  self.Onehot_Normalized_Matrix):
            vec = [0] * width
            vec[positions.get(source[columnIndex], fallback)] = 1
            target[vectorIndex].extend(vec)

    def getUniqueTags(self, columnIndex):
        """Return the distinct values of a column in first-seen order."""
        seen = set()
        unique = []
        for row in self.dataMatrix:
            value = row[columnIndex]
            if value not in seen:
                seen.add(value)
                unique.append(value)
        return unique

    def addColumnValuesAsIs(self, columnIndex, vectorIndex):
        """Append the raw value of a column to the given slot of every row."""
        for source, target in zip(self.dataMatrix,
                                  self.Onehot_Normalized_Matrix):
            target[vectorIndex].append(source[columnIndex])

    def init_Onehot_Normalized_Matrix(self):
        """Reset the matrix to three empty vectors per data row."""
        self.Onehot_Normalized_Matrix = [[[], [], []] for _ in self.dataMatrix]

    def SortMatrix(self):
        """Sort ``dataMatrix`` in place: collector key ascending, then days
        since last visit (or ``recency``) descending.

        The sort is stable, so rows that tie on both values keep their file
        order.

        Raises:
            ValueError: if the key column, or both day columns, are missing.
        """
        if not self.dataMatrix:
            return

        keyIndex = self._requireColumnIndex('COLLECTOR_KEY')
        daysIndex = self._requireColumnIndex('DAYS_SINCE_LAST_VISIT',
                                             'recency')

        self.dataMatrix.sort(
            key=lambda row: (float(row[keyIndex]), -float(row[daysIndex])))

    def getColumnIndex(self, columnName):
        """Return the 0-based index of ``columnName`` (case-insensitive,
        surrounding whitespace ignored), or -1 when there is no such column.
        """
        wanted = columnName.strip().upper()
        for position, tag in enumerate(self.columnsTags):
            if tag.strip().upper() == wanted:
                return position
        return -1

    def _requireColumnIndex(self, *columnNames):
        """Return the index of the first of ``columnNames`` that exists.

        Raises ValueError instead of handing back -1, which would silently
        address the last column when used as a list index.
        """
        for columnName in columnNames:
            position = self.getColumnIndex(columnName)
            if position != -1:
                return position
        raise ValueError('required column not found: %s'
                         % ' / '.join(columnNames))

    def LoadFile(self, path, count):
        """Read the header and the data rows of ``path`` into the instance.

        Rows are appended to ``dataMatrix``; blank lines are skipped, fields
        are stripped and empty fields are replaced by the integer 0.

        Args:
            path: path of the comma separated file.
            count: number of data rows to load; 0 loads every row.
        """
        with open(path, mode='r') as reader:
            # strip every name, otherwise the last one keeps the line break
            # and can never be found by name
            self.columnsTags = [tag.strip()
                                for tag in reader.readline().split(',')]

            recordNum = 1  # non-blank line number, the header being line 1
            loaded = 0

            for record in reader:
                record = record.strip()
                if record == '':
                    continue

                if count != 0 and loaded >= count:
                    break
                recordNum += 1

                fields = [field.strip() for field in record.split(',')]
                fields = [0 if field == '' else field for field in fields]

                # the row is kept either way; this is only a warning
                if len(fields) != len(self.columnsTags):
                    print('missing or extra fields in record ( %s )'
                          % recordNum)

                self.dataMatrix.append(fields)
                loaded += 1
