# Michaels Model Report

In [89]:
import re
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
# Get the active Spark session, or create one registered under the app name 'app'.
spark = SparkSession.builder.appName('app').getOrCreate()

Creating a list of model labels

In [90]:
def listOfModels(targets):
    """Return the model label for every target column name.

    The label is the first three characters of the column name
    (e.g. ``'D18_IND'`` -> ``'D18'``).

    Args:
        targets: iterable of target column names (strings).

    Returns:
        list of str: one label per entry of ``targets``, in the same order.
    """
    # Slice the names that were passed in rather than a notebook-level list,
    # so the function works for whatever list the caller supplies.
    return [name[0:3] for name in targets]

* Selecting features for each model.
* The columns for each model are held in a nested list with `n` rows,
* where `n` is the number of models to be built.
* The first entry in each row is the label of the target column.
* The remaining entries in the row are the labels of the feature columns.
* `featuresByModel` is that nested list: one row of column names per category.

In [91]:
def modelColumnCreator(_modelNames, _targets, _features):
    """Build one list of column names per model.

    For every model name a fresh list is started, ``columnByModel`` adds the
    matching target column to it, and ``secondFeatureDeletion`` then adds the
    feature columns kept for that model.

    Args:
        _modelNames: model labels to build column lists for.
        _targets: target column names.
        _features: candidate feature column names.

    Returns:
        list of lists, one per model name and in the same order.
    """
    return [
        secondFeatureDeletion(_features, columnByModel(_targets, [], label), label)
        for label in _modelNames
    ]


In [92]:
def columnByModel(_targets, _midList, i):
    """Append the first target whose name contains ``i`` to ``_midList``.

    Only the first match is added; if no target contains ``i`` the list is
    left as it was.

    Args:
        _targets: target column names (strings).
        _midList: list to extend; it is modified in place.
        i: substring (model label) to look for.

    Returns:
        The same ``_midList`` object that was passed in.
    """
    firstMatch = next((name for name in _targets if name.find(i) != -1), None)
    if firstMatch is not None:
        _midList.append(firstMatch)
    return _midList


*`listIntersect` is a simple function to check if there is a common element in two lists*

In [93]:
def listIntersect(list1, list2):
    """Tell whether the two lists have at least one equal element.

    Args:
        list1: first collection of items.
        list2: second collection of items.

    Returns:
        True if some item of ``list1`` compares equal to some item of
        ``list2``, otherwise False.
    """
    return any(left == right for left in list1 for right in list2)

In [94]:
def initialFeatureDeletion(_df):
    """Return ``_df`` with a fixed set of columns dropped.

    The columns are removed one at a time through ``_df.drop(name)``.

    Args:
        _df: object exposing ``drop(column_name)`` that returns the reduced
            frame (a Spark DataFrame in this notebook).

    Returns:
        The frame produced after all six ``drop`` calls.
    """
    unwanted = (
        'LAST_SHOP_DT',
        'FIRST_SHOP_DT',
        'LAST_SHOP_DT1',
        'LAST_SHOP_IDNT',
        'INDIV_ID',
        'CA_VALID_EMAIL_FLG',
    )
    trimmed = _df
    for columnName in unwanted:
        trimmed = trimmed.drop(columnName)
    return trimmed


In [95]:
def secondFeatureDeletion(_features, _midList, i):
    """Append the feature names kept for model ``i`` to ``_midList``.

    A feature is kept when its name contains ``i`` and contains none of
    'APP', 'WEB' or 'RECENCY'.

    Args:
        _features: candidate feature column names (strings).
        _midList: list to extend; it is modified in place.
        i: substring (model label) a feature name must contain.

    Returns:
        The same ``_midList`` object that was passed in.
    """
    excludedTags = ('APP', 'WEB', 'RECENCY')
    for name in _features:
        if name.find(i) == -1:
            continue
        if any(name.find(tag) != -1 for tag in excludedTags):
            continue
        _midList.append(name)
    return _midList

In [96]:
def targetColumnSelector(_columns):
    """Return the column names that end in ``D<digit><digit>_IND``.

    Args:
        _columns: iterable of column names (strings).

    Returns:
        list of str: the matching names, in their original order.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    targetPattern = re.compile(r'D\d\d_IND$')
    return [name for name in _columns if targetPattern.search(name)]

In [97]:
def featureColumnSelector(_columns):
    """Return the column names that do NOT end in ``D<digit><digit>_IND``.

    Args:
        _columns: iterable of column names (strings).

    Returns:
        list of str: the non-matching names, in their original order.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    targetPattern = re.compile(r'D\d\d_IND$')
    return [name for name in _columns if not targetPattern.search(name)]

In [98]:
def accumulateGuesses(_dict):
    """Collect the entries whose second value is at most 0.5.

    Args:
        _dict: mapping of key -> sequence; only element ``[1]`` of each
            sequence is inspected.

    Returns:
        list of ``[key, value[1]]`` pairs, in the mapping's iteration order,
        for every entry with ``value[1] <= 0.5``.
    """
    return [[name, entry[1]] for name, entry in _dict.items() if entry[1] <= 0.5]

In [99]:
def createFinalGuesses(_customer_dict, standardGuesses):
    """Produce the final list of guessed categories for every customer.

    For each customer the candidate guesses come from ``accumulateGuesses``;
    when there are fewer than five they are topped up with
    ``padToFive(..., standardGuesses)``, and ``chooseGuesses`` makes the
    final selection.

    Args:
        _customer_dict: mapping of customer key -> per-category mapping, as
            consumed by ``accumulateGuesses``.
        standardGuesses: fallback categories handed to ``padToFive``.

    Returns:
        dict mapping each customer key to the result of ``chooseGuesses``.
    """
    picks = {}
    for customer, perCategory in _customer_dict.items():
        candidates = accumulateGuesses(perCategory)
        if len(candidates) >= 5:
            shortlist = candidates
        else:
            shortlist = padToFive(candidates, standardGuesses)
        picks[customer] = chooseGuesses(shortlist)
    return picks

In [100]:
def padToFive(_tempList, _standardGuesses):
    """Top up ``_tempList`` with standard guesses until it holds five entries.

    Standard guesses are taken in order; one whose name is already present
    in the list is skipped. Added entries have the form ``[name, 0.5]``.
    A list that already has five or more entries is returned unchanged.

    Args:
        _tempList: list of ``[name, value]`` pairs; it is extended in place.
        _standardGuesses: fallback names, in priority order.

    Returns:
        The same ``_tempList`` object. It may still hold fewer than five
        entries if the standard guesses run out.
    """
    presentNames = [_item[0] for _item in _tempList]
    for cat in _standardGuesses:
        # Check the size before appending so a list that already holds five
        # (or more) entries is never grown past five.
        if len(_tempList) >= 5:
            break
        if cat not in presentNames:
            _tempList.append([cat, 0.5])
            presentNames.append(cat)
    return _tempList

In [101]:
def chooseGuesses(_tempList):
    """Return the names of the five entries with the smallest second value.

    Args:
        _tempList: list of ``[name, value]`` pairs.

    Returns:
        list of up to five names, ordered by ascending value; entries with
        equal values keep their original relative order.
    """
    ranked = sorted(_tempList, key=lambda pair: pair[1])
    return [pair[0] for pair in ranked[:5]]

In [102]:
def createActualValues(_customer_dict):
    """List, per customer, the categories whose first stored value equals 1.

    Args:
        _customer_dict: mapping of customer key -> mapping of category ->
            sequence; only element ``[0]`` of each sequence is inspected.

    Returns:
        dict mapping each customer key to the list of categories (in
        iteration order) whose entry has ``[0] == 1``.
    """
    return {
        customer: [cat for cat, entry in perCategory.items() if entry[0] == 1]
        for customer, perCategory in _customer_dict.items()
    }

In [103]:
def getFinalScore(_finalGuesses, _actual):
    """Percentage of customers with at least one correct guess.

    A customer counts as a hit when ``listIntersect`` finds a common element
    between that customer's guesses and actual categories.

    Args:
        _finalGuesses: mapping of customer key -> list of guessed categories.
        _actual: mapping of customer key -> list of actual categories; it
            must contain every key of ``_finalGuesses``.

    Returns:
        float: hits divided by ``len(_actual)``, rounded to three decimals
        and multiplied by 100. Returns 0.0 when ``_actual`` is empty.
    """
    total = len(_actual)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    count = 0
    # Work on the arguments, not on notebook-level variables, so the score
    # reflects the data that was actually passed in.
    for customer, guesses in _finalGuesses.items():
        if listIntersect(guesses, _actual[customer]):
            count += 1
    return (round(count/total, 3)*100)

In [104]:
# Read the CSV, treating the first row as column names and letting Spark infer column types.
df = spark.read.csv('10000.csv', header=True, inferSchema=True)

* `targetColumns` contains the 44 categories
* `featureColumns` contains the 1600 remaining columns in the dataset

In [105]:
# Drop the columns listed in initialFeatureDeletion, then split the remaining
# column names into targets (names ending in D<two digits>_IND) and all others.
df1 = initialFeatureDeletion(df)
columns = df1.columns
targetColumns = targetColumnSelector(columns)
featureColumns = featureColumnSelector(columns)

* creating a list of the model labels
* this is used to select features for each model

In [106]:
# One three-character label per target column (the first three characters of its name).
modelList = listOfModels(targetColumns)

* creating list of column names for each model
* first entry in each row is the target label, the rest are feature labels
* there is one row per model

In [107]:
# One list per model label: the matching target column followed by the kept feature columns.
featuresByModel = modelColumnCreator(modelList, targetColumns, featureColumns)

Splitting 80-20

In [108]:
# Pass an explicit seed so the 80/20 split does not change from run to run.
train_df, test_df = df1.randomSplit((0.8, 0.2), seed=42)

Fill the null values with 0

In [109]:
# Replace nulls with 0 in both splits.
train_df = train_df.na.fill(0)
test_df = test_df.na.fill(0)

In [110]:
# Build a new list: a plain assignment would alias `targetColumns`, and the
# append would then add 'EMAIL_ID' to the target list itself (once more on
# every re-run of this cell).
emailAndTarget = targetColumns + ['EMAIL_ID']
act_df = test_df.select(emailAndTarget).cache()

### Iterating over all the `featuresByModel` and training individual categories
* We cached the results regularly; this reduced the training time per model by 22 seconds (2.3 times faster).

In [111]:
# Settings shared by every per-category model.
NEGATIVE_SAMPLE_FRACTION = 0.15   # fraction of label-0 training rows that is kept
MODEL_SEED = 42                   # fixed seed so sampling and training are repeatable
MAX_DEPTH = 5
NUM_TREES = 20

# customer_dict[EMAIL_ID][target] = [actual label, first entry of the probability vector]
customer_dict = {}
category = 1
for modelColumns in featuresByModel:
    target = modelColumns[0]
    # Work on copies instead of appending 'EMAIL_ID' to the entry inside
    # featuresByModel: the cell can then be re-run without the id column
    # piling up in (and leaking into) the feature list.
    featureNames = [name for name in modelColumns[1:] if name != 'EMAIL_ID']
    selectedColumns = [target] + featureNames + ['EMAIL_ID']

    print('Training ', target," ,Category: ",category)
    x = train_df.select(selectedColumns).cache()
    y = test_df.select(selectedColumns).cache()

    # Assemble the feature columns into a single 'features' vector column.
    assembler = VectorAssembler(inputCols = featureNames, outputCol = 'features')
    train_assembler_output = assembler.transform(x)
    test_assembler_output = assembler.transform(y)

    trainModel = train_assembler_output.select('features', target)

    train_df_0 = trainModel.filter(trainModel[target] == 0)
    train_df_1 = trainModel.filter(trainModel[target] == 1)

    # Keep only a sample of the label-0 rows before recombining with the label-1 rows.
    train_df_0 = train_df_0.sample(False, NEGATIVE_SAMPLE_FRACTION, seed=MODEL_SEED)

    trainModel = train_df_0.union(train_df_1)

    rfc = RandomForestClassifier(featuresCol = 'features', labelCol = target,
                                 maxDepth = MAX_DEPTH, numTrees = NUM_TREES, seed = MODEL_SEED)
    rfc_model = rfc.fit(trainModel)

    print('\tTesting ', target)

    # transform() appends its output columns to the input rows, so EMAIL_ID,
    # the actual label and the probability stay together on one row. This
    # avoids zipping two separately collected results and depending on their
    # row order being identical.
    rfc_prediction = rfc_model.transform(test_assembler_output)

    print('\tCollecting ', target)
    collected = rfc_prediction.select('EMAIL_ID', target, 'probability').collect()
    print('\tCreating/Updating Dictionary..\n')
    for row in collected:
        # probability[0] is presumably P(label == 0) for a 0/1 target — TODO confirm.
        customer_dict.setdefault(row['EMAIL_ID'], {})[target] = [row[target], row['probability'][0]]

    # Release the cached selections; otherwise one pair per model stays cached.
    x.unpersist()
    y.unpersist()
    category+=1

Training  D18_IND  ,Category:  1
	Testing  D18_IND
	Collecting  D18_IND
	Creating/Updating Dictionary..

Training  D19_IND  ,Category:  2
	Testing  D19_IND
	Collecting  D19_IND
	Creating/Updating Dictionary..

Training  D20_IND  ,Category:  3
	Testing  D20_IND
	Collecting  D20_IND
	Creating/Updating Dictionary..

Training  D21_IND  ,Category:  4
	Testing  D21_IND
	Collecting  D21_IND
	Creating/Updating Dictionary..

Training  D22_IND  ,Category:  5
	Testing  D22_IND
	Collecting  D22_IND
	Creating/Updating Dictionary..

Training  D23_IND  ,Category:  6
	Testing  D23_IND
	Collecting  D23_IND
	Creating/Updating Dictionary..

Training  D24_IND  ,Category:  7
	Testing  D24_IND
	Collecting  D24_IND
	Creating/Updating Dictionary..

Training  D25_IND  ,Category:  8
	Testing  D25_IND
	Collecting  D25_IND
	Creating/Updating Dictionary..

Training  D26_IND  ,Category:  9
	Testing  D26_IND
	Collecting  D26_IND
	Creating/Updating Dictionary..

Training  D28_IND  ,Category:  10
	Testing  D28_IND
	Co

#### Making the list of `standardGuesses`

In [113]:
# Fallback categories, in priority order, used to pad a customer's guesses up to five.
standardGuesses = ['D51_IND',
                   'D71_IND',
                   'D54_IND',
                   'D55_IND',
                   'D36_IND']

#### Extracting final guesses using `customer_dict` & `standardGuesses`

In [114]:
# Map each customer key in customer_dict to its final list of guessed categories.
finalGuesses = createFinalGuesses(customer_dict, standardGuesses)

#### Extracting actual categories customer has shopped in

In [115]:
# Map each customer key to the categories whose stored actual value is 1.
actual = createActualValues(customer_dict)

### Final Output:

In [116]:
# Percentage of customers for whom at least one guessed category is among their actual ones.
finalScore = getFinalScore(finalGuesses, actual)
print('Final Score: \n', '\t', finalScore, '%')

Final Score: 
 	 60.4 %
