In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from datetime import datetime

In [None]:
# NOTE(review): absolute, machine-specific directory; it is reused when the result is written.
path = '/Users/sanjayagrawal/Downloads/'
data1 = pd.read_csv(path + 'instacart_data.csv')
products = data1['product_id'].unique()

# Lookup table: every distinct product_id, ascending, paired with a label
# 'product1', 'product2', ... that later serves as a column name.
products_lookup = (
    pd.DataFrame({'product_id': products})
    .sort_values('product_id')
    .reset_index(drop=True)
)
products_lookup['index'] = ['product{}'.format(rank) for rank in range(1, len(products) + 1)]
products_index = products_lookup['index'].tolist()

# Replace the raw product_id column by its label.
data1 = data1.merge(products_lookup, on='product_id', how='inner').drop('product_id', axis=1)

# data1.head()
# print (products_lookup)
# print (products_index)


In [None]:
# Wide table: one row per order_id, one column per product label, holding the
# 'prediction' value; (order, product) pairs absent from data1 are filled with 0.
data = (
    data1
    .pivot(index='order_id', columns='index', values='prediction')
    .reset_index()
    .fillna(0)
)
# Per order, the scores as a plain list ordered like products_index.
data['dict'] = data.apply(lambda row: list(row[products_index]), axis=1)

In [None]:
# Sparse view of each order: label -> score, keeping only scores above zero.
data['dict1'] = data['dict'].map(
    lambda scores: {label: scores[pos] for pos, label in enumerate(products_index) if scores[pos] > 0}
)
data.head(2)

In [None]:
class F1Optimizer():
    """Choose how many top-ranked items to predict so that the expected F1 is maximal.

    The recurrences below multiply per-item probabilities, i.e. the items are
    treated as independent. All methods are static; the class only groups them.
    """

    def __init__(self):
        pass

    @staticmethod
    def get_expectations(P, pNone=None):
        """Tabulate the expected F1 of predicting the k most probable items.

        Parameters
        ----------
        P : array-like of float
            Item probabilities in any order; they are sorted descending internally.
        pNone : float, optional
            Probability that no item is true. Defaults to prod(1 - P).

        Returns
        -------
        numpy.ndarray of shape (2, n + 1)
            Column k refers to predicting the k most probable items. Row 0 is the
            expectation when the extra 'None' label is predicted as well, row 1
            the expectation without it.
        """
        P = np.sort(P)[::-1]
        n = P.shape[0]
        if pNone is None:
            pNone = (1.0 - P).prod()

        # hits[i, j]: probability that exactly i of the j most probable items are true.
        hits = np.zeros((n + 2, n + 1))
        hits[0, 0] = 1.0
        # Row 0 is filled up to column n - 1 only; hits[0, n] stays 0 and is only
        # ever multiplied by k1 == 0 in the summation further down.
        for j in range(1, n):
            hits[0, j] = (1.0 - P[j - 1]) * hits[0, j - 1]
        for i in range(1, n + 1):
            hits[i, i] = hits[i - 1, i - 1] * P[i - 1]
            for j in range(i + 1, n + 1):
                hits[i, j] = P[j - 1] * hits[i - 1, j - 1] + (1.0 - P[j - 1]) * hits[i, j - 1]

        # Reciprocal tables, index 0 unused: recip[i] = 1 / i, recip_none[i] = 1 / (i + 1).
        size = 2 * n + 1
        steps = np.arange(1, size, dtype=float)
        recip = np.zeros((size,))
        recip_none = np.zeros((size,))
        recip[1:] = 1.0 / steps
        recip_none[1:] = 1.0 / (steps + 1)

        rows = []
        for k in range(n, -1, -1):
            f1 = 0
            f1_with_none = 0
            for k1 in range(n + 1):
                weight = 2 * k1 * hits[k1][k]
                f1 += weight * recip[k + k1]
                f1_with_none += weight * recip_none[k + k1]
            # Fold item k into both tables. The update runs in ascending order so
            # that entry i + 1 is still the previous value when entry i is written.
            # P[k - 1] is read only inside the loop, which is empty for k < 2.
            for i in range(1, 2 * k - 1):
                recip[i] = (1 - P[k - 1]) * recip[i] + P[k - 1] * recip[i + 1]
                recip_none[i] = (1 - P[k - 1]) * recip_none[i] + P[k - 1] * recip_none[i + 1]
            rows.append([f1_with_none + 2 * pNone / (2 + k), f1])

        # rows was collected for k = n .. 0; flip it so that column index equals k.
        rows.reverse()
        return np.array(rows).T

    @staticmethod
    def maximize_expectation(P, pNone=None):
        """Locate the best entry of the table built by get_expectations.

        Returns
        -------
        tuple
            (best_k, predNone, max_f1): number of top items to predict, whether
            'None' should be predicted too, and the expected F1 achieved.
        """
        table = F1Optimizer.get_expectations(P, pNone)
        row, col = np.unravel_index(table.argmax(), table.shape)
        # Row 0 of the table is the variant that includes 'None'.
        return col, bool(row == 0), table[row, col]

    @staticmethod
    def _F1(tp, fp, fn):
        """F1 score from true-positive, false-positive and false-negative counts."""
        doubled_tp = 2 * tp
        return doubled_tp / (doubled_tp + fp + fn)

    @staticmethod
    def _Fbeta(tp, fp, fn, beta=1.0):
        """F-beta score from true-positive, false-positive and false-negative counts."""
        beta_squared = beta ** 2
        weighted_tp = (1.0 + beta_squared) * tp
        return weighted_tp / (weighted_tp + fp + beta_squared * fn)


def print_best_prediction(Probs, pNone=None):
    """Return the labels whose joint prediction maximises the expected F1 score.

    Despite its name the function prints nothing; it returns the selection.

    Parameters
    ----------
    Probs : dict
        Maps each candidate label to its probability.
    pNone : float, optional
        Probability that no candidate is true. When omitted it is estimated as
        (1 - p_1) * (1 - p_2) * ... * (1 - p_n).

    Returns
    -------
    list
        The best_k most probable labels, preceded by the string 'None' when
        F1Optimizer.maximize_expectation reports that 'None' should be predicted.
    """
    # Rank labels by descending probability; sorted() is stable, so ties keep
    # the dict's iteration order.
    ranked = sorted(Probs.items(), key=lambda item: item[1], reverse=True)
    L = [label for label, _ in ranked]
    P = np.array([score for _, score in ranked])

    if pNone is None:
        pNone = (1.0 - P).prod()

    best_k, predNone, _ = F1Optimizer.maximize_expectation(P, pNone)
    best_prediction = ['None'] if predNone else []
    best_prediction += L[:best_k]
    return best_prediction

if __name__ == '__main__':
    # For every order, turn its label -> score mapping into the selected label list.
    data['products'] = data['dict1'].map(print_best_prediction)
    
#     probs = {"A":0.15, "B":0.1, "C":0.2, "D":0.5}
#     print_best_prediction(probs)





In [None]:
# Preview the first two orders, which now carry the selected 'products' column.
data.head(2)

In [None]:
# Keep only the two columns that go into the output file.
data = data[['order_id', 'products']]
# NOTE(review): 'products' holds Python lists, so the CSV stores their list repr
# (e.g. "['None', 'product3']") — confirm this is the intended file format.
data.to_csv(path + 'final.csv', index=False)
# Preview last: a notebook cell only renders its final expression, so a bare
# data.head() placed before to_csv() was never displayed.
data.head()


In [None]:
# for index1, rows in products_lookup.iterrows():
#     product_id = row[0]
#     index = row[1]
#     for i in data.

In [None]:
# for index, rows in data.iterrows():
#     best_case = row[1]
    