In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import random
import sys
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the first two comma-separated fields of the raw file into the columns
# 'Cust_Id' and 'Rating'; the file has no header row.
df1 = pd.read_csv(
    '../input/netflix-prize-data/combined_data_1.txt',
    header=None,
    names=['Cust_Id', 'Rating'],
    usecols=[0, 1],
)

In [None]:
# Sanity-check the load: print the (rows, columns) shape, then display the
# first three rows as the cell output.
print(df1.shape)
df1.head(3)

In [None]:
# Exact baseline: time a pandas groupby that counts the rows for each
# distinct Cust_Id value.
start_time = time.time()

p = df1.groupby('Cust_Id')['Cust_Id'].agg(['count'])

print("--- Pandas' groupby took %s seconds ---" % (time.time() - start_time))
# p has one row per distinct Cust_Id value and a single 'count' column, so
# its shape is reported as the "space" used by the exact approach.
print("--- Pandas' groupby took %s space ---" % str(p.shape))

In [None]:
# Preview the first three per-Cust_Id row counts.
p.head(3)

In [None]:
# start_time = time.time()

# for index, row in df1[:10000000].iterrows():
#     if index % 500000 == 0:
#         print("Saw %d elements" % index)
    
    
# print("--- Pandas' iterrows took %s seconds ---" % (time.time() - start_time))

In [None]:
class Morris:
    """Probabilistic counter that stores only a small exponent ``X``.

    Each call to :meth:`increment` bumps ``X`` with probability ``2**-X``,
    and :meth:`estimate` reports ``2**X - 1`` as the approximate number of
    increments seen so far.
    """

    def __init__(self):
        # Exponent of the counter; starts at zero (estimate of 0 events).
        self.X = 0

    def increment(self):
        """Register one event, raising ``X`` with probability ``2**-X``."""
        draw = np.random.rand()
        threshold = 2 ** (-self.X)
        if draw >= threshold:
            return
        self.X += 1

    def estimate(self):
        """Return the approximate event count, ``2**X - 1``."""
        exponent = self.X
        return 2 ** exponent - 1

In [None]:
class MorrisBeta:
    """Average of several independent ``Morris`` counters.

    Args:
        number_of_copies: how many ``Morris`` counters to keep. Every event
            is fed to all of them and the estimates are averaged.
    """

    def __init__(self, number_of_copies=10):
        # Build a distinct counter per slot. ``[Morris()] * n`` would repeat
        # ONE shared instance n times, so a single event would increment the
        # same counter n times and the "average" would be of one value.
        self.X = [Morris() for _ in range(number_of_copies)]

    def increment(self):
        """Register one event with every underlying counter."""
        for counter in self.X:
            counter.increment()

    def estimate(self):
        """Return the mean of the individual counters' estimates.

        Raises:
            ZeroDivisionError: if the object was built with zero copies.
        """
        return sum(counter.estimate() for counter in self.X) / len(self.X)

In [None]:
 def run_morris(number_of_copies=1):
    """Feed the first 1,000,000 rows of ``df1`` to a ``MorrisBeta`` counter.

    Prints a progress line whenever the row index is a multiple of 100,000,
    then the final estimate and the elapsed wall-clock time.

    Args:
        number_of_copies: number of counters passed to ``MorrisBeta``.

    Returns:
        The counter's estimate of how many rows were seen.
    """
    began = time.time()
    counter = MorrisBeta(number_of_copies)
    head = df1[:1000000]

    for position, _row in head.iterrows():
        counter.increment()
        if position % 100000 != 0:
            continue
        print("Saw %d elements" % position)

    print("Morris Beta estimated %d elements" % counter.estimate())
    print("Morris took %s seconds" % (time.time() - began))
    return counter.estimate()

In [None]:
class FM:
    """Minimum-hash distinct-element estimator.

    Every element is hashed to a pseudo-random float in ``[0, 1)``; the
    smallest hash seen so far is kept and ``1 / min_hash`` is reported as
    the estimate.

    Args:
        salt: value mixed into every hash; stored as ``str(salt)`` so that
            different salts give different hash functions.
    """

    def __init__(self, salt=17):
        # 1 is above every possible hash value, so the first element wins.
        self.min_hash = 1
        self.salt = str(salt)

    def add_element(self, element):
        """Fold one element into the running minimum hash."""
        self.min_hash = min(self.min_hash, self.naive_hash(element))

    def estimate(self):
        """Return ``1 / min_hash`` (1 when no element has been added)."""
        return 1/self.min_hash

    def naive_hash(self, element):
        """Map ``element`` to a repeatable float in ``[0, 1)``.

        The element's string form is concatenated with the salt and used to
        seed a private ``random.Random``; the first draw is the hash. Going
        through ``str(element)`` lets non-string elements (e.g. ints) be
        hashed too, and is a no-op for elements that are already strings.
        """
        return random.Random(str(element) + self.salt).random()

In [None]:
 def run_fm(salt):
    """Feed the 'Cust_Id' of the first 1,000,000 rows of ``df1`` to one ``FM``.

    Prints a progress line whenever the row index is a multiple of 100,000,
    then the final estimate and the elapsed wall-clock time.

    Args:
        salt: salt handed to the ``FM`` estimator.

    Returns:
        The estimator's final estimate.
    """
    began = time.time()
    sketch = FM(salt)
    head = df1[:1000000]

    for position, record in head.iterrows():
        sketch.add_element(record['Cust_Id'])
        if position % 100000 != 0:
            continue
        print("Saw %d elements" % position)

    print("Flajolet-Martin Alpha estimated %d elements" % sketch.estimate())
    print("Flajolet-Martin Alpha took %s seconds" % (time.time() - began))
    return sketch.estimate()

In [None]:
class FMBeta:
    """Average of several ``FM`` estimators, each with its own salt.

    Args:
        number_of_copies: how many ``FM`` estimators to keep.
        salt: block number; copy ``i`` gets the salt
            ``str(i + salt * number_of_copies)``, so different ``salt``
            values yield non-overlapping salt ranges.
    """

    def __init__(self, number_of_copies=10, salt=0):
        estimators = []
        for offset in range(number_of_copies):
            estimators.append(FM(str(offset + salt * number_of_copies)))
        self.X = estimators

    def add_element(self, element):
        """Pass ``element`` to every underlying estimator."""
        for sketch in self.X:
            sketch.add_element(element)

    def estimate(self):
        """Return the mean of the underlying estimators' estimates."""
        return sum(sketch.estimate() for sketch in self.X) / len(self.X)

In [None]:
 def run_fm_beta(number_of_copies=10):
    """Feed the 'Cust_Id' of the first 1,000,000 rows of ``df1`` to an ``FMBeta``.

    Prints a progress line whenever the row index is a multiple of 100,000,
    then the final estimate and the elapsed wall-clock time.

    Args:
        number_of_copies: number of estimators passed to ``FMBeta``.

    Returns:
        The averaged estimate. Callers such as ``get_beta_results`` store
        this value, so it must be returned rather than only printed.
    """
    start_time = time.time()

    fm = FMBeta(number_of_copies)

    for index, row in df1[:1000000].iterrows():
        fm.add_element(row['Cust_Id'])
        if index % 100000 == 0:
            print("Saw %d elements" % index)

    estimate = fm.estimate()
    print("Flajolet-Martin Beta estimated %d elements" % estimate)
    print("Flajolet-Martin Beta took %s seconds" % (time.time() - start_time))
    return estimate

In [None]:
class FMFinal:
    """Median of several ``FMBeta`` estimators.

    Args:
        number_of_copies: how many ``FMBeta`` estimators to keep.
        number_of_beta_copies: copy count handed to each ``FMBeta``; the
            position of the ``FMBeta`` in the list is passed as its salt.
    """

    def __init__(self, number_of_copies=10, number_of_beta_copies=10):
        groups = []
        for group_no in range(number_of_copies):
            groups.append(FMBeta(number_of_beta_copies, group_no))
        self.X = groups

    def add_element(self, element):
        """Pass ``element`` to every underlying ``FMBeta``."""
        for group in self.X:
            group.add_element(element)

    def estimate(self):
        """Return the median of the underlying estimators' estimates."""
        return np.median([group.estimate() for group in self.X])

In [None]:
 def run_fm_final(number_of_copies=10, number_of_beta_copies=10):
    """Feed the 'Cust_Id' of the first 1,000,000 rows of ``df1`` to an ``FMFinal``.

    Prints a progress line whenever the row index is a multiple of 100,000,
    then the final estimate and the elapsed wall-clock time.

    Args:
        number_of_copies: number of ``FMBeta`` groups passed to ``FMFinal``.
        number_of_beta_copies: estimators per group passed to ``FMFinal``.

    Returns:
        The median-of-means estimate. Callers such as ``get_final_results``
        store this value, so it must be returned rather than only printed.
    """
    start_time = time.time()

    fm = FMFinal(number_of_copies, number_of_beta_copies)

    for index, row in df1[:1000000].iterrows():
        fm.add_element(row['Cust_Id'])
        if index % 100000 == 0:
            print("Saw %d elements" % index)

    estimate = fm.estimate()
    print("Flajolet-Martin Final estimated %d elements" % estimate)
    print("Flajolet-Martin Final took %s seconds" % (time.time() - start_time))
    return estimate

In [None]:
def get_alpha_results(times):
    """Run ``times`` paired trials of ``run_morris`` and ``run_fm``.

    Trial ``i`` seeds NumPy's global RNG with ``i`` and uses ``i`` as the
    ``run_fm`` salt. Each pair is printed as it is produced.

    Args:
        times: number of trials.

    Returns:
        A list of ``(morris_estimate, fm_estimate)`` tuples, one per trial.
    """
    results = []
    for trial in range(times):
        np.random.seed(trial)
        # Tuple elements evaluate left to right: Morris first, then FM.
        pair = (run_morris(), run_fm(trial))
        results.append(pair)
        print(pair)
    return results

In [None]:
# Run 100 seeded trials of the single-copy Morris and Flajolet-Martin
# estimators. Every trial walks the first 1,000,000 rows with iterrows()
# twice, so expect this cell to take a while.
alpha_results = get_alpha_results(100)
alpha_results

In [None]:
# Tabulate the trials: one row per trial, one column per estimator.
alpha_df = pd.DataFrame(
    data=alpha_results,
    columns=['Morris', 'FM'],
)
alpha_df

In [None]:
# Mean estimate across all trials for each estimator (%d prints the mean
# truncated to an integer).
print("Morris Alpha estimated %d elements" % np.mean(alpha_df['Morris']))
print("Flajolet-Martin Alpha estimated %d elements" % np.mean(alpha_df['FM']))

In [None]:
# Variance of the estimates after dividing them by 10**6.
# NOTE(review): %d truncates the variance to an integer, so any value below
# 1 prints as 0 — consider a float format such as %f if that is unintended.
print("Morris Alpha var is %d " % np.var(alpha_df['Morris']/10**6))
print("Flajolet-Martin Alpha var is %d " % np.var(alpha_df['FM']/10**6))

In [None]:
def get_beta_results(times):
    """Run ``times`` trials of the averaged estimators for 10, 50 and 100 copies.

    Args:
        times: number of trials.

    Returns:
        A ``(results_morris, results_fm)`` tuple. Each is a list with one
        entry per trial; an entry is a list that starts with a literal ``0``
        followed by the ``run_morris`` / ``run_fm_beta`` result for each
        copy count, in the order 10, 50, 100.
    """
    beta_params = [10, 50, 100]
    results_morris = []
    results_fm = []

    for _trial in range(times):
        morris_row = [0]
        fm_row = [0]
        # Keep the Morris and FM runs interleaved per copy count.
        for copies in beta_params:
            morris_row.append(run_morris(copies))
            fm_row.append(run_fm_beta(copies))
        results_morris.append(morris_row)
        results_fm.append(fm_row)

    return (results_morris, results_fm)

In [None]:
# Run 10 trials of the averaged estimators with 10, 50 and 100 copies each.
# The result is a (results_morris, results_fm) tuple of per-trial lists.
beta_results = get_beta_results(10)
beta_results

In [None]:
# NOTE(review): this frame is built from alpha_results, not from the
# beta_results computed for the multi-copy estimators — looks like a
# copy-paste slip. beta_results is a tuple of two lists of 4-element rows,
# so it needs reshaping before it fits these two columns; confirm intent.
beta_df = pd.DataFrame(data=alpha_results, columns=['Morris', 'FM'])
beta_df

In [None]:
# Plot beta_df with seaborn's scatterplot; only `data` is given, no x/y.
sns.scatterplot(data=beta_df)

In [None]:
def get_final_results(times):
    """Run ``times`` trials of ``run_fm_final`` over a grid of copy counts.

    The grid is ``number_of_copies`` in (10, 50) crossed with
    ``number_of_beta_copies`` in (10, 50, 100), outer loop first.

    Args:
        times: number of trials.

    Returns:
        A list with one entry per trial; each entry is a list that starts
        with a literal ``0`` followed by the six ``run_fm_final`` results.
    """
    beta_params = [10, 50, 100]
    fm_final_params = [10, 50]

    results_fm = []
    for _trial in range(times):
        grid_results = [
            run_fm_final(copies, beta_copies)
            for copies in fm_final_params
            for beta_copies in beta_params
        ]
        results_fm.append([0] + grid_results)

    return results_fm

In [None]:
# Run 10 trials of the median-of-means estimator over the full parameter
# grid (six run_fm_final calls per trial).
final_results = get_final_results(10)
final_results

In [None]:
# Each row of final_results holds 7 values: a leading literal 0 followed by
# the run_fm_final result for every (number_of_copies, number_of_beta_copies)
# pair, in the order get_final_results produces them. Passing only the two
# names ['Morris', 'FM'] makes the DataFrame constructor raise ValueError,
# so name all 7 columns. Keep this grid in sync with get_final_results.
final_columns = ['init'] + [
    'FM_%dx%d' % (copies, beta_copies)
    for copies in (10, 50)
    for beta_copies in (10, 50, 100)
]
final_df = pd.DataFrame(data=final_results, columns=final_columns)
final_df