In [3]:
"""
reference: 
    https://qiita.com/deaikei/items/df3626486986566cb65c 

IPW stands for 'Inverse Probability Weighting'
"""

"\nreference: \n    https://qiita.com/deaikei/items/df3626486986566cb65c \n\nIPW stands for 'Inverse Probability Weighting'\n"

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Generate data

In [5]:
age = ["{}代".format(i*10) for i in range(1, 8)]
gender = ["女性", "男性"]

In [19]:
def generate_users(n_users, **features):
    """make user list where columns are 'user_id' and feature columns ('age', 'gender',... etc)
       values of the feature columns are randomly chosen from the given list
    """
    user_id = [i + 1 for i in range(n_users)]
    df = pd.DataFrame({"user_id":user_id})

    np.random.seed(100)
    for col in features:
        feature = features[col]
        values = np.random.choice(feature, n_users)
        df[col] = values

    return df


def add_flag(df, **targets):
    """add flag column named as 'x_flag'
       bias is added to the users specified in the variables '**targets'
    """
    n_users = df.shape[0]

    # specify users to whom bias is added
    is_target = [True for _ in range(n_users)]
    for col in targets:
        target_values = targets[col]
        is_value = [False for _ in range(n_users)]    
        for value in target_values:
            tmp_bool = (df[col] == value)
            is_value = np.logical_or(is_value, tmp_bool)
        is_target = np.logical_and(is_target, is_value)

    #assign flag
    flag = [0, 1]
    #for ordinary users, 10% of them are assigned to 'flag=1'
    weight = [0.9, 0.1]
    np.random.seed(200)
    df["x_flag"] = np.random.choice(flag, n_users, p=weight)
    #for users specified above, 90% of them are assigned to 'flag=1'  
    weight = [0.1, 0.9]
    np.random.seed(300)
    df.loc[is_target, "x_flag"] = np.random.choice(flag, n_users, p=weight)

    return df


def add_rate(df, **targets):
    """add real valued column named as 'x_rate', where the values are normalized between 0 and 1
       for the users with 'flag=1', values in 'x_rate' are increased
       for the users specified by '**targets', values in 'x_rate' are much higher, regardless of 'flag=1 or 0'
    """
    n_users = df.shape[0]

    #specify users to whom high 'x_rate' values are set
    is_target = [True for _ in range(n_users)]
    for col in targets:
        target_values = targets[col]
        is_value = [False for _ in range(n_users)]    
        for value in target_values:
            tmp_bool = (df[col] == value)
            is_value = np.logical_or(is_value, tmp_bool)
        is_target = np.logical_and(is_target, is_value)

    #for users with 'flag=0', 'x_rate' values are sampled from Normal(0.2, 0.1)
    np.random.seed(200)
    df["x_rate"] = np.random.normal(0.2, 0.1, n_users)

    #for users with 'flag=1', 'x_rate' values are increased by 0.2 point
    is_flag = (df["x_flag"] == 1)
    df.loc[is_flag, "x_rate"] += 0.2

    #for users specified by '**targets', 'x_rate' values are sampled from Normal(0.8, 0.2)
    np.random.seed(300)
    df.loc[is_target, "x_rate"] = np.random.normal(0.8, 0.2, n_users)

    #for convenience normalize to [0 ,1]
    df.loc[df["x_rate"] > 1, "x_rate"] = 1    
    df.loc[df["x_rate"] < 0, "x_rate"] = 0

    return df

In [30]:
n_users = 100000
df = generate_users(n_users, age=age, gender=gender)
df.shape[0]

100000

In [32]:
len([True for _ in range(n_users)])

100000

In [33]:
df

Unnamed: 0,user_id,age,gender
0,1,10代,男性
1,2,10代,女性
2,3,40代,女性
3,4,10代,男性
4,5,30代,男性
5,6,70代,男性
6,7,50代,男性
7,8,30代,女性
8,9,60代,女性
9,10,30代,女性


In [34]:
flag = [0, 1]
weight = [0.9, 0.1]
df["test"] = np.random.choice(flag, n_users, p=weight)
df

Unnamed: 0,user_id,age,gender,test
0,1,10代,男性,0
1,2,10代,女性,0
2,3,40代,女性,1
3,4,10代,男性,1
4,5,30代,男性,0
5,6,70代,男性,1
6,7,50代,男性,0
7,8,30代,女性,0
8,9,60代,女性,0
9,10,30代,女性,0


In [25]:
df = add_flag(df, age=["20代", "30代"], gender=["男性"])

ValueError: shape mismatch: value array of shape (100000,) could not be broadcast to indexing result of shape (14182,)

In [26]:
df = add_rate(df, age=["20代", "30代"], gender=["男性"])

ValueError: shape mismatch: value array of shape (100000,) could not be broadcast to indexing result of shape (14182,)