In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
# Input configuration: directory holding the SocialTalk export (relative path),
# the profile table to read from it, and the field delimiter used by that file.
PATH = "../data/SocialTalk"
TABLE = "Apres-profiles-SocialTalk.csv"
SEPARATOR = ","

In [13]:
def convert(x):
    """Coerce ``x`` to ``float``, falling back to NaN when it cannot be converted.

    Args:
        x: Any value accepted by ``float()`` (number, numeric string, ...).

    Returns:
        ``float(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return float(x)
    # Only the failures float() itself reports for bad input are treated as
    # "not a number". A bare ``except`` would also swallow KeyboardInterrupt
    # and SystemExit, making a long-running ``apply`` impossible to interrupt.
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [19]:
# Read the profile table, keep only its first 32 columns, index the rows by
# 'Username', and drop the 'Name', 'Account' and 'Birthdate' columns.
df = (
    pd.read_csv(f'{PATH}/{TABLE}', sep=SEPARATOR, low_memory=False)
    .iloc[:, :32]
    .set_index('Username')
    .drop(columns=['Name', 'Account', 'Birthdate'])
)

In [20]:
# Columns that get the log + z-score treatment below (named "log-normal"
# by the author; the distributional assumption is not checked here).
log_normal_columns = [
    'Followers',
    'Posts',
    'Engagement',
    'Estimated reach',
    'Estimated impressions',
    'Avg. likes per post',
    'Avg. engagement per post',
    'Avg. comments per post',
    'Avg. views per video',
    'Cost per post (MIN)',
    'Cost per post (MAX)',
    'Post CPE (MIN)',
    'Post CPE (MAX)',
    'Cost per story (MIN)',
    'Cost per story (MAX)',
    'Story CPE (MIN)',
    'Story CPE (MAX)'
]


def _log_zscore(values):
    """Return a float copy of ``values`` with positive entries log-standardised.

    Strictly positive entries are replaced by the z-score of their natural
    log, where mean and (sample) standard deviation are computed over the
    positive entries only. Entries equal to zero become NaN. Negative and NaN
    entries are passed through unchanged.

    Args:
        values: Numeric ``pd.Series``.

    Returns:
        A new float ``pd.Series`` with the same index as ``values``.
    """
    # Work on a float copy so the z-scores can be written back without
    # relying on an implicit dtype upcast when the column is stored as ints.
    values = values.astype(float)

    # Both masks are taken from the raw data *before* anything is overwritten.
    # Evaluating ``== 0`` after the write-back would also match any z-score
    # that happens to be exactly 0.0 and wrongly blank out a valid value.
    is_positive = values > 0
    is_zero = values == 0

    logged = np.log(values[is_positive])
    values.loc[is_positive] = (logged - logged.mean()) / logged.std()
    values.loc[is_zero] = np.nan
    return values


for current_column in log_normal_columns:
    df[current_column] = _log_zscore(df[current_column])

In [21]:
def post_cpm_min(x):
    """Map a numeric value onto a four-level label using cut-offs 24, 25 and 26.

    Returns NaN for NaN input; otherwise 'very low' (x < 24), 'low' (x < 25),
    'high' (x < 26) or 'very high' (x >= 26).
    """
    if np.isnan(x):
        return np.nan
    if x < 24:
        return 'very low'
    if x < 25:
        return 'low'
    if x < 26:
        return 'high'
    return 'very high'

def post_cpm_max(x):
    """Map a numeric value onto a four-level label using cut-offs 29, 31 and 34.

    Returns NaN for NaN input; otherwise the label of the first exclusive
    upper bound that ``x`` falls below, or 'very high' when it is >= 34.
    """
    if np.isnan(x):
        return np.nan

    # (exclusive upper bound, label), checked in ascending order.
    cutoffs = ((29, 'low'), (31, 'moderate'), (34, 'high'))
    for upper_bound, label in cutoffs:
        if x < upper_bound:
            return label
    return 'very high'

def story_cpm_min(x):
    """Map a numeric value onto a three-level label using cut-offs 10 and 20.

    Returns NaN for NaN input; otherwise 'low' (x < 10), 'moderate' (x < 20)
    or 'high' (x >= 20).
    """
    if np.isnan(x):
        return np.nan
    if x < 10:
        return 'low'
    return 'moderate' if x < 20 else 'high'

def story_cpm_max(x):
    """Map a numeric value onto a four-level label using cut-offs 10, 25 and 33.

    Returns NaN for NaN input; otherwise 'low' (x < 10), 'moderate' (x < 25),
    'high' (x < 33) or 'very high' (x >= 33).
    """
    if np.isnan(x):
        return np.nan

    # (exclusive upper bound, label), checked in ascending order.
    for upper_bound, label in ((10, 'low'), (25, 'moderate'), (33, 'high')):
        if x < upper_bound:
            return label
    return 'very high'

def post_story_cpe_minmax(x):
    """Split a numeric value into 'low' (x < 1) or 'high' (x >= 1).

    NaN input is passed through as NaN.
    """
    if np.isnan(x):
        return np.nan
    return 'low' if x < 1 else 'high'

def cost_per_story_minmax(x):
    """Map a numeric value onto a three-level label using cut-offs 3 and 6.

    Returns NaN for NaN input; otherwise 'low' (x < 3), 'moderate' (x < 6)
    or 'high' (x >= 6).
    """
    if np.isnan(x):
        return np.nan
    if x < 3:
        return 'low'
    if x < 6:
        return 'moderate'
    return 'high'

def following_counts(x):
    """Map a numeric value onto a five-level label using cut-offs 500/1000/2000/5000.

    Returns NaN for NaN input; otherwise the label of the first exclusive
    upper bound that ``x`` falls below, or 'very high' when it is >= 5000.
    """
    if np.isnan(x):
        return np.nan

    # (exclusive upper bound, label), checked in ascending order.
    cutoffs = (
        (500, 'very low'),
        (1000, 'low'),
        (2000, 'moderate'),
        (5000, 'high'),
    )
    for upper_bound, label in cutoffs:
        if x < upper_bound:
            return label
    return 'very high'

def avg_posts_per_period(x):
    """Map a numeric value onto a three-level label using cut-offs 5 and 20.

    Returns NaN for NaN input; otherwise 'low' (x < 5), 'moderate' (x < 20)
    or 'high' (x >= 20).
    """
    if np.isnan(x):
        return np.nan
    if x < 5:
        return 'low'
    return 'moderate' if x < 20 else 'high'

def bucketed_norm(x):
    """Bucket a standardised value into five labels with symmetric cut-offs.

    Args:
        x: Numeric scalar (NaN allowed).

    Returns:
        NaN for NaN input; otherwise 'very low' (x < -2), 'low' (x < -1),
        'moderate' (x < 1), 'high' (x < 2) or 'very high' (x >= 2).
    """
    if np.isnan(x):
        result = np.nan
    elif x < -2:
        result = 'very low'
    elif x < -1:
        result = 'low'
    elif x < 1:
        result = 'moderate'
    # Upper bound is 2, mirroring the -2 / -1 cut-offs on the low side. The
    # previous duplicate ``x < 1`` test made this branch unreachable, so
    # everything in [1, 2) was labelled 'very high'.
    elif x < 2:
        result = 'high'
    else:
        result = 'very high'

    return result

In [22]:
# Column name -> bucketing function used to turn that column into labels.
transforms = {
    # Columns bucketed with the generic five-level `bucketed_norm`.
    **dict.fromkeys(
        [
            'Followers',
            'Posts',
            'Engagement',
            'Estimated reach',
            'Estimated impressions',
            'Avg. likes per post',
            'Avg. engagement per post',
            'Avg. comments per post',
            'Avg. views per video',
            'Cost per post (MIN)',
            'Cost per post (MAX)',
        ],
        bucketed_norm,
    ),
    # Columns with a dedicated threshold function.
    'Following': following_counts,
    'Avg. posts per week': avg_posts_per_period,
    'Avg. posts per month': avg_posts_per_period,
    'Post CPM (MIN)': post_cpm_min,
    'Post CPM (MAX)': post_cpm_max,
    # NOTE(review): the CPE and 'Cost per story' columns are also listed in
    # `log_normal_columns`, so they are z-scored before these fixed
    # thresholds are applied -- confirm the cut-offs are meant for that scale.
    'Post CPE (MIN)': post_story_cpe_minmax,
    'Post CPE (MAX)': post_story_cpe_minmax,
    'Cost per story (MIN)': cost_per_story_minmax,
    'Cost per story (MAX)': cost_per_story_minmax,
    'Story CPM (MIN)': story_cpm_min,
    'Story CPM (MAX)': story_cpm_max,
    'Story CPE (MIN)': post_story_cpe_minmax,
    'Story CPE (MAX)': post_story_cpe_minmax,
}

In [23]:
# Replace every mapped column with its bucket labels, one value at a time.
for column in transforms:
    transform_func = transforms[column]
    df[column] = df[column].apply(transform_func)

In [24]:
# Write the bucketed table to the prediction directory, with the former index
# exposed as a leading 'user_id' column.
# NOTE: this rebinds PATH, so re-running the loading cell afterwards would
# look for the input table inside the predict directory.
PATH = '../data/SocialTalk/predict'
x = df.reset_index()
x.columns = ['user_id', *x.columns[1:]]
x.to_csv(f'{PATH}/categorical_buckets.csv', index=False)


In [10]:
# # WHAT IS THE HEURISTIC HERE (for turning "noisy" numerics into categoricals for graph representation)
# - if the dataset has any "categorical in numeric" values (e.g. zero, NaN, etc), treat these separately
# - for the numeric values, check whether the distribution is more like exponential or normal (log likelihood)
# - if the distribution is more like exponential, apply the log transform (this can also be done with box-cox)
# - if the distribution is unimodal, split the distribution by percentiles into categories (e.g. low, medium, high)
# - if the distribution is multi-modal, use kernel density estimation to identify the modes, then create splits in between them (assuming gaussians)
# NOTE: in some cases, clients may ONLY have numeric data.... this does not allow the graph to create enough relationships to "beat" traditional machine learning
# NOTE: the approach enables us to transform numeric data into categorical (in order to establish these relationships, when appropriate)

# log columns: 4, 5, 8, 9
# big-bucket columns: 0, 1, 6, 7
# category-in-number columns: 4, 5

# column = columns_of_interest[8]
# is_rounded = False
# is_log = False
# data = df[df[column] > 0][column]

# if is_log:
#     data = data.apply(np.log)

# if is_rounded:
#     data = np.round(data, 5)

# data.hist(bins=100)
# plt.show()

# data.value_counts().reset_index(drop=True).plot()
# plt.show()