In [None]:
import json
from datetime import datetime
import kragle as kg
import sys
import pandas as pd
import math
import numpy as np
import scipy.stats as st
import statsmodels.stats.api as sms
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.ticker as ticker

In [None]:
# SQL that selects profile_id followed by one column per content vertical
# from the pivot table. profile_id is listed first on purpose.
LOAD_DATA_QUERY = """
    SELECT
        profile_id,
        action,
        affliction__adversity,
        alternative_comedy,
        animated_series,
        anime,
        art__music,
        bollywood,
        business__finance,
        christian_docs,
        comedy, 
        docuseries,
        drama,
        english_standup,
        family_animation,
        family_film,
        foreignlanguage_mixed,
        formats__food,
        genre,
        history,
        horror,
        indian_regional_language,
        kids_live_action,
        kids_superhero,
        lgbtq,
        military,
        mysterythrillers,
        nonenglish_standup,
        other,
        other_kids,
        politics__power,
        preschool,
        romance,
        scifi,
        self_improvement,
        soaps,
        sports,
        the_natural_world,
        thriller
    FROM
        rmatai.profile_viewing_pivot_FR_new3
"""

# The query takes no parameters, so the script handed to the job is the
# template itself.
load_data_query = LOAD_DATA_QUERY

# Echo the exact SQL that will be submitted.
print(load_data_query)

In [None]:
# Submit the query as a Spark SQL job via kragle's genie client, then fetch
# the result through `.pandas()` (presumably a pandas DataFrame -- the later
# cells use it as one).
all_data = (
    kg.genie.SparkSqlJob()
    .script(load_data_query)
    .version('2.3.2')
    .job_name('StreamingBehavior')
    .execute()
    .pandas()
)
print('Extraction complete.')

In [None]:
# Preview the first five rows of the extracted data.
all_data.head(n=5)

In [None]:
# Column names in the positional order of the SELECT list.
# profile_id should always be at index 0 for future logic to apply.
RESULT_COLUMN_NAMES = [
    'profile_id',
    'action',
    'affliction__adversity',
    'alternative_comedy',
    'animated_series',
    'anime',
    'art__music',
    'bollywood',
    'business__finance',
    'christian_docs',
    'comedy',
    'docuseries',
    'drama',
    'english_standup',
    'family_animation',
    'family_film',
    'foreignlanguage_mixed',
    'formats__food',
    'genre',
    'history',
    'horror',
    'indian_regional_language',
    'kids_live_action',
    'kids_superhero',
    'lgbtq',
    'military',
    'mysterythrillers',
    'nonenglish_standup',
    'other',
    'other_kids',
    'politics__power',
    'preschool',
    'romance',
    'scifi',
    'self_improvement',
    'soaps',
    'sports',
    'the_natural_world',
    'thriller',
]

# Map positional column labels (0, 1, 2, ...) to their names. The mapping is
# derived with enumerate so the indices cannot drift out of sync with the
# list, unlike a hand-numbered dict. Labels missing from the mapping are left
# untouched by rename, so this is a no-op when the frame already carries
# named columns.
# Rebinding the result (instead of inplace=True) leaves the previously
# displayed frame object unmodified and keeps the cell safe to re-run.
all_data = all_data.rename(columns=dict(enumerate(RESULT_COLUMN_NAMES)))

In [None]:
# Quick look at the first two rows after the column rename.
all_data.head(n=2)

In [None]:
# Keep only the per-vertical columns: everything except the profile_id key.
ALL_DATA_W_O_PROFILE_ID = all_data.drop(columns=['profile_id'])

# The remaining column labels are the candidate vertical names.
VERTICAL_LABELS = ALL_DATA_W_O_PROFILE_ID.columns
print(VERTICAL_LABELS)

In [None]:
# For every row, take the label of the column holding the largest value
# (missing values are skipped), then convert the result to a NumPy array.
most_watched_vertical_label = (
    ALL_DATA_W_O_PROFILE_ID
    .idxmax(axis='columns', skipna=True)
    .to_numpy()
)
print(most_watched_vertical_label[:5])

In [None]:
# Extract the profile_id column as a plain array, in row order.
all_profile_ids = all_data.loc[:, 'profile_id'].values
print(all_profile_ids[:5])

In [None]:
# Build a two-column array pairing each profile id (column 0) with its
# most-watched vertical label (column 1). Both inputs are reshaped into
# single-column 2-D arrays so they can be joined side by side.
write_to_file = np.concatenate(
    (
        all_profile_ids.reshape(-1, 1),
        most_watched_vertical_label.reshape(-1, 1),
    ),
    axis=1,
)
# Preview the first five rows. The slice previously started at 1, which
# silently skipped the first row, unlike every other [0:5] preview in this
# notebook.
print(write_to_file[0:5])

In [None]:
# Persist the pairs to labels.txt, one row per line formatted as
# "<profile_id> <label>" (integer id, a space, then the label string).
np.savetxt(fname='labels.txt', X=write_to_file, fmt='%d %s')