In [9]:
from google.cloud import bigquery, storage
from datetime import datetime, timedelta
import pathlib
from sklearn.model_selection import train_test_split
import os
import shutil
from random import randint
import pandas as pd
from google.cloud.exceptions import NotFound
from utils.download_from_GCP import download_table_to_local_as_one_file
from utils.read_sql_as_string import readSqlFile
from utils.simple_logging import get_standard_logger
import ipynbname

In [10]:
nb_name = ipynbname.name()
logger = get_standard_logger(nb_name, file_path=f"logs/{nb_name}", 
                             overwrite_file=True, stream=True)
log_print = logger.info
log_print("------------ Fresh Run -------------")

------------ Fresh Run -------------


level set


In [11]:
client = bigquery.Client(project="sharechat-production")



In [12]:
LANGS = [
    "Hindi",
    # "Tamil",
    # "Telugu",
#     "Kannada",
    "Punjabi",
#     "Odia",
#     "Bengali",
#     "Marathi",
#     "Malayalam",
#     "Gujarati",
]
# DAYS_OF_DATA_CONSIDERED = 7
TRAINING_DAYS = 30
TESTING_DAYS = 3
rating_def_dict = {
#         "vplay": "is_vp_succ",
        "like": "is_like",
#         "share": "is_share",
#         "fav": "is_fav",
#         "vplay_skip": "is_vp_skip",
        "vplay2": "is_vp_succ2",
    }
BASE_BIG_QUERY_PATH = "maximal-furnace-783.rohitrr"
RANDOM_SEED = 9745
TEST_DATA_FILE_NAME = "test.txt"
TRAIN_DATA_FILE_NAME = "train.txt"
USER_CONTEXT = "price" #"location"
DTYPE="video"
QUERY_BASE_FOLDER_PATH = f"./queries/{DTYPE}/"
OVERWRITE_BASE_TABLE=True

In [13]:
def delete_tables(delete_tables_path_list):
    for delete_table_path in delete_tables_path_list:
        client.delete_table(delete_table_path)

    log_print("All tables deleted")
    
def construct_base_table(lang, common_posts_end_time, 
                         common_posts_days, end_time, days,
                         overwrite_base_table = False, 
                         mode = "train"):
    
    temp_q0_table_path = BASE_BIG_QUERY_PATH+'.'+\
    f'{DTYPE}_{mode}_temp_q0_table_{lang}_{common_end_time.date()}_{TRAINING_DAYS}'
    
    if(not overwrite_base_table):
        try:
            client.get_table(temp_q0_table_path)
            log_print(f"Table-{temp_q0_table_path} already exists, not overwriting")
            return temp_q0_table_path
        except NotFound:
            log_print(f"Table-{temp_q0_table_path} not already present - going ahead creating it")
            
    log_print(f"Running query 0 for {lang} .....")
    job_config = bigquery.QueryJobConfig(destination= temp_q0_table_path,
                                         write_disposition = "WRITE_TRUNCATE"
                                         )
    sql = readSqlFile(f"{QUERY_BASE_FOLDER_PATH}/query0.sql", lang = lang,
                      common_posts_end_time = common_posts_end_time,
                      common_posts_days = common_posts_days,
                      end_time=end_time, days = days)
    query_job = client.query(sql, job_config=job_config)
    query_job.result()
    log_print(f"Query 0 results loaded to the table {temp_q0_table_path}")
    return temp_q0_table_path

In [14]:
def collect_and_prepare_train_data_with_base_table(lang, rating_def, user_context, base_q0_table_path,
                                             save_path):
    delete_tables = []
    # Run Q1 query
    temp_q0_table_path = base_q0_table_path
    temp_q1_table_path = BASE_BIG_QUERY_PATH+'.'+f'{USER_CONTEXT}_{DTYPE}_train_temp_q1_table_{lang}_{rating_def}'
    job_config = bigquery.QueryJobConfig(destination= temp_q1_table_path,
                                         write_disposition = "WRITE_TRUNCATE"
                                         )
    sql = readSqlFile(f"{QUERY_BASE_FOLDER_PATH}/{user_context}/query1.sql", lang = lang, 
                      rating_def = rating_def,
                     q0_table = temp_q0_table_path)
    query_job = client.query(sql, job_config=job_config)
    query_job.result()
    log_print(f"Query 1 results loaded to the table {temp_q1_table_path}")
    train_table_with_valid_user_post_ids = temp_q1_table_path
    delete_tables.append(temp_q1_table_path)

    # Run Q2 query
    table_with_mapping = BASE_BIG_QUERY_PATH+'.'+f'{USER_CONTEXT}_{DTYPE}_train_temp_q2_table_{lang}_{rating_def}'
    job_config = bigquery.QueryJobConfig(destination= table_with_mapping, 
                                         write_disposition = "WRITE_TRUNCATE")
    sql = readSqlFile(f"{QUERY_BASE_FOLDER_PATH}/{user_context}/query2.sql", lang = lang, 
                      rating_def = rating_def,
                     q1_table = temp_q1_table_path)
    query_job = client.query(sql, job_config=job_config)
    query_job.result()
    log_print(f"Query 2 results loaded to the table {table_with_mapping}")
    download_table_to_local_as_one_file(table_with_mapping, save_path, 
                                out_file_name = f"user_post_ffm_mapping.csv")
    delete_tables.append(table_with_mapping)

    # Run Q3 query
    temp_q3_table_path = BASE_BIG_QUERY_PATH+'.'+f'{USER_CONTEXT}_{DTYPE}_train_temp_q3_table_{lang}_{rating_def}'
    job_config = bigquery.QueryJobConfig(destination= temp_q3_table_path,
                                         write_disposition = "WRITE_TRUNCATE"
                                         )
    sql = readSqlFile(f"{QUERY_BASE_FOLDER_PATH}/{user_context}/query3.sql", lang = lang, 
                      rating_def = rating_def,
                      q1_table = temp_q1_table_path,
                     q2_table = table_with_mapping)
    query_job = client.query(sql, job_config=job_config)
    query_job.result()
    log_print(f"Query 3 results loaded to the table {temp_q3_table_path}")
    delete_tables.append(temp_q3_table_path)

#     Save results to local storage
    download_table_to_local_as_one_file(temp_q3_table_path, save_path, with_header=False,
                                        out_file_name = f"{TRAIN_DATA_FILE_NAME}")
    return delete_tables, train_table_with_valid_user_post_ids, table_with_mapping

def collect_and_prepare_test_data_with_base_table(lang, rating_def, user_context, base_q0_table_path, 
                                             table_with_valid_user_post_ids,
                                            table_with_mapping,
                                             save_path):
    delete_tables = []
    # Run Q1 query
    temp_q0_table_path = base_q0_table_path
    temp_q1_table_path = BASE_BIG_QUERY_PATH+'.'+ \
    f'{USER_CONTEXT}_{DTYPE}_test_temp_q1_table_{lang}_{rating_def}'
    job_config = bigquery.QueryJobConfig(destination= temp_q1_table_path,
                                         write_disposition = "WRITE_TRUNCATE"
                                         )
    # Changed query from the train case
    sql = readSqlFile(f"{QUERY_BASE_FOLDER_PATH}/{user_context}/test_query1.sql", lang = lang, 
                      rating_def = rating_def,
                     q0_table = temp_q0_table_path,
                     train_q1_table = table_with_valid_user_post_ids)
    query_job = client.query(sql, job_config=job_config)
    query_job.result()
    log_print(f"Query 1 results loaded to the table {temp_q1_table_path}")
    delete_tables.append(temp_q1_table_path)
    
    # Run Q3 query
    temp_q3_table_path = BASE_BIG_QUERY_PATH+'.'+f'{USER_CONTEXT}_{DTYPE}_test_temp_q3_table_{lang}_{rating_def}'
    job_config = bigquery.QueryJobConfig(destination= temp_q3_table_path,
                                         write_disposition = "WRITE_TRUNCATE"
                                         )
    sql = readSqlFile(f"{QUERY_BASE_FOLDER_PATH}/{user_context}/query3.sql", lang = lang, 
                      rating_def = rating_def,
                      q1_table = temp_q1_table_path,
                     q2_table = table_with_mapping)
    query_job = client.query(sql, job_config=job_config)
    query_job.result()
    log_print(f"Query 3 results loaded to the table {temp_q3_table_path}")
    delete_tables.append(temp_q3_table_path)

#     Save results to local storage
    download_table_to_local_as_one_file(temp_q3_table_path, save_path, with_header=False,
                                        out_file_name = f"{TEST_DATA_FILE_NAME}")
    return delete_tables

In [15]:
common_end_time = datetime(2021, 4, 30) # the hours, minutes and seconds are taken to be 0
test_end_time = common_end_time
train_end_time = common_end_time - timedelta(TESTING_DAYS)
log_print(test_end_time)
log_print(train_end_time)

2021-04-30 00:00:00
2021-04-27 00:00:00


In [None]:
%%time
try:
    for lang in LANGS:
        base_q0_train_table_path = construct_base_table(lang, train_end_time, TRAINING_DAYS, 
                                                        train_end_time, TRAINING_DAYS,
                                                 overwrite_base_table=OVERWRITE_BASE_TABLE, mode="train")

        base_q0_test_table_path = construct_base_table(lang, train_end_time, TRAINING_DAYS, 
                                                 test_end_time, TESTING_DAYS,
                                                 overwrite_base_table=OVERWRITE_BASE_TABLE, mode="test")

        for key, rating_def in rating_def_dict.items():
            save_path = f"./train_test_data_models/{USER_CONTEXT}/{DTYPE}/{lang}/{rating_def}"
            train_delete_table_paths, table_with_valid_user_post_ids, train_table_with_mapping = \
            collect_and_prepare_train_data_with_base_table(lang, rating_def, USER_CONTEXT,
                                     base_q0_train_table_path, save_path)

            test_delete_table_paths = \
            collect_and_prepare_test_data_with_base_table(lang, rating_def, USER_CONTEXT,
                                     base_q0_test_table_path,
                                     table_with_valid_user_post_ids,
                                     train_table_with_mapping,
                                     save_path
                                     )
    #         Delete all created tables
    #         delete_tables(
    #             train_delete_table_paths+test_delete_table_paths
    #         )
    #         Train using xlearn binary
            log_print(f"Training started for label {rating_def} in {lang} .......")
            model_output_path = os.path.join(save_path, "out")
            pathlib.Path(model_output_path).mkdir(parents = True, exist_ok = True)
            cmd = f"./xlearn_train {save_path}/{TRAIN_DATA_FILE_NAME} \
            -v {save_path}/{TEST_DATA_FILE_NAME} -x auc -s 2 -k 32 -m {model_output_path}/model.out \
            -t {model_output_path}/model.txt -b 0.001 --disk 2>&1 | tee \
            {model_output_path}/logs"
            os.system(cmd)
            log_print(f"Model trained and saved in {model_output_path}")
except Exception as e:
    logger.error(e)

Running query 0 for Hindi .....
Query 0 results loaded to the table maximal-furnace-783.rohitrr.video_train_temp_q0_table_Hindi_2021-04-30_30
Running query 0 for Hindi .....
Query 0 results loaded to the table maximal-furnace-783.rohitrr.video_test_temp_q0_table_Hindi_2021-04-30_30
Query 1 results loaded to the table maximal-furnace-783.rohitrr.price_video_train_temp_q1_table_Hindi_is_like
Query 2 results loaded to the table maximal-furnace-783.rohitrr.price_video_train_temp_q2_table_Hindi_is_like


Downloading table - maximal-furnace-783.rohitrr.price_video_train_temp_q2_table_Hindi_is_like to gcs




Exported maximal-furnace-783:rohitrr.price_video_train_temp_q2_table_Hindi_is_like to gs://query_runner_results/price_video_train_temp_q2_table_Hindi_is_like_2021-06-18_06:49:44_158/*.csv
Downloading from gcs_folder_name price_video_train_temp_q2_table_Hindi_is_like_2021-06-18_06:49:44_158 to local




Contents in gs://query_runner_results/price_video_train_temp_q2_table_Hindi_is_like_2021-06-18_06:49:44_158     transferred to ./train_test_data_models/price/video/Hindi/is_like/temp_download_folder
Merging and saving files from ./train_test_data_models/price/video/Hindi/is_like/temp_download_folder to ./train_test_data_models/price/video/Hindi/is_like
Saved file user_post_ffm_mapping.csv in ./train_test_data_models/price/video/Hindi/is_like


Query 3 results loaded to the table maximal-furnace-783.rohitrr.price_video_train_temp_q3_table_Hindi_is_like


Downloading table - maximal-furnace-783.rohitrr.price_video_train_temp_q3_table_Hindi_is_like to gcs




Exported maximal-furnace-783:rohitrr.price_video_train_temp_q3_table_Hindi_is_like to gs://query_runner_results/price_video_train_temp_q3_table_Hindi_is_like_2021-06-18_07:11:16_77/*.csv
Downloading from gcs_folder_name price_video_train_temp_q3_table_Hindi_is_like_2021-06-18_07:11:16_77 to local




Contents in gs://query_runner_results/price_video_train_temp_q3_table_Hindi_is_like_2021-06-18_07:11:16_77     transferred to ./train_test_data_models/price/video/Hindi/is_like/temp_download_folder
Merging and saving files from ./train_test_data_models/price/video/Hindi/is_like/temp_download_folder to ./train_test_data_models/price/video/Hindi/is_like
Saved file train.txt in ./train_test_data_models/price/video/Hindi/is_like


Query 1 results loaded to the table maximal-furnace-783.rohitrr.price_video_test_temp_q1_table_Hindi_is_like
Query 3 results loaded to the table maximal-furnace-783.rohitrr.price_video_test_temp_q3_table_Hindi_is_like


Downloading table - maximal-furnace-783.rohitrr.price_video_test_temp_q3_table_Hindi_is_like to gcs




Exported maximal-furnace-783:rohitrr.price_video_test_temp_q3_table_Hindi_is_like to gs://query_runner_results/price_video_test_temp_q3_table_Hindi_is_like_2021-06-18_08:37:51_67/*.csv
Downloading from gcs_folder_name price_video_test_temp_q3_table_Hindi_is_like_2021-06-18_08:37:51_67 to local




Contents in gs://query_runner_results/price_video_test_temp_q3_table_Hindi_is_like_2021-06-18_08:37:51_67     transferred to ./train_test_data_models/price/video/Hindi/is_like/temp_download_folder
Merging and saving files from ./train_test_data_models/price/video/Hindi/is_like/temp_download_folder to ./train_test_data_models/price/video/Hindi/is_like
Saved file test.txt in ./train_test_data_models/price/video/Hindi/is_like


Training started for label is_like in Hindi .......
Model trained and saved in ./train_test_data_models/price/video/Hindi/is_like/out
Query 1 results loaded to the table maximal-furnace-783.rohitrr.price_video_train_temp_q1_table_Hindi_is_vp_succ2
Query 2 results loaded to the table maximal-furnace-783.rohitrr.price_video_train_temp_q2_table_Hindi_is_vp_succ2


Downloading table - maximal-furnace-783.rohitrr.price_video_train_temp_q2_table_Hindi_is_vp_succ2 to gcs




Exported maximal-furnace-783:rohitrr.price_video_train_temp_q2_table_Hindi_is_vp_succ2 to gs://query_runner_results/price_video_train_temp_q2_table_Hindi_is_vp_succ2_2021-06-18_14:24:43_26/*.csv
Downloading from gcs_folder_name price_video_train_temp_q2_table_Hindi_is_vp_succ2_2021-06-18_14:24:43_26 to local




Contents in gs://query_runner_results/price_video_train_temp_q2_table_Hindi_is_vp_succ2_2021-06-18_14:24:43_26     transferred to ./train_test_data_models/price/video/Hindi/is_vp_succ2/temp_download_folder
Merging and saving files from ./train_test_data_models/price/video/Hindi/is_vp_succ2/temp_download_folder to ./train_test_data_models/price/video/Hindi/is_vp_succ2
Saved file user_post_ffm_mapping.csv in ./train_test_data_models/price/video/Hindi/is_vp_succ2


Query 3 results loaded to the table maximal-furnace-783.rohitrr.price_video_train_temp_q3_table_Hindi_is_vp_succ2


Downloading table - maximal-furnace-783.rohitrr.price_video_train_temp_q3_table_Hindi_is_vp_succ2 to gcs




Exported maximal-furnace-783:rohitrr.price_video_train_temp_q3_table_Hindi_is_vp_succ2 to gs://query_runner_results/price_video_train_temp_q3_table_Hindi_is_vp_succ2_2021-06-18_14:57:38_26/*.csv
Downloading from gcs_folder_name price_video_train_temp_q3_table_Hindi_is_vp_succ2_2021-06-18_14:57:38_26 to local




In [19]:
logger.error("check error")

check error


In [27]:
model_output_path = os.path.join(save_path, "out")
pathlib.Path(model_output_path).mkdir(parents = True, exist_ok = True)
cmd = f"./xlearn_train {save_path}/{TRAIN_DATA_FILE_NAME} \
-v {save_path}/{TRAIN_DATA_FILE_NAME} -x auc -s 2 -k 32 -m {model_output_path}/model.out \
-t {model_output_path}/model.txt -b 0.001 --disk 2>&1 | tee \
{model_output_path}/logs"
os.system(cmd)
log_print(f"Model trained and saved in {model_output_path}")

NameError: name 'save_path' is not defined

### Scrap code

In [21]:
sql = readSqlFile("./queries/video/query0.sql", 
                  lang = "Odia", rating_def = rating_def, 
                  end_time=end_time)

In [32]:
rating_def = "is_vp_succ"

In [34]:
log_print(f"Training started for label {rating_def} in {lang} .......")
model_output_path = os.path.join(save_path, "out")
pathlib.Path(model_output_path).mkdir(parents = True, exist_ok = True)
cmd = f"./xlearn_train {save_path}/{TRAIN_DATA_FILE_NAME} \
-v {save_path}/{TEST_DATA_FILE_NAME} -x auc -s 2 -k 32 -m {model_output_path}/model.out \
-t {model_output_path}/model.txt -b 0.001 --disk 2>&1 | tee \
{model_output_path}/logs"
os.system(cmd)
log_print(f"Model trained and saved in {model_output_path}")

Training started for label is_vp_succ in Bengali .......
Model trained and saved in ./train_test_data_models/Bengali/is_like/out


In [36]:
log_print(f"{save_path}/{TRAIN_DATA_FILE_NAME}")

./train_test_data_models/Bengali/is_like/train.txt


In [None]:
client.delete_table(temp_q1_table_path)

In [44]:
pathlib.Path(f"./train_test_data/{rating_def}/{lang}").mkdir(parents = True, exist_ok = True)

In [45]:
sql = f"""
SELECT * FROM `{temp_q3_table_path}`
"""
data_df = client.query(sql).to_dataframe()

In [52]:
train_df, test_df = train_test_split(data_df, test_size = 0.2, random_state=RANDOM_SEED)

In [53]:
save_path = f"./train_test_data/{rating_def}/{lang}"
pathlib.Path(save_path).mkdir(parents = True, exist_ok = True)
test_df.to_csv(os.path.join(save_path, "test.txt"), sep="\n", header = False, index=False)

In [59]:
! ./xlearn_train ./train_test_data/is_vp_succ2/Odia/train.txt -v ./train_test_data/is_vp_succ2/Odia/test.txt -x auc -s 2 -k 32 -m out/model.out -t out/model.txt -b 0.001 --disk 2>&1 | tee out/logs

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.44 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 64 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mNumber of Feature: 517707
[32m[------------] [0mNumber of Field: 2
[32m[------------] [0mTime cost for reading problem: 23.14 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 256.74 MB
[32m[------------] [0mTime cost for model initial: 0.34 (sec)
[32m[1m[ ACTION     ] Start to train ...[0m
[32m[------------][0m Epoch      Train log_loss       Test log_loss            Test AUC   

In [72]:
cmd = f"./xlearn_train {save_path}/train.txt \
-v {save_path}/test.txt -x auc -s 2 -k 32 -m {model_output_path}/model.out \
-t {model_output_path}/model.txt -b 0.001 --disk 2>&1 | tee \
{model_output_path}/logs"

In [63]:
model_output_path = os.path.join(save_path, "out")
pathlib.Path(model_output_path).mkdir(parents = True, exist_ok = True)

In [62]:
model_out_path

In [73]:
cmd

'./xlearn_train ./train_test_data/is_vp_succ2/Odia/train.txt -v ./train_test_data/is_vp_succ2/Odia/test.txt -x auc -s 2 -k 32 -m ./train_test_data/is_vp_succ2/Odia/out/model.out -t ./train_test_data/is_vp_succ2/Odia/out/model.txt -b 0.001 --disk 2>&1 | tee ./train_test_data/is_vp_succ2/Odia/out/logs'

In [74]:
os.system("touch check_file.txt")

0

In [43]:
train_delete_table_paths

['maximal-furnace-783.rohitrr.train_temp_q0_table_Odia_is_vp_succ2',
 'maximal-furnace-783.rohitrr.train_temp_q1_table_Odia_is_vp_succ2',
 'maximal-furnace-783.rohitrr.train_temp_q2_table_Odia_is_vp_succ2',
 'maximal-furnace-783.rohitrr.train_temp_q3_table_Odia_is_vp_succ2']

In [17]:
import csv

In [53]:
count = 10
rows = []

In [57]:
with open("./train_test_data_models/is_vp_succ2/Kannada/user_post_ffm_mapping.csv") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter = ",")
    i = 0
    for row in csv_reader:
        rows.append(row)
        i+=1
        if(i > count):
            break

In [58]:
rows[1]

['1_post_1000004482', '1']

In [36]:
f = open("./train_test_data_models/is_vp_succ2/Kannada/user_post_ffm_mapping.csv")

In [37]:
next(f)

'1_post_1000004482\t1\n'

In [1]:
raise Exception("check")

Exception: check

In [44]:
with open("./train_test_data_models/Bengali/is_vp_succ/test.txt") as f:
    lines = f.readlines()

In [45]:
count = 0
for l in lines:
    if(l[0] == "1"):
        count+=1
log_print(count/len(lines))

0.21455608238342266


In [11]:
sql = readSqlFile(f"{QUERY_FOLDER_PATH}/query0.sql", lang = "Hindi",
                  common_posts_end_time = "akahfk",
                  common_posts_days = "afkhd",
                  end_time="afljds", days = "ajkfdks")

AttributeError: 'str' object has no attribute 'strftime'

In [13]:
f"{QUERY_FOLDER_PAT}afsd"

NameError: name 'QUERY_FOLDER_PAT' is not defined

In [34]:
import ipynb_path

In [35]:
ipynb_path.get()

ValueError: Could not find the current session.

In [36]:
%%javascript
var nb = IPython.notebook;
var kernel = IPython.notebook.kernel;
var command = "NOTEBOOK_FULL_PATH = '" + nb.base_url + nb.notebook_path + "'";
kernel.execute(command);

<IPython.core.display.Javascript object>

In [1]:
import ipynbname
nb_fname = ipynbname.name()

In [2]:
nb_fname

'user_context_ffm_training_pipeline_small_languages'

In [3]:
from utils import simple_logging

In [4]:
nb_name = ipynbname.name()
logger = simple_logging.get_standard_logger(nb_name, file_path=f"logs/{nb_name}", overwrite_file=True, stream=True)

level set


In [5]:
logger.info("Test - 1")

Test - 1


In [9]:
log_print  = logger.info

In [10]:
log_print("5")

5


In [25]:
def check_func(l):
    try:
        if(l == 1):
            print(l, " returning")
            return l+1
    except Exception as e:
        print("Exception")
        return l-1
    print("After returning")
    return l+3

In [27]:
check_func(1)

1  returning


2