In [1]:
import numpy as np
from utils.download_from_GCP import download_table_to_local_as_one_file
import pandas as pd
import os
from sklearn.metrics import ndcg_score, roc_auc_score
from tqdm import tqdm
from multiprocessing import Pool
import multiprocessing as mp
import pathlib
import xlearn as xl

In [2]:
LANGS = [
#     "Punjabi",
    "Hindi",
    # "Tamil",
#     "Telugu",
#     "Kannada",
#     "Odia",
#     "Bengali",
#     "Marathi",
#     "Malayalam",
#     "Gujarati",
]
# DAYS_OF_DATA_CONSIDERED = 7
rating_def_dict = {
#         "vplay": "is_vp_succ",
# #         "share": "is_share",
# #         "fav": "is_fav",
        "like": "is_like",
        "vplay2": "is_vp_succ2",
#         "vplay_skip": "is_vp_skip",
#     "vclick": "is_vp_click"
}

# table_path = "maximal-furnace-783.rohitrr.test_temp_q1_table_Malayalam_is_vp_succ2"
# local_save_path = "./train_test_data_models/Malayalam/is_vp_succ2"
# out_file_name = "test_q1.csv"
min_pos_labels = 1
min_total_user_events = 1
num_users_to_consider = 20000
RANDOM_SEED = 9745
TEST_DATA_FILE_NAME = "test"
USER_CONTEXT = "location"
DTYPE="video"
results_path = f"results/user_bucketing_metrics_1.csv"
new_results_file = False

In [3]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

In [4]:
def predict_scores(test_file_path, test_model_path, predicted_output_folder_path, 
                   rating_def, lang, predicted_output_file_name = "predicted_output.txt"):
    pathlib.Path(predicted_output_folder_path).mkdir(parents = True, exist_ok = True)
    predicted_output_file_path = os.path.join(predicted_output_folder_path,
                                             predicted_output_file_name)
    res_out_path = os.path.join(predicted_output_folder_path, "results.csv")
    
    print(f"Predicting labels for {rating_def}-{lang} .... ")
    ffm_model = xl.create_ffm() 
    ffm_model.setTest(test_file_path)
    ffm_model.setSigmoid()
    ffm_model.predict(test_model_path, predicted_output_file_path)
    print(f"Predicted labels stored in {predicted_output_file_path}")

def get_test_df(test_file_path, predicted_results_path):
    print(f"Reading - {predicted_results_path}")
    with open(predicted_results_path) as f:
        lines = f.readlines()
        predicted_scores = [float(score.replace('\n','')) for score in lines]
    user_mappings = []
    scores = []
    
    print(f"Reading - {test_file_path}")
    with open(test_file_path) as f:
        for l in f:
            user_mappings.append(int(l.split(':')[1]))
            scores.append(int(l[0]))
    dataframe_dict = {
        "user_mapping": user_mappings,
        "score": scores,
        "predicted_score": predicted_scores
    }
    df = pd.DataFrame(dataframe_dict)
    return df

    
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def calculate_user_level_metric_scores(true_vals, predicted_vals):
        user_auc_score = roc_auc_score(true_vals, predicted_vals)
        user_at_5_ndcg_score = ndcg_score(true_vals[np.newaxis, :], predicted_vals[np.newaxis, :],
                                         k = 5)
        user_at_10_ndcg_score = ndcg_score(true_vals[np.newaxis, :], predicted_vals[np.newaxis, :],
                                         k = 10)
        relevant_recommendations = np.where(true_vals == 1)[0]
        ordered_recommendations = np.argsort(-predicted_vals)
        user_mapk_at_5_score = apk(relevant_recommendations, ordered_recommendations, k=5)
        user_mapk_at_10_score = apk(relevant_recommendations, ordered_recommendations, k=10)
        return [user_auc_score, user_at_5_ndcg_score, user_at_10_ndcg_score,
                user_mapk_at_5_score, user_mapk_at_10_score]
    #     print("appended")

def get_auc_score(test_df):
    scores = test_df["score"].values
    predicted_scores = test_df["predicted_score"].values
    print(f"Calculating overall AUC scores")
    auc_overall = roc_auc_score(scores, predicted_scores)
    print(f"Overall AUC scores computed")
    return auc_overall

def filter_user_level_indices(df):
    print(f"Separating data into groups .....")
    agg_df = df[['user_mapping', 'score']].groupby(['user_mapping']).agg(['sum', 'count'])
    agg_df = agg_df[(agg_df["score"]["sum"] >= min_pos_labels) \
                     & (agg_df["score"]["count"] >= min_total_user_events) \
                   & (agg_df["score"]["sum"] != agg_df["score"]["count"])]
    selected_user_mappings = agg_df.sample(n = min(num_users_to_consider, agg_df.shape[0]), 
                                     replace=False, random_state=RANDOM_SEED).index.values
    user_level_inds = df.index[df.user_mapping.isin(selected_user_mappings)]
    print(f"No. of users in user level metrics computations - {len(selected_user_mappings)}")
    print("Filtered data")
    return user_level_inds
    
def get_user_level_results(df):
    df = df.sort_values("user_mapping")
    all_user_mappings = df.user_mapping.values
    scores = df.score.values
    predicted_scores = df.predicted_score.values
    ukeys, index = np.unique(all_user_mappings, True)
    user_level_true_vals = np.split(scores, index[1:])
    user_level_predicted_vals = np.split(predicted_scores, index[1:])
    input_vals = list(zip(user_level_true_vals, user_level_predicted_vals))
    
    print(f"Computing different scores ....")
    with Pool(processes = 48) as sub_pool:
        res = sub_pool.starmap(calculate_user_level_metric_scores, input_vals)       
    means = np.array(res).mean(axis = 0)
    
    results_dict = {
        "AUC score - User Level": means[0],
        "NDCG@5 Score - User Level": means[1],
        "NDCG@10 Score - User Level": means[2],
        "MAPK@5 score - User Level": means[3],
        "MAPK@10 score - User Level": means[4]
        }
    print("Completed computing results - {}".format(results_dict))
    return results_dict

# def get_model_scores()

In [5]:
results = []
for lang in LANGS:
    for rating_key, rating_def in rating_def_dict.items():
        test_file_path = f"./train_test_data_models/{USER_CONTEXT}/{DTYPE}/{lang}/{rating_def}/{TEST_DATA_FILE_NAME}.txt"
        trained_model_path = f"./train_test_data_models/{USER_CONTEXT}/{DTYPE}/{lang}/{rating_def}/out/model.out"
        predicted_output_folder_path = f"./train_test_data_models/{USER_CONTEXT}/{DTYPE}/{lang}/{rating_def}/predicted_results"
        user_buckets_path = f"./train_test_data_models/{USER_CONTEXT}/{DTYPE}/{lang}/{rating_def}/user_buckets.csv"
        mapping_file_path = f"./train_test_data_models/{USER_CONTEXT}/{DTYPE}/{lang}/{rating_def}/user_post_ffm_mapping.csv"
        predicted_output_file_name = "predicted_scores.txt"
        
        predict_scores(test_file_path, trained_model_path, predicted_output_folder_path,
                      lang, rating_def, predicted_output_file_name=predicted_output_file_name)
        predicted_output_file_path = os.path.join(predicted_output_folder_path, 
                                                 predicted_output_file_name)
        test_df = get_test_df(test_file_path, predicted_output_file_path)
        print("Getting and transforming user buckets ....")
        user_buckets = pd.read_csv(user_buckets_path,
                                delimiter="\t")
        user_buckets["userId"] = user_buckets.userId.map(lambda x: "2_user_"+str(x))
        mapping_df = pd.read_csv(mapping_file_path,
                        delimiter="\t", index_col="feature_name")
        user_buckets["user_mapping"] = mapping_df.loc[list(user_buckets.userId.values)].mapping.values

        print(f"Computing scores for each bucket - {USER_CONTEXT}/{DTYPE}/{lang}/{rating_def}")
        for i in range(10):
            res = {
                    "User Context": USER_CONTEXT,
                    "Content Type": DTYPE,
                    "Language": lang,
                    "Action Type": rating_key
                }
            res["Bucket"] = i+1
            user_bucket_mappings = user_buckets[user_buckets.user_bucket==i+1].user_mapping.values
            bucket_test_df = test_df[test_df.user_mapping.isin(user_bucket_mappings)]
            res["Num Bucket Samples"] = bucket_test_df.shape[0]
            res["AUC score"] = get_auc_score(bucket_test_df)
            user_level_indices = filter_user_level_indices(bucket_test_df)
            user_level_df = bucket_test_df.loc[user_level_indices]
            res.update(get_user_level_results(user_level_df))

            if(new_results_file):
                pd.DataFrame(columns = list(res.keys())).to_csv(results_path, index=False)
                new_results_file = False
            pd.DataFrame([res]).to_csv(results_path, index=False, header=False, mode='a')
            results.append(res)
print(results)

Reading - ./train_test_data_models/location/video/Hindi/is_like/predicted_results/predicted_scores.txt
Reading - ./train_test_data_models/location/video/Hindi/is_like/test.txt
Getting and transforming user buckets ....
Computing scores for each bucket - location/video/Hindi/is_like
Calculating overall AUC scores
Overall AUC scores computed
Separating data into groups .....
No. of users in user level metrics computations - 20000
Filtered data
Computing different scores ....
Completed computing results - {'AUC score - User Level': 0.6473710959751081, 'NDCG@5 Score - User Level': 0.4812755679184771, 'NDCG@10 Score - User Level': 0.5353513155103987, 'MAPK@5 score - User Level': 0.3942697499999991, 'MAPK@10 score - User Level': 0.4191203939200643}
Calculating overall AUC scores
Overall AUC scores computed
Separating data into groups .....
No. of users in user level metrics computations - 20000
Filtered data
Computing different scores ....
Completed computing results - {'AUC score - User Lev

### Scrap code

In [5]:
bucket_test_df.tail(5)

NameError: name 'bucket_test_df' is not defined

In [28]:
len(user_level_indices), bucket_test_df.shape, test_df.shape

(607732, (6841216, 3), (769910170, 3))

In [45]:
user_level_df = bucket_test_df.loc[user_level_indices]

In [47]:
res.update(get_user_level_results(user_level_df))
print(res)

Computing different scores ....
Completed computing results - {'AUC score - User Level': 0.7329352687842821, 'NDCG@5 Score - User Level': 0.6356111220019085, 'NDCG@10 Score - User Level': 0.6766040128600278, 'MAPK@5 score - User Level': 0.5406051805555684, 'MAPK@10 score - User Level': 0.5571550874275671}
{'User Context': 'price', 'Content Type': 'video', 'Language': 'Hindi', 'Action Type': 'vplay2', 'Bucket': 1, 'AUC score': 0.7717333185050684, 'AUC score - User Level': 0.7329352687842821, 'NDCG@5 Score - User Level': 0.6356111220019085, 'NDCG@10 Score - User Level': 0.6766040128600278, 'MAPK@5 score - User Level': 0.5406051805555684, 'MAPK@10 score - User Level': 0.5571550874275671}


In [40]:
bucket_test_df.tail(5)

Unnamed: 0,user_mapping,score,predicted_score
769909377,19872422,1,0.380433
769909796,9833250,0,0.330722
769909852,12746792,0,0.030882
769909980,15639665,0,0.254021
769910149,8396586,0,0.126932


In [46]:
user_level_df.head(5)

Unnamed: 0,user_mapping,score,predicted_score
2067,14689468,0,0.298931
2402,10526039,0,0.064852
2983,1465576,0,0.078399
4597,11590350,1,0.101524
4745,13141315,0,0.695729


In [39]:
user_level_indices[-1]

769908636

In [36]:
user_level_df

Unnamed: 0,user_mapping,score,predicted_score
239827,16662258,0,0.086504
281687,12484068,0,0.22902
351794,11990025,0,0.177217


In [6]:
for lang in LANGS:
    for rating_key, rating_def in rating_def_dict.items():
        print("Getting results for {}-{}".format(lang, rating_def))
        predicted_results_path = f"./train_test_data_models/{lang}/{rating_def}/predicted_results/predicted_scores.txt"
        test_file_path = f"./train_test_data_models/{lang}/{rating_def}/test.txt"
        print(f"Opening {predicted_results_path} and {test_file_path} - reading input ............")
        with open(predicted_results_path) as f:
            lines = f.readlines()
            predicted_scores = [float(score.replace('\n','')) for score in lines]
        user_mappings = []
        scores = []
        with open(test_file_path) as f:
            for l in f:
                user_mappings.append(int(l.split(':')[1]))
                scores.append(int(l[0]))
        dataframe_dict = {
            "user_mapping": user_mappings,
            "score": scores,
            "predicted_score": predicted_scores
        }
        df = pd.DataFrame(dataframe_dict)
        print(f"Created test dataframe for {lang}-{rating_def}")

        print(f"Separating data into groups {lang}-{rating_def} .....")
        agg_df = df[['user_mapping', 'score']].groupby(['user_mapping']).agg(['sum', 'count'])
        agg_df = agg_df[(agg_df["score"]["sum"] >= min_pos_labels) \
                         & (agg_df["score"]["count"] >= min_total_user_events) \
                       & (agg_df["score"]["sum"] != agg_df["score"]["count"])]
        selected_user_mappings = agg_df.sample(n = num_users_to_consider, replace=False).index.values

        df = df[df.user_mapping.isin(selected_user_mappings)]
        df = df.sort_values("user_mapping")
        all_user_mapping = df.user_mapping.values
        scores = df.score.values
        predicted_scores = df.predicted_score.values
        ukeys, index = np.unique(all_user_mapping, True)
        user_level_true_vals = np.split(scores, index[1:])
        user_level_predicted_vals = np.split(predicted_scores, index[1:])
        input_vals = list(zip(user_level_true_vals, user_level_predicted_vals))
        print(f"Input separated into groups for {lang}-{rating_def}")
        print(f"Computing scores for {lang}-{rating_def}")
    #     with Pool(processes = 24) as sub_pool:
    #         res = sub_pool.starmap(calculate_metric_scores, input_vals)
        res = []
        for true_vals, predicted_vals in tqdm(zip(user_level_true_vals, user_level_predicted_vals)):
#             print(true_vals, predicted_vals)
            temp_res = calculate_user_level_metric_scores(true_vals, predicted_vals)
            res.append(temp_res)

        means = np.array(res).mean(axis = 0)

        results_dict = {
            "Rating Definition": rating_key,
            "Language": lang,
            "NDCG Score - User Level": means[0],
            "AUC score - User Level": means[1],
            "MAPK score - User Level": means[2]
            }
        print(results_dict)

Getting results for Malayalam-is_vp_skip
Opening ./train_test_data_models/Malayalam/is_vp_skip/predicted_results/predicted_scores.txt and ./train_test_data_models/Malayalam/is_vp_skip/test.txt - reading input ............


KeyboardInterrupt: 

In [6]:
for lang in LANGS:
    for rating_key, rating_def in rating_def_dict.items():
        print("Getting results for {}-{}".format(lang, rating_def))
        predicted_results_path = f"./train_test_data_models/{lang}/{rating_def}/predicted_results/predicted_scores.txt"
#         trained_model_path = f"./train_test_data_models/{lang}/{rating_def}/out/model.out"
#         predicted_output_folder_path = f"./train_test_data_models/{lang}/{rating_def}/predicted_results"
#         predicted_output_file_name = "predicted_scores.txt"
        test_file_path = f"./train_test_data_models/{lang}/{rating_def}/test.txt"
#         predicted_results_path = f"./train_test_data_models/{lang}/{rating_def}/predicted_results/predicted_scores.txt"
#         predict_scores(test_file_path, trained_model_path, predicted_output_folder_path,
#                       lang, rating_def, predicted_output_file_name=predicted_output_file_name)
#         predicted_results_path = os.path.join(predicted_output_folder_path, predicted_output_file_name)
        
        test_file_path = f"./train_test_data_models/{lang}/{rating_def}/test.txt"
        print("Getting results for {}-{}".format(lang, rating_def))
        print(f"Opening {predicted_results_path} and {test_file_path} - reading input ............")
        with open(predicted_results_path) as f:
            lines = f.readlines()
            predicted_scores = [float(score.replace('\n','')) for score in lines]
        user_mappings = []
        scores = []
        with open(test_file_path) as f:
            for l in f:
                user_mappings.append(int(l.split(':')[1]))
                scores.append(int(l[0]))
        dataframe_dict = {
            "user_mapping": user_mappings,
            "score": scores,
            "predicted_score": predicted_scores
        }
        print(f"Calculating overall AUC scores - {lang}-{rating_def}")
        auc_overall = roc_auc_score(scores, predicted_scores)
        print(f"Overall AUC scores computed - {lang}-{rating_def}")

        df = pd.DataFrame(dataframe_dict)
        print(f"Created test dataframe for {lang}-{rating_def}")

        print(f"Separating data into groups {lang}-{rating_def} .....")
        agg_df = df[['user_mapping', 'score']].groupby(['user_mapping']).agg(['sum', 'count'])
        agg_df = agg_df[(agg_df["score"]["sum"] >= min_pos_labels) \
                         & (agg_df["score"]["count"] >= min_total_user_events) \
                       & (agg_df["score"]["sum"] != agg_df["score"]["count"])]
        selected_user_mappings = agg_df.sample(n = num_users_to_consider, replace=False).index.values

        df = df[df.user_mapping.isin(selected_user_mappings)]
        df = df.sort_values("user_mapping")
        all_user_mapping = df.user_mapping.values
        scores = df.score.values
        predicted_scores = df.predicted_score.values
        ukeys, index = np.unique(all_user_mapping, True)
        user_level_true_vals = np.split(scores, index[1:])
        user_level_predicted_vals = np.split(predicted_scores, index[1:])
        input_vals = list(zip(user_level_true_vals, user_level_predicted_vals))
        print(f"Input separated into groups for {lang}-{rating_def}")
        print(f"Computing scores for {lang}-{rating_def}")
        with Pool(processes = 48) as sub_pool:
            res = sub_pool.starmap(calculate_user_level_metric_scores, input_vals)
#             res = []
#             for true_vals, predicted_vals in tqdm(zip(user_level_true_vals, user_level_predicted_vals)):
#         #             print(true_vals, predicted_vals)
#                 temp_res = calculate_user_level_metric_scores(true_vals, predicted_vals)
#                 res.append(temp_res)        
        means = np.array(res).mean(axis = 0)

        results_dict = {
            "Rating Definition": rating_def,
            "Language": lang,
            "AUC score - Overall": auc_overall,
            "NDCG@5 Score - User Level": means[0],
            "NDCG@10 Score - User Level": means[1],
            "AUC score - User Level": means[2],
            "MAPK@5 score - User Level": means[3],
            "MAPK@10 score - User Level": means[4]
            }
        print("Completed computing results for {} {}".format(lang, rating_def))
        print(results_dict)
        df = pd.DataFrame([results_dict])
        df.to_csv("./results/Hindi_rest_res_1.csv", mode='a', 
                  index=False, header=False)   

Getting results for Hindi-is_vp_succ2
Getting results for Hindi-is_vp_succ2
Opening ./train_test_data_models/Hindi/is_vp_succ2/predicted_results/predicted_scores.txt and ./train_test_data_models/Hindi/is_vp_succ2/test.txt - reading input ............
Calculating overall AUC scores - Hindi-is_vp_succ2
Overall AUC scores computed - Hindi-is_vp_succ2
Created test dataframe for Hindi-is_vp_succ2
Separating data into groups Hindi-is_vp_succ2 .....
Input separated into groups for Hindi-is_vp_succ2
Computing scores for Hindi-is_vp_succ2
Completed computing results for Hindi is_vp_succ2
{'Rating Definition': 'is_vp_succ2', 'Language': 'Hindi', 'AUC score - Overall': 0.8094117656071651, 'NDCG@5 Score - User Level': 0.6335483608648548, 'NDCG@10 Score - User Level': 0.5937726673437211, 'AUC score - User Level': 0.7559612853287495, 'MAPK@5 score - User Level': 0.5233338333333528, 'MAPK@10 score - User Level': 0.4437434563492078}
Getting results for Hindi-is_vp_skip
Getting results for Hindi-is_vp_

In [10]:
results_dict

{'Rating Definition': 'is_vp_skip',
 'Language': 'Malayalam',
 'AUC score - Overall': 0.7403779265112256,
 'NDCG@5 Score - User Level': 0.7511589330745353,
 'NDCG@10 Score - User Level': 0.714626302675468,
 'AUC score - User Level': 0.6773018945198868,
 'MAPK@5 score - User Level': 0.6617723333333213,
 'MAPK@10 score - User Level': 0.5869704742063461}

In [None]:
df = pd.DataFrame(results)
df.to_csv("./aggregated_results/other_metrics_1.csv", mode='a', 
          index=False, header=False)

In [7]:
len(predicted_scores), len(scores)

(495644647, 769884558)

In [174]:
%%time
user_mapping_df = df[df.user_mapping == user_mapping]

CPU times: user 47.1 ms, sys: 19.9 ms, total: 67 ms
Wall time: 65.6 ms


In [47]:
len(df.predicted_score.values)

2447441

In [183]:
temp_df = df[df.user_mapping.isin(selected_user_mappings)]

In [132]:
temp_df = df[df.userId.isin(selected_userIds)]

In [49]:
df = df.sort_values("user_mapping")
all_user_mapping = df.user_mapping.values
scores = df.score.values
precicted_scores = df.predicted_score.values
len(all_user_mapping), len(scores), len(predicted_scores)

(2447441, 2447441, 62711999)

In [50]:
df.head(5)

Unnamed: 0,user_mapping,score,predicted_score
7782684,68939,0,0.788298
7032797,68939,0,0.687772
7017182,68939,1,0.500768
12573382,68939,0,0.31304
53359825,68939,0,0.577004


In [23]:
df.groupby(['postId']).sum().sort_values(by=['score'], ascending=False).head(5)

Unnamed: 0_level_0,userId,tagId,score,predicted_score
postId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9857235382,119103269175624,199314537408,77330,25836.966834
3365384382,116716063066117,195051641932,72874,25429.166583
9329535382,106995824611773,178190688536,70690,23113.308994
9062058282,98212164199390,163874211312,68897,21243.150008
1372802382,90735147065879,152006896006,68039,19701.435099


In [27]:
agg_df = df[['userId', 'score']].groupby(['userId']).agg(['sum', 'count'])

In [134]:
df.shape

(55914986, 5)

In [31]:
agg_df.columns

MultiIndex([('score',   'sum'),
            ('score', 'count')],
           )

In [32]:
agg_df.index

Int64Index([     53871,      55059,      55062,      55564,      56773,
                 57106,      57358,      57549,      58644,      58785,
            ...
            2777448214, 2777458053, 2777461328, 2777461595, 2777473212,
            2777478170, 2777479686, 2777479756, 2777481430, 2777483046],
           dtype='int64', name='userId', length=1125986)

In [35]:
agg_df[('score', 'sum')].values

array([18,  3, 22, ...,  1,  3,  1])

In [44]:
temp_df = agg_df[(agg_df["score"]["sum"] >= min_pos_labels) & (agg_df["score"]["count"] >= min_total_user_events)]

In [49]:
temp_df["userId"]

KeyError: 'userId'

In [51]:
t = agg_df.sample(n = 20000, replace=False).head(5)

In [54]:
t.index.values

array([ 176951320, 2618125270, 2580975920,  323434670,  352372614])

In [55]:
df.head(5)

Unnamed: 0,userId,postId,tagId,score,predicted_score
0,1114791014,7031720382,102871,0,0.031885
1,453980094,5351672082,830883,0,0.023767
2,2738524875,5325077382,3464496,0,0.144696
3,53693143,9836841382,1381366,1,0.0449
4,618968903,5837607382,4474183,0,0.034953


In [57]:
df[df.userId.isin(t.index.values)]

Unnamed: 0,userId,postId,tagId,score,predicted_score
46573,2618125270,7919181382,1381366,1,0.278692
86354,2618125270,7967645182,74,0,0.128524
88520,352372614,7212202382,1272240,0,0.023171
126024,2618125270,7981172182,9777582,0,0.143801
207918,352372614,1458832382,1381366,0,0.022506
...,...,...,...,...,...
55210329,352372614,1491271182,8291806,0,0.080947
55339373,176951320,7423722282,711045,0,0.826163
55607095,352372614,7217636382,1272240,1,0.078268
55723968,2618125270,3287529082,102871,0,0.182957


In [65]:
temp = df[df.userId == 176951320].sort_values(by = ['predicted_score'])

In [67]:
ndcg_score(temp.score.values, temp.predicted_score.values)

ValueError: Only ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') formats are supported. Got binary instead

In [68]:
tr_vals = np.asarray([temp.score.values])
pr_vals = np.asarray([temp.predicted_score.values])

In [69]:
ndcg_score(tr_vals,pr_vals)

0.7520911855422138

In [77]:
ndcg_score(np.array([[3,1,1,0,0]]), np.array([[2,1,0.2,0,0]]))

1.0

In [111]:
roc_auc_score(tr_vals[0], pr_vals[0])

0.375

In [112]:
tr_vals

array([[1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1]])

In [127]:
relevant_recommendations = np.where(tr_vals[0] == 1)[0]
ordered_recommendations = np.argsort(-pr_vals[0])
apk(relevant_recommendations, ordered_recommendations)

0.33999999999999997

array([ 0,  2,  3,  8, 12])

In [123]:
ordered_recommendations

array([12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0])

In [125]:
tr_vals[0]

1

In [149]:
path = "./train_test_data_models/Malayalam/is_vp_succ2/test_later.txt"

In [150]:
with open(path) as f:
    lines = f.readlines()
    labels = [int(l.replace('\n', '')[0]) for l in lines]

In [151]:
labels[:5]

[0, 0, 0, 0, 0]

In [152]:
predicted_scores[:5]

[0.0318848, 0.0237671, 0.144696, 0.0449003, 0.0349526]

In [153]:
roc_auc_score(labels, predicted_scores)

0.8400228558390365

In [154]:
lines[0]

'0 0:347890:1 1:1475:1 \n'

In [155]:
lines[0].split(':')

['0 0', '347890', '1 1', '1475', '1 \n']

In [186]:
a = {
    "c1": [1,2,2,3,2,2,4,5,5,9],
    "c2": [1,0,0,0,1,1,1,0,1,0]
}

In [187]:
t_df = pd.DataFrame(a)

In [194]:
t_df.groupby("c1").filter(lambda x: x.c1.max() < 5)

Unnamed: 0,c1,c2
0,1,1
1,2,0
2,2,0
3,3,0
4,2,1
5,2,1
6,4,1


In [202]:
def apply_to_group(x, selected_user_mappings):
    if x.user_mapping in selected_user_mappings:
        true_vals = x.score.values
        predicted_vals = x.predicted_score.values
        print(true_vals, predicted_vals)
        user_ndcg_score = ndcg_score(true_vals[np.newaxis, :], predicted_vals[np.newaxis, :])
        relevant_recommendations = np.where(true_vals == 1)[0]
        ordered_recommendations = np.argsort(-predicted_vals)
        user_apk_score = apk(relevant_recommendations, ordered_recommendations)
        return (user_ndcg_score, user_auc_score, user_apk_score)
    else:
        return None, None, None

In [5]:
def mp_1(p1, p2):
    p1 = p1[0]
    p2 = p2[0]
    return [p1+p1*p2, p1-p1*p2]
params = [([1],[2]),([2],[3]),[[3],[4]],[[4],[5]]]
with Pool(processes=10) as pool:
    res = pool.starmap(mp_1, params)

In [4]:
res

[[3, -1], [8, -4], [15, -9], [24, -16]]

In [6]:
res

[[3, -1], [8, -4], [15, -9], [24, -16]]

In [9]:
l = [np.arange(3), np.arange(3)+2]

In [10]:
l

[array([0, 1, 2]), array([2, 3, 4])]

In [11]:
np.concatenate(l)

array([0, 1, 2, 2, 3, 4])

In [12]:
np.array(l).mean(axis = 0)

array([1., 2., 3.])

In [7]:
a = {
    "c1": [1,2,2,3,2,2,4,5,5,9],
    "c2": [1,0,0,0,1,1,1,0,1,0]
}

In [9]:
df = pd.DataFrame(a)

In [21]:
temp_1 = df.sort_values('c1').c2.values

In [12]:
temp = df.sort_values('c1').c1.values

In [16]:
_, index = np.unique(temp, True)

In [23]:
np.split(temp_1, index[1:])

[array([1]),
 array([0, 0, 1, 1]),
 array([0]),
 array([1]),
 array([0, 1]),
 array([0])]

In [24]:
df.sort_values('c1')

Unnamed: 0,c1,c2
0,1,1
1,2,0
2,2,0
4,2,1
5,2,1
3,3,0
6,4,1
7,5,0
8,5,1
9,9,0


In [39]:
from sklearn.metrics import ndcg_score
from math import log
# we have groud-truth relevance of some answers to a query:
true_relevance = np.asarray([[1, 1, 0, 0, 1, 0]])
# we predict some scores (relevance) for the answers
scores = np.asarray([[.9, .85, .8, .7, .65, 0.15]])
ndcg_score(true_relevance, scores)

# scores = np.asarray([[.05, 1.1, 1., .5, .0]])
# ndcg_score(true_relevance, scores)

# we can set k to truncate the sum; only top k answers contribute.


# the normalization takes k into account so a perfect answer
# would still get 1.

0.9469024295259745

In [35]:
ndcg_score(true_relevance, scores, k=3)

0.7653606369886218

In [32]:
true_relevance = np.asarray([[1, 1, 0, 0]])
# we predict some scores (relevance) for the answers
scores = np.asarray([[.9, .85, .8, .7]])
ndcg_score(true_relevance, scores)

1.0

In [40]:
1/log(2,2)+1/log(3,2)+1/log(4,2)

2.1309297535714573

In [41]:
1/log(2,2)+1/log(3,2)

1.6309297535714573

In [42]:
1.63/2.13

0.7652582159624413

In [1]:
temp_df = pd.read_csv("./train_test_data_models/Kannada/is_vp_succ2/user_post_ffm_mapping.csv")

NameError: name 'pd' is not defined

In [1]:
import pandas as pd

In [21]:
df = pd.DataFrame(columns=["col1","col2"])

In [22]:
df.to_csv("./results/temp_res.csv", index=False)

In [23]:
pd.DataFrame([{"col1": 1, "col2":2}]).to_csv("./results/temp_res.csv", index=False, header=False, mode='a')

In [17]:
df

Unnamed: 0,col1,col2


In [24]:
temp = {
    "col1": 3
}

In [26]:
list(temp.keys())

['col1']

In [6]:
mapping_df = pd.read_csv("./train_test_data_models/price/video/Hindi/is_vp_succ2/user_post_ffm_mapping.csv",
                        delimiter="\t", index_col="feature_name")

In [7]:
user_buckets = pd.read_csv("./train_test_data_models/price/video/Hindi/is_vp_succ2/user_buckets.csv",
                        delimiter="\t")

In [18]:
mapping_df.head(5)

Unnamed: 0_level_0,mapping
feature_name,Unnamed: 1_level_1
1_post_1000001092,1
1_post_1000017582,2
1_post_1000035292,3
1_post_1000063692,4
1_post_1000143292,5


In [7]:
user_buckets.head(5)

Unnamed: 0,userId,user_bucket
0,617518841,1
1,1239023680,1
2,335851653,1
3,758602948,1
4,466911063,1


In [9]:
user_buckets["userId"] = user_buckets.userId.apply(lambda x: "2_user_"+str(x))

In [10]:
user_buckets.head(5)

Unnamed: 0,userId,user_bucket
0,2_user_617518841,1
1,2_user_1239023680,1
2,2_user_335851653,1
3,2_user_758602948,1
4,2_user_466911063,1


In [31]:
mapping_df.loc["2_user_617518841"].mapping

20848930

In [8]:
user_buckets["user_mapping"] = mapping_df.loc[list(user_buckets.userId.values)].mapping.values

KeyError: "None of [Int64Index([ 617518841, 1239023680,  335851653,  758602948,  466911063,\n             476648811, 1243857180, 2191549964, 1220246782, 1783813496,\n            ...\n              79850567,  436423780, 1121830546, 1606275329, 2003154809,\n            1467283180,    8472216, 2249650404, 1796182349, 1137592414],\n           dtype='int64', name='feature_name', length=23816606)] are in the [index]"

In [None]:
user_buckets.head(5)

In [38]:
mapping_df.loc[['2_user_617518841',
 '2_user_1239023680',
 '2_user_335851653',
 '2_user_758602948']].mapping.values

array([20848930,  2146462, 15123015, 22143603])