In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from collections import Counter
import networkx as nx 

In [2]:
TRAIN_3D_PATH = '/home/core/shahjaidev/my_data/3d_train_csv'
TEST_3d_PATH = '/home/core/shahjaidev/my_data/3d_test'

UID_3D_PATH = '/home/core/shahjaidev/my_data/3d_uid.tsv'

FBV_TSV_PATH= '/home/core/shahjaidev/my_data/FBV/3d_FBV_data.tsv'

URL_TO_LANGUAGE_TSV_PATH= '/home/core/shahjaidev/my_data/retroindex/3d_train_url_to_language_country_mapping.tsv'

In [3]:
SWING_SCORES_0_to_2M_PICKLE_PATH = '/home/core/shahjaidev/SWING/3d_swing_latest/3d_swing_0_to_2M_i2i_dict_pickle'
SWING_SCORES_2M_to_4M_PICKLE_PATH = '/home/core/shahjaidev/SWING/3d_swing_latest/3d_swing_2M_to_4M_i2i_dict_pickle'
SWING_SCORES_4M_to_6M_PICKLE_PATH = '/home/core/shahjaidev/SWING/3d_swing_latest/3d_swing_4M_to_6M_i2i_dict_pickle'
SWING_SCORES_6M_to_8M_PICKLE_PATH = '/home/core/shahjaidev/SWING/3d_swing_latest/3d_swing_6M_to_end_i2i_dict_pickle'



In [5]:
def read_test_set(PATH):
    df = pd.read_csv(PATH, sep=',', header=None)
    df.columns = ['uid', 'tid']
    return df

def read_train_set(PATH):
    df = pd.read_csv(PATH, sep=',', header=None)
    df.columns = ['uid', 'tid', 'weight']
    return df

def read_uid_set(PATH):
    df = pd.read_csv(PATH, sep='\t', header=None)
    df.columns = ['uid', 'url']
    return df

def read_url_to_language_df(PATH):
    df = pd.read_csv(PATH, sep='\t', header=None)
    df.columns = ['url', 'language', 'country']
    return df

train_set_df= read_train_set(TRAIN_3D_PATH)
test_set_df= read_test_set(TEST_3d_PATH)
uid_df= read_uid_set(UID_3D_PATH)
url_to_language_df= read_url_to_language_df(URL_TO_LANGUAGE_TSV_PATH)




### Create NetworkX Graph, obtain uid to degree mapping

In [14]:
print("Creating NetworkX Graph")
G = nx.from_pandas_edgelist(train_set_df, 'uid', 'tid', edge_attr='weight')
print("NetworkX Graph created successfully")

Creating NetworkX Graph
NetworkX Graph created successfully


In [15]:
uid_to_degree_dict = dict(G.degree())

In [16]:
uids, degrees = zip(*uid_to_degree_dict.items())

In [18]:
uid_degree_df = pd.DataFrame({'uid': uids, 'degree': degrees})

In [10]:
##Merge test_set_df and uid_df
test_set_df_uid_merged = pd.merge(test_set_df, uid_df, on='uid')
test_set_df_merged = pd.merge(test_set_df_uid_merged, uid_degree_df, left_on='tid', right_on='uid')

### Read all swing dict pickles, convert to Counter than add

In [6]:
PATHS = [SWING_SCORES_0_to_2M_PICKLE_PATH] # SWING_SCORES_2M_to_5M_PICKLE_PATH, SWING_SCORES_5M_to_end_PICKLE_PATH]

In [7]:
def read_swing_dict_pickles_and_add(PATHS):
    final_counter_dict = Counter()
    for path in PATHS:
        #Read swing scores dict from pickle files
        with open(path, 'rb') as f:
            swing_dict = pick,le.load(f)
        final_counter_dict = final_counter_dict + Counter(swing_dict)
    return final_counter_dict

In [8]:
swing_scores_0_to_2M_dict = read_swing_dict_pickles_and_add(PATHS)

In [9]:
swing_scores_2M_to_4M_dict = read_swing_dict_pickles_and_add([SWING_SCORES_2M_to_4M_PICKLE_PATH])
swing_scores_4M_to_6M_dict = read_swing_dict_pickles_and_add([SWING_SCORES_4M_to_6M_PICKLE_PATH])
swing_scores_6M_to_8M_dict = read_swing_dict_pickles_and_add([SWING_SCORES_6M_to_8M_PICKLE_PATH])

In [10]:
def get_swing_df(swing_dict):
    uids, tids = zip(*swing_dict.keys())
    scores = list(swing_dict.values())
    swing_df = pd.DataFrame({'uid': uids, 'tid': tids, 'score': scores})
    swing_df_filtered = swing_df[swing_df.uid != swing_df.tid]
    return swing_df_filtered


In [11]:
swing_scores_df_0_to_2M = get_swing_df(swing_scores_0_to_2M_dict)
swing_scores_df_2M_to_4M = get_swing_df(swing_scores_2M_to_4M_dict)
swing_scores_df_4M_to_6M = get_swing_df(swing_scores_4M_to_6M_dict)
swing_scores_df_6M_to_8M = get_swing_df(swing_scores_6M_to_8M_dict)


In [12]:
def apply_filters_swing_df(swing_df, uid_degree_df, SWING_SCORE_THRESHOLD, UID_DEGREE_THRESHOLD):
        swing_df_score_filtered = swing_df[swing_df.score > SWING_SCORE_THRESHOLD]
        uid_degree_df.columns= ['uid', 'uid_degree']
        swing_df_score_filtered_merged_degree = pd.merge(swing_df_score_filtered, uid_degree_df, on='uid')

        swing_df_score_filtered_degree_filtered = swing_df_score_filtered_merged_degree[swing_df_score_filtered_merged_degree.uid_degree < UID_DEGREE_THRESHOLD]

        ##Merge with uid to url mapping
        #Merge swing_0_to_2M_df_filter_self_connections with uid_df
        swing_df_score_filtered_degree_filtered_uid_merged = pd.merge(swing_df_score_filtered_degree_filtered, uid_df, left_on='uid', right_on='uid')
        swing_df_score_filtered_degree_final = pd.merge(swing_df_score_filtered_degree_filtered_uid_merged, uid_df, left_on='tid', right_on='uid')
        del swing_df_score_filtered_degree_final['uid_y']
        return swing_df_score_filtered_degree_final



In [19]:
SWING_SCORE_THRESHOLD = 0.01
UID_DEGREE_THRESHOLD = 50

swing_scores_df_0_to_2M_filtered = apply_filters_swing_df(swing_scores_df_0_to_2M, uid_degree_df, SWING_SCORE_THRESHOLD, UID_DEGREE_THRESHOLD)
swing_scores_df_2M_to_4M_filtered = apply_filters_swing_df(swing_scores_df_2M_to_4M, uid_degree_df, SWING_SCORE_THRESHOLD, UID_DEGREE_THRESHOLD)
swing_scores_df_4M_to_6M_filtered = apply_filters_swing_df(swing_scores_df_4M_to_6M, uid_degree_df, SWING_SCORE_THRESHOLD, UID_DEGREE_THRESHOLD)
swing_scores_df_6M_to_8M_filtered = apply_filters_swing_df(swing_scores_df_6M_to_8M, uid_degree_df, SWING_SCORE_THRESHOLD, UID_DEGREE_THRESHOLD)

In [20]:
swing_scores_df_all = pd.concat([swing_scores_df_0_to_2M_filtered, swing_scores_df_2M_to_4M_filtered, swing_scores_df_4M_to_6M_filtered, swing_scores_df_6M_to_8M_filtered])

In [21]:
len(swing_scores_df_all)

22742316

In [22]:
#Drop duplicates of (uid, tid)
swing_scores_df_all_deduped = swing_scores_df_all.drop_duplicates(subset=['uid_x', 'tid'], keep='first')

In [24]:
len(swing_scores_df_all_deduped)

22742276

In [25]:
swing_scores_df_all_train = swing_scores_df_all[['uid_x', 'tid', 'score']]
swing_scores_df_all_train.columns  = ['uid', 'tid', 'weight']
swing_scores_df_all_train['weight']= 1.33

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [31]:
##Intersection of swing_scores_df_all_train and train_set_df

intersection_of_train_set_df_and_swing_scores_df_all_train = pd.merge(train_set_df, swing_scores_df_all_train, on= ('uid', 'tid'))

In [32]:
len(intersection_of_train_set_df_and_swing_scores_df_all_train)

11409643

In [26]:
len(swing_scores_df_all_train)

22742316

In [27]:
#concat with train_set_df
swing_df_train_set_df_merged_with_duplicates = pd.concat([train_set_df, swing_scores_df_all_train] )

#Drop duplicates based on uid, tid keep = 'first'
swing_df_train_set_df_merged = swing_df_train_set_df_merged_with_duplicates.drop_duplicates(subset=['uid', 'tid'], keep = 'first')

In [28]:
swing_df_train_set_df_merged.uid = swing_df_train_set_df_merged.uid.astype(int)
swing_df_train_set_df_merged.tid = swing_df_train_set_df_merged.tid.astype(int)
swing_df_train_set_df_merged.weight = swing_df_train_set_df_merged.weight.astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [30]:
len(swing_df_train_set_df_merged)

48721372

In [29]:
swing_df_train_set_df_merged.tid.nunique()

7384054

In [53]:
max(swing_df_train_set_df_merged.tid)

7384053

In [59]:
#Create edges from every uid to itself (Cross Edges)
all_uids = sorted(swing_df_train_set_df_merged.uid.unique())
cross_edges = list(zip(all_uids, all_uids))


In [63]:
cross_edges_df = pd.DataFrame(cross_edges, columns=['uid', 'tid'])

cross_edges_df.weight = 1.0 #Set weight to 1.0 for cross edges

In [65]:
##Concat swing_df_train_set_df_merged and cross_edges_df
swing_df_train_set_df_merged_with_cross_edges = pd.concat([swing_df_train_set_df_merged, cross_edges_df])

In [66]:
swing_df_train_set_df_merged_with_cross_edges

Unnamed: 0,uid,tid,weight
0,1,3433646,1.09861
1,100,1953971,1.09861
2,100,1957602,1.09861
3,100,2422703,1.09861
4,100,2999049,1.09861
...,...,...,...
7384049,7384049,7384049,
7384050,7384050,7384050,
7384051,7384051,7384051,
7384052,7384052,7384052,


In [67]:
swing_df_train_set_df_merged.to_csv("with_cross_edges_deduped_swing_df_train_set_df_merged_swing_threshold_001_uid_degree_less_than_50.csv", index=False, header=False)

In [64]:
swing_df_train_set_df_merged

Unnamed: 0,uid,tid,weight
0,1,3433646,1.09861
1,100,1953971,1.09861
2,100,1957602,1.09861
3,100,2422703,1.09861
4,100,2999049,1.09861
...,...,...,...
4163259,7384046,2939456,1.33000
4163262,7384050,2174252,1.33000
4163264,7384050,3804551,1.33000
4163273,7384053,3773208,1.33000


### Plot Distribution of Swing Scores

In [75]:
swing_score_distribution = swing_scores_df_all.score

print("Swing score distribution")
print("Min of swing score distribution: ", swing_score_distribution.min())
print("Max of swing score distribution: ", swing_score_distribution.max())
print("Mean of swing score distribution: ", swing_score_distribution.mean())


print("25th percentile of swing score distribution: ", swing_score_distribution.quantile(0.25))
print("Median of swing score distribution: ", swing_score_distribution.median())
print("75th percentile of swing score distribution: ", swing_score_distribution.quantile(0.75))
print("90th percentile of swing score distribution: ", swing_score_distribution.quantile(0.90))
print("95th percentile of swing score distribution: ", np.percentile(swing_score_distribution, 95))


Swing score distribution
Min of swing score distribution:  0.010000000000000002
Max of swing score distribution:  3013.315397216897
Mean of swing score distribution:  0.33621678826181917
25th percentile of swing score distribution:  0.019179867456702807
Median of swing score distribution:  0.044031992826661766
75th percentile of swing score distribution:  0.14472304166718053
90th percentile of swing score distribution:  0.4336556411488997
95th percentile of swing score distribution:  0.8342276850789766


In [80]:
##pd string no display width
pd.set_option('display.max_colwidth', -1)

  


In [82]:
swing_scores_df_all.sample(30)

Unnamed: 0,uid_x,tid,score,uid_degree,url_x,url_y
2522905,4066717,2867432,0.033074,793,https://desk.3gbizhi.com/,https://www.bilibili.com/index.php
3047771,5140406,2046696,0.027948,460,https://www.njlottery.com/,https://bible.usccb.org/daily-bible-reading
2828147,3987637,5563446,0.194526,75,https://bme.buaa.edu.cn/teacherInfo.aspx?catID=7&subcatID=35&curID=9922,http://ia.cas.cn/sourcedb_ia_cas/cn/iaexpert/200908/t20090804_2310535.html
3850087,4042608,2876506,0.028463,809,https://connect.facebook.com/,https://www.bilibili.com/video/BV1Jy4y1W7SN/
3691214,4785999,5174334,0.066804,51,https://www.cnblogs.com/Lints/p/11506902.html,https://www.php.cn/mysql-tutorials-442866.html
2738310,4943801,2853988,0.131349,687,https://www.google.co.za/,https://www.bandlab.com/
2742836,3307781,5641221,0.010223,885,https://www.onlinedown.net/soft/3610.htm,http://www.uqidong.com/uefi.html
4216772,6953492,5108579,0.048113,274,https://www.mlb.com/dodgers,https://www.mlb.com/dodgers/ballpark/information/guide
5479298,4820449,6933134,0.188509,146,https://www.curseforge.com/minecraft/modpacks,https://www.mcmod.cn/modpack/186.html
1577926,5465893,7372106,0.230621,891,https://zhidao.baidu.com/question/1957721455356528740.html,https://zhuanlan.zhihu.com/p/481745690


In [77]:
##Swing degree threshold 50, score threshold 0.01
"""
SAVE_FILTERED_SWING_0_to_2M_PATH = '/home/core/shahjaidev/SWING/3d_0_to_2M_Swing_Threshold_001_Degree_Threshold_50.csv'
swing_0_to_2M_df_train_merged_final.to_csv(SAVE_FILTERED_SWING_0_to_2M_PATH, index=False)"""