# part 4 - reprepering the data for better results

In [69]:
import numpy as np
import pandas as pd

data= pd.read_parquet('data_for_starship_2022.snappy.parquet', engine='pyarrow')

In [7]:
##Entropy
def entropy(Y):
    """
    Also known as Shanon Entropy
    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
    """
    unique, count = np.unique(Y, return_counts=True, axis=0)
    prob = count/len(Y)
    en = np.sum((-1)*prob*np.log2(prob))
    return en


#Joint Entropy
def jEntropy(Y,X):
    """
    H(Y;X)
    Reference: https://en.wikipedia.org/wiki/Joint_entropy
    """
    YX = np.c_[Y,X]
    return entropy(YX)

#Conditional Entropy
def cEntropy(Y, X):
    """
    conditional entropy = Joint Entropy - Entropy of X
    H(Y|X) = H(Y;X) - H(X)
    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
    """
    return jEntropy(Y, X) - entropy(X)


#Information Gain
def gain(Y, X):
    """
    Information Gain, I(Y;X) = H(Y) - H(Y|X)
    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
    """
    return entropy(Y) - cEntropy(Y,X)

## non array features

In [70]:
data.drop([ 'user_iab_category_ids', 'user_iab_category_clicks',
       'user_iab_category_recs', 'user_syndicator_ids',
       'user_syndicator_clicks', 'user_syndicator_recs', 'user_item_ids',
       'user_item_clicks','target_named_entities_ids'], axis=1,inplace= True)

In [71]:
# the calculation is on numbers so we turn all features that have object code into numerical code
cols= data.select_dtypes(include=['object']).columns

for c in cols:
    data[c] = data[c].astype("category").cat.codes

In [72]:
# create conditional entropy and mutual information dict for non-array features
from sklearn import metrics

non_arrays_entropy = {}
non_arrays_mutual_info={}
for c in data.columns:
    non_arrays_entropy[c] = cEntropy(data[c].values, data['is_click'].values)/entropy(data[c].values)
    non_arrays_mutual_info[c] = metrics.mutual_info_score(data[c].values, data['is_click'].values)

  non_arrays_entropy[c] = cEntropy(data[c].values, data['is_click'].values)/entropy(data[c].values)


In [73]:
# remove the iab category ids with low mutual infarmaton and high conditional entropy
remove_col_non_array1 = {k:v for k,v in non_arrays_mutual_info.items() if v<0.0001}
remove_col_non_array2 = {k:v for k,v in non_arrays_entropy.items() if v>0.99}
remove_col_non_array =  np.unique(list(remove_col_non_array1.keys()) + list(remove_col_non_array2.keys()) )

In [75]:
remove_col_non_array

array(['city', 'day_of_week_v2', 'empiric_clicks', 'gmt_offset',
       'iab_category', 'prev_syndicator_recs',
       'publisher_syndicator_empiric_clicks',
       'publisher_target_empiric_clicks', 'region', 'site_visitor',
       'target_item_type_string', 'target_item_upper_taxonomy',
       'time_of_day_v2', 'user_recs_v2'], dtype='<U35')

In [76]:
data.drop(remove_col_non_array, axis=1,inplace= True)

In [77]:
### Turn the categoricals features into one-hot encoding
categorical_features=['quality_level_string','browser_platform_string', 'title_detected_language', ]
for c in categorical_features:
    df1= pd.get_dummies(data[c] , prefix=c)
    data.drop([c], axis=1, inplace=True)
    data = pd.concat([data, df1], axis=1)

In [80]:
data.to_parquet('ready data part 4/non_array.snappy.parquet',
              compression='snappy',engine='pyarrow')

## iab category ctr

In [1]:
import numpy as np
import pandas as pd

data = pd.read_parquet('data_for_starship_2022.snappy.parquet', engine='pyarrow',columns=['is_click', 'user_iab_category_ids', 'user_iab_category_clicks','user_iab_category_recs'])

In [2]:
category_ids_arrys = data['user_iab_category_ids']
category_ids_arrys=category_ids_arrys.to_numpy()
all_items_id=np.concatenate(category_ids_arrys, axis=None)
category_ids = pd.DataFrame(all_items_id).value_counts().index.to_list()
index_list = []
for cell in category_ids:
    index_list.append(cell[0])
category_ctr_data = pd.DataFrame(0.0, index=np.arange(len(data)), columns=index_list)
category_ctr_data.shape

(2500000, 92)

In [3]:
category_ids_arrys = data['user_iab_category_ids']
category_clicks_arrys = data['user_iab_category_clicks']
category_recs_arrys = data['user_iab_category_recs']

for row_index in range(len(category_ids_arrys)):
    id_vals = category_ids_arrys[row_index]
    for i in range(len(id_vals)):
        if float(category_recs_arrys[row_index][i]) == 0:
            category_ctr_data.at[row_index, id_vals[i]] = 0
        else:
            category_ctr_data.at[row_index, id_vals[i]] = float(category_clicks_arrys[row_index][i])/float(category_recs_arrys[row_index][i]) 
category_ctr_data = category_ctr_data.add_suffix('_c_ctr')

In [4]:
category_ctr_data.to_parquet('category_ctr_data.snappy.parquet', compression='snappy',engine='pyarrow')

In [9]:
from sklearn import metrics

iab_category_entropy = {}
iab_category_mutual_info={}
for c in category_ctr_data.columns:
    iab_category_entropy[c] = cEntropy(category_ctr_data[c].values, data['is_click'].values)/entropy(category_ctr_data[c].values)
    iab_category_mutual_info[c] = metrics.mutual_info_score(category_ctr_data[c].values, data['is_click'].values)





  iab_category_entropy[c] = cEntropy(category_ctr_data[c].values, data['is_click'].values)/entropy(category_ctr_data[c].values)
  iab_category_entropy[c] = cEntropy(category_ctr_data[c].values, data['is_click'].values)/entropy(category_ctr_data[c].values)
  iab_category_entropy[c] = cEntropy(category_ctr_data[c].values, data['is_click'].values)/entropy(category_ctr_data[c].values)
  iab_category_entropy[c] = cEntropy(category_ctr_data[c].values, data['is_click'].values)/entropy(category_ctr_data[c].values)
  iab_category_entropy[c] = cEntropy(category_ctr_data[c].values, data['is_click'].values)/entropy(category_ctr_data[c].values)
  iab_category_entropy[c] = cEntropy(category_ctr_data[c].values, data['is_click'].values)/entropy(category_ctr_data[c].values)


In [10]:
import pickle

with open('ready data part 4/iab_category_entropy.pickle', 'wb') as handle:
    pickle.dump(iab_category_entropy, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('ready data part 4/iab_category_mutual_info.pickle', 'wb') as handle:
    pickle.dump(iab_category_mutual_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# remove the features with low mutual infarmaton and high conditional entropy
remove_col_iab_category1 = {k:v for k,v in iab_category_mutual_info.items() if v<0.0001}
remove_col_iab_category2 = {k:v for k,v in iab_category_entropy.items() if v>0.99}
remove_col_iab_category =  np.unique(list(remove_col_iab_category1.keys()) + list(remove_col_iab_category2.keys()) )

In [14]:
category_ctr_data.drop(remove_col_iab_category, axis=1,inplace= True)

In [15]:
category_ctr_data.to_parquet('ready data part 4/category_ctr.snappy.parquet',
              compression='snappy',engine='pyarrow')

## user item clicks
#### there are many item so the conditional entropy and mutual information calculation is taking too long. first we will create a data frame with the items that appear frequently.

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics

data= pd.read_parquet('data_for_starship_2022.snappy.parquet', engine='pyarrow',columns=[ 'user_item_ids',
       'user_item_clicks','is_click'] )

In [2]:
item_ids = np.concatenate(data['user_item_ids'].to_numpy() , axis= None)
unique, counts = np.unique(item_ids, return_counts=True)

In [3]:
sort_order = counts.argsort()
sorted_counts = counts[sort_order]
sorted_unique = unique[sort_order]

In [4]:
first = int(len(sorted_unique)- (len(sorted_unique)/2000))
insertes_columns=sorted_unique[first:]

In [5]:
items_clicks_data = pd.DataFrame(0.0, index=np.arange(len(data)), columns=insertes_columns)

item_columms_set= set(insertes_columns)
for row_index in range(len(data)):
    val = data['user_item_ids'][row_index]
    for i in range(len(val)):
        if val[i] in item_columms_set:
            items_clicks_data.at[row_index, val[i]] = float(data['user_item_clicks'][row_index][i])
items_clicks_data = items_clicks_data.add_suffix('_i_clicks')

In [6]:
items_clicks_data.to_parquet('ready data part 4/items_clicks_data.snappy.parquet',
              compression='snappy',engine='pyarrow')

#### now we can calculate the conditional entropy

In [11]:
#items_clicks_data= pd.read_parquet('items_clicks_data.snappy.parquet', engine='pyarrow')
items_clicks_entropy = {}
items_clicks_mutual_info={}
for c in items_clicks_data.columns:
    items_clicks_entropy[c] = cEntropy(items_clicks_data[c].values,
                                       data['is_click'].values)/entropy(items_clicks_data[c].values)
    items_clicks_mutual_info[c] = metrics.mutual_info_score(items_clicks_data[c].values,
                                                            data['is_click'].values)

In [26]:
import pickle

with open('items_clicks_entropy.pickle', 'wb') as handle:
    pickle.dump(items_clicks_entropy, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('items_clicks_mutual_info.pickle', 'wb') as handle:
    pickle.dump(items_clicks_mutual_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
remove_col_item_clicks1 = {k:v for k,v in items_clicks_mutual_info.items() if v<0.0001}
len(remove_col_item_clicks1)

188

In [17]:
remove_col_item_clicks2 = {k:v for k,v in items_clicks_entropy.items() if v>0.99}
len(remove_col_item_clicks2)

38

In [20]:
# remove the features with low mutual infarmaton and high conditional entropy
remove_col_item_clicks1 = {k:v for k,v in items_clicks_mutual_info.items() if v<0.0001}
remove_col_item_clicks2 = {k:v for k,v in items_clicks_entropy.items() if v>0.99}
remove_col_item_clicks =  np.unique(list(remove_col_item_clicks1.keys()) + list(remove_col_item_clicks2.keys()))
len(remove_col_item_clicks)

188

In [27]:
items_clicks_data.drop(remove_col_item_clicks, axis=1,inplace= True)

In [28]:
items_clicks_data.to_parquet('ready data part 4/items_clicks.snappy.parquet',
              compression='snappy',engine='pyarrow')

In [3]:
items_ids_arrys = data['user_item_ids'].to_numpy() 
all_items_id=np.unique(np.concatenate(items_ids_arrys, axis=None))
user_item_c_entropy_vals={}
user_item_mutual_info_vals={}

In [None]:
for i in range (11135):
    curr_cols = all_items_id[i:66*(i+1)]
    items_clicks_data = pd.DataFrame(0.0, index=np.arange(len(data)), columns=curr_cols)
    
    item_columms_set= set(curr_cols)
    for row_index in range(len(items_ids_arrys)):
        val =  data['user_item_ids'][row_index]
        for i in range(len(val)):
            if val[i] in item_columms_set:
                items_clicks_data.at[row_index, val[i]] = float( data['user_item_clicks'][row_index][i])   
    #items_clicks_data = items_clicks_data.add_suffix('_item_clicks')
    
    for c in items_clicks_data.columns:
        user_item_c_entropy_vals[c] = cEntropy(items_clicks_data[c].values, data['is_click'].values)
        
    for c in items_clicks_data.columns:
        user_item_mutual_info_vals[c] = metrics.mutual_info_score(items_clicks_data[c].values, data['is_click'].values)

In [None]:
import pickle
# save dictionary to pickle file
with open('user_item_conditional_entropy.pickle', 'wb') as file:
    pickle.dump(user_item_c_entropy_vals, file, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('user_item_mutual_information.pickle', 'wb') as file:
    pickle.dump(user_item_mutual_info_vals, file, protocol=pickle.HIGHEST_PROTOCOL)

## Syndicator 

In [None]:
import numpy as np
import pandas as pd 

data = pd.read_parquet('D:\starship\old data\data_for_starship_2022.snappy.parquet', engine='pyarrow',columns=['user_syndicator_ids',
       'user_syndicator_clicks', 'user_syndicator_recs','is_click'])
data.head()
syndicator_ids_arrys = data['user_syndicator_ids'].to_numpy()
all_syndicator_ids=np.concatenate(syndicator_ids_arrys, axis=None)
syndicator_ids =np.unique(all_syndicator_ids)
len(syndicator_ids)

In [None]:
import time
import pickle

clicks_dict = {}
recks_dict = {}
start = time.time()
sindicator_dict={}
for chunk in range(211):
    place= chunk * 97
    syndicator_clicks_data = pd.DataFrame(0.0, index=np.arange(2500000),columns = syndicator_ids[place : place + 97 ])
    syndicator_recks_data = pd.DataFrame(0.0, index=np.arange(2500000),columns = syndicator_ids[place : place + 97 ])
    sin_set = set(syndicator_ids[place : place + 97 ])

    for syndicator in sin_set:
        counter = 0
        for i in range (250000):
            index = np.where (syndicator_ids_arrys[i] == syndicator)
            if index[0].size > 0:
                syndicator_clicks_data[syndicator].iloc[i] = data['user_syndicator_clicks'].iloc[i][index[0][0]]
                syndicator_recks_data[syndicator].iloc[i] = data['user_syndicator_recs'].iloc[i][index[0][0]]
        clicks_dict[syndicator] =  cEntropy(syndicator_clicks_data[syndicator].values, data['is_click'].values) / entropy(syndicator_clicks_data[syndicator].values)
        recks_dict[syndicator] =  cEntropy(syndicator_recks_data[syndicator].values, data['is_click'].values) / entropy(syndicator_recks_data[syndicator].values)

    
    print(chunk, time.time()-start)
    start = time.time()
name = 'clicks_normalize_entropy_dict.pkl'
a_file = open(name, "wb")
pickle.dump(clicks_dict, a_file)
a_file.close()

name = 'recks_normalize_entropy_dict.pkl'
a_file = open(name, "wb")
pickle.dump(recks_dict, a_file)
a_file.close()

In [None]:
import time
import pickle
from sklearn import metrics

sindicator_clicks_mutual = {}
sindicator_recks_mutual = {}
start = time.time()
sindicator_dict={}
for chunk in range(211):
    place= chunk * 97
    syndicator_clicks_data = pd.DataFrame(0.0, index=np.arange(2500000),columns = syndicator_ids[place : place + 97 ])
    syndicator_recks_data = pd.DataFrame(0.0, index=np.arange(2500000),columns = syndicator_ids[place : place + 97 ])
    sin_set = set(syndicator_ids[place : place + 97 ])
    counter = 0
    for syndicator  in sin_set:
        
        for i in range (250000):
            index = np.where (syndicator_ids_arrys[i] == syndicator)
            if index[0].size > 0:
                syndicator_clicks_data[syndicator].iloc[i] = data['user_syndicator_clicks'].iloc[i][index[0][0]]
                syndicator_recks_data[syndicator].iloc[i] = data['user_syndicator_recs'].iloc[i][index[0][0]]
        sindicator_clicks_mutual[syndicator] = metrics.mutual_info_score(syndicator_clicks_data[syndicator].values, data['is_click'].values)
        sindicator_recks_mutual[syndicator] = metrics.mutual_info_score(syndicator_recks_data[syndicator].values, data['is_click'].values)
        print(chunk,counter, time.time() - start)
        counter = counter + 1
    print(chunk, time.time()-start)
    start = time.time()
  
name = 'clicks_mutual_dict.pkl'
a_file = open(name, "wb")
pickle.dump(sindicator_clicks_mutual, a_file)
a_file.close()

name = 'recks_mutual_dict.pkl'
a_file = open(name, "wb")
pickle.dump(sindicator_recks_mutual, a_file)
a_file.close()

## target_named_entities_ids
#### Multi-hot encodinf to the feature target_named_entities_ids

In [29]:
data= pd.read_parquet('data_for_starship_2022.snappy.parquet', engine='pyarrow',columns=[ 'target_named_entities_ids','is_click'] )
user_item_ids=np.concatenate(data['target_named_entities_ids'].to_numpy(), axis=None)
unique, counts = np.unique(user_item_ids, return_counts=True)

In [30]:
sort_order = counts.argsort()
sorted_counts = counts[sort_order]
sorted_unique = unique[sort_order]

In [33]:
first = int(len(sorted_unique)- (len(sorted_unique)/100))
insertes_columns=sorted_unique[first:]

In [35]:
target_named_entities_ids_data = pd.DataFrame(0, index=np.arange(len(data)), columns=insertes_columns)

columms_set= set(insertes_columns)
for row_index in range(len(target_named_entities_ids_data)):
    val =  data['target_named_entities_ids'][row_index]
    for i in range(len(val)):
        if val[i] in columms_set:
            target_named_entities_ids_data.at[row_index, val[i]] = 1
target_named_entities_ids_data = target_named_entities_ids_data.add_suffix('_entities_ids')

In [36]:
target_named_entities_ids_data.to_parquet('ready data part 4/target_named_entities_ids_most_common.snappy.parquet',
              compression='snappy',engine='pyarrow')

#### now we can calculate the conditional entropy

In [37]:
target_named_entities_entropy = {}
target_named_entities_mutual_info={}
for c in target_named_entities_ids_data.columns:
    target_named_entities_entropy[c] = cEntropy(target_named_entities_ids_data[c].values, 
                                                data['is_click'].values)/entropy(target_named_entities_ids_data[c].values)
    target_named_entities_mutual_info[c] = metrics.mutual_info_score(target_named_entities_ids_data[c].values, 
                                                                     data['is_click'].values)

In [41]:
import pickle

with open('target_named_entities_entropy.pickle', 'wb') as handle:
    pickle.dump(target_named_entities_entropy, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('target_named_entities_mutual_info.pickle', 'wb') as handle:
    pickle.dump(target_named_entities_mutual_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [45]:
# remove the features with low mutual infarmaton and high conditional entropy
remove_col_item_clicks1 = {k:v for k,v in target_named_entities_mutual_info.items() if v<0.0001}
remove_col_item_clicks2 = {k:v for k,v in target_named_entities_entropy.items() if v>0.99}
remove_col_item_clicks =  np.unique(list(remove_col_item_clicks1.keys()) + list(remove_col_item_clicks2.keys()) )

In [48]:
target_named_entities_ids_data.drop(remove_col_item_clicks, axis=1,inplace= True)

In [49]:
target_named_entities_ids_data.to_parquet('ready data part 4/target_named_entities_ids.snappy.parquet',
              compression='snappy',engine='pyarrow')

## concat everything together

In [1]:
import pandas as pd
import numpy as np
import glob
import os

path = r'C:\Users\שחר\STARSHIP\ready data part 4'
all_files = glob.glob(path + "/*.parquet")
print(all_files)
df = pd.concat(map(pd.read_parquet, all_files), axis=1)

['C:\\Users\\שחר\\STARSHIP\\ready data part 4\\category_ctr.snappy.parquet', 'C:\\Users\\שחר\\STARSHIP\\ready data part 4\\items_clicks.snappy.parquet', 'C:\\Users\\שחר\\STARSHIP\\ready data part 4\\non_array.snappy.parquet', 'C:\\Users\\שחר\\STARSHIP\\ready data part 4\\target_named_entities_ids.snappy.parquet']


In [4]:
df.to_parquet('ready data part 4/data.snappy.parquet',
              compression='snappy',engine='pyarrow')

## cat boost 100-3 over the new data set

In [5]:
from sklearn.model_selection import train_test_split

y = df['is_click']
del df['is_click']

X_train, X_test, y_train, y_test = train_test_split( df, y, test_size=0.2, random_state=42)

In [6]:
X_train.to_csv('ready data part 4/X_train_data.csv',index=False)
X_test.to_csv('ready data part 4/X_test_data.csv',index=False)
y_train.to_csv('ready data part 4/y_train_data.csv',index=False)
y_test.to_csv('ready data part 4/y_test_data.csv',index=False)

In [7]:
def evaluate(test_set):
    y_pred = cat_model.predict(test_set)
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Recall:",metrics.recall_score(y_test, y_pred))
    print("F-Measure:",metrics.f1_score(y_test, y_pred))
    print("Precision:",metrics.precision_score(y_test, y_pred))

In [8]:
from sklearn import metrics
from catboost import CatBoostClassifier

cat_model = CatBoostClassifier()
cat_model.fit(X_train  , y_train.values.ravel(),verbose=False)
evaluate(X_test)

Accuracy: 0.78457
Recall: 0.8408310467112758
F-Measure: 0.7955734627917702
Precision: 0.7549390030867644
