In [1]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): none of the cells visible in this notebook read from or write to
# /content/drive (all paths are under /content) — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Replace the preinstalled scikit-learn with the pinned 1.5.2 release.
# NOTE(review): the reason for the pin is not stated in this notebook — presumably it has to
# match the scoring environment; confirm.
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.5.2

Found existing installation: scikit-learn 1.6.0
Uninstalling scikit-learn-1.6.0:
  Successfully uninstalled scikit-learn-1.6.0
Collecting scikit-learn==1.5.2
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
Successfully installed scikit-learn-1.5.2


In [3]:
# Sanity check: the kernel should now report the pinned scikit-learn version.
import sklearn
sklearn.__version__

'1.5.2'

In [4]:
# Install an additional Python interpreter through apt, register it as `python3`, and bootstrap pip for it.
# NOTE(review): apt package names normally carry only major.minor (e.g. python3.8) — confirm
# that `python3.8.20` / `python3.8.20-distutils` actually resolve.
# NOTE(review): `update-alternatives --config` looks interactive — confirm it does not block "Run all".
# NOTE(review): presumably this changes only the shell's `python3`, not the running notebook kernel — verify it is needed.
!sudo apt-get update
!sudo apt-get install python3.8.20 python3.8.20-distutils -y
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8.20 1
!sudo update-alternatives --config python3
!wget https://bootstrap.pypa.io/get-pip.py
!python3 get-pip.py


Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,199 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,560 kB]
Get:13 http://security.ubuntu.com/ubuntu j

###Importing and installing packages

In [5]:
import numpy as np
import pandas as pd
import sqlite3
import xgboost
import json
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedShuffleSplit

from google.colab import data_table
data_table.enable_dataframe_formatter()

import warnings
warnings.filterwarnings('ignore')

##Functions:

In [6]:
def target_extract(conn):
    """
    Fetch the label (Target) attached to every Device_ID.

    Parameters
    ----------
    conn: connection
        An open connection to the database that holds the `data` table.

    Returns
    -------
    dataframe
        One row per distinct (Device_ID, Target) pair, with Target cast to int.
    """
    query = '''SELECT DISTINCT Device_ID, Target
             FROM data;
             '''
    cursor = conn.cursor()
    cursor.execute(query)
    labels = pd.DataFrame(cursor.fetchall(), columns = ['Device_ID', 'Target'])
    # Target may come back as text/float from the database; normalise it to int.
    return labels.assign(Target=labels['Target'].astype('int'))

In [7]:
def rename_and_16_convert(dataset,prefix):
    """
    Shrink the feature columns to float16 and give them unique, prefixed names.

    Every column except Device_ID is multiplied by 1000, cast to float16 and renamed to
    '<prefix>_<original name>'. The input frame is left untouched (the previous
    implementation scaled the caller's frame in place).

    Parameters
    ----------
    dataset: dataframe
        A dataframe containing a Device_ID column plus the feature columns to process.
    prefix : str
        A string prepended to every feature name.

    Returns
    -------
    dataframe
        The processed feature columns followed by the Device_ID column.
    """
    feature_cols = [col for col in dataset.columns if col != 'Device_ID']
    device_ids = dataset['Device_ID']

    # Build a new frame instead of `*=` on the argument so the caller's data is not modified.
    # The x1000 scaling is applied before the float16 cast, as in the original code.
    scaled = (dataset[feature_cols] * 1000).astype('float16')
    scaled.columns = [f'{prefix}_{col}' for col in feature_cols]

    return pd.concat([scaled, device_ids], axis=1)

In [8]:
def relative_domain(conn, device_list):
    """
    Feature engineering: for each Device_ID, compute the share of its rows that belongs to each Domain_Name.

    Parameters
    ----------
    conn: connection
        A connection object to the database.
    device_list : list
        The Device_IDs to compute proportions for.

    Returns
    -------
    dataframe
        One row per Device_ID, one column per Domain_Name (0 where the device never
        hit that domain), plus a trailing Device_ID column.
    """
    # The IDs are inlined into the IN (...) clause as a comma-separated list.
    id_csv = ', '.join(map(str, device_list))
    query = f'''SELECT DISTINCT
                Device_ID,
                Domain_Name,
                Domain_Name_count*1.0 / SUM(Domain_Name_count) OVER (PARTITION BY Device_ID) AS relative_domain
            FROM (
              SELECT Device_ID, Domain_Name, COUNT(*) as Domain_Name_count
            		FROM data
            		WHERE Device_ID IN (''' +id_csv+''')
                    GROUP BY Device_ID, Domain_Name
            		) subquery;'''
    cursor = conn.cursor()
    cursor.execute(query)
    long_form = pd.DataFrame(cursor.fetchall(), columns = ['Device_ID','Domain_Name', 'relative_domain'])

    # Long -> wide: devices as rows, domains as columns.
    wide = long_form.pivot_table(index='Device_ID', columns='Domain_Name', values='relative_domain', fill_value=0)
    # Move Device_ID from the index into a regular (last) column.
    return wide.assign(Device_ID=wide.index).reset_index(drop=True)

In [9]:
def cls_proportion(conn, device_list):
    """
    Feature engineering: for each Device_ID, compute the share of each domain class among
    all its non-zero Domain_cls1..Domain_cls4 values.

    Parameters
    ----------
    conn: connection
        A connection object to the database.
    device_list : list
        The Device_IDs to compute proportions for.

    Returns
    -------
    dataframe
        One row per Device_ID, one column per Domain_cls (0 where the class never
        occurs for the device), plus a trailing Device_ID column.
    """
    # The IDs are inlined into each IN (...) clause as a comma-separated list.
    id_csv = ', '.join(map(str, device_list))
    # The four class columns are stacked with UNION ALL; 0 entries are excluded.
    query = '''SELECT
                    Device_ID,
                    Domain_cls,
                    CAST(count_cls AS REAL) / SUM(count_cls) OVER (PARTITION BY Device_ID) AS proportion
                    FROM
                    (SELECT Device_ID, Domain_cls , COUNT(*) AS count_cls
                    FROM (
                        SELECT Device_ID, Domain_cls1 AS Domain_cls FROM data WHERE (Domain_cls1 != 0 AND Device_ID IN (''' +id_csv+'''))
                        UNION ALL
                        SELECT Device_ID, Domain_cls2 AS Domain_cls FROM data WHERE (Domain_cls2 != 0 AND Device_ID IN (''' +id_csv+'''))
                        UNION ALL
                        SELECT Device_ID, Domain_cls3 AS Domain_cls FROM data WHERE (Domain_cls3 != 0 AND Device_ID IN (''' +id_csv+'''))
                        UNION ALL
                        SELECT Device_ID, Domain_cls4 AS Domain_cls FROM data WHERE (Domain_cls4 != 0 AND Device_ID IN (''' +id_csv+'''))
                    ) AS combined
                    WHERE Domain_cls!=0
                    GROUP BY Device_ID, Domain_cls
                    ORDER BY Device_ID, Domain_cls)
                    subquery;'''

    cursor = conn.cursor()
    cursor.execute(query)
    long_form = pd.DataFrame(cursor.fetchall(), columns = ['Device_ID','Domain_cls', 'proportion'])

    # Long -> wide: devices as rows, classes as columns.
    wide = long_form.pivot_table(index='Device_ID', columns='Domain_cls', values='proportion', fill_value=0)
    # Move Device_ID from the index into a regular (last) column.
    return wide.assign(Device_ID=wide.index).reset_index(drop=True)

In [10]:
def avg_relative_entrances_device_id(conn, hours_duration, device_list):
    """
    Feature engineering: for each Device_ID, the average share of daily hits that falls in each part of the day.

    For every device the per-day proportions of each day part are summed
    (by sum_relative_entrances_timerange) and divided by the number of days on which the
    device has any rows (from count_day_device_id).

    Parameters
    ----------
    conn: connection
        A connection object to the database.
    hours_duration : int
        Length of each day part in hours (the day is divided into 24/'hours_duration' parts).
    device_list : list
        The Device_IDs to compute proportions for.

    Returns
    -------
    dataframe
        One row per Device_ID, one column per time_range, plus a trailing Device_ID column.
    """
    df_sum_relative = sum_relative_entrances_timerange(conn, hours_duration, device_list)

    # Make sure every (Device_ID, time_range) combination has a row, so that each day part
    # becomes a column after the pivot even when no device was active in it.
    wanted_pairs = set(pd.MultiIndex.from_product(
        [df_sum_relative['Device_ID'].unique(), range(int(24/hours_duration))],
        names=['Device_ID', 'time_range']))
    # zip over the two columns instead of a row-wise DataFrame.apply: same tuples, no per-row Series.
    present_pairs = set(zip(df_sum_relative['Device_ID'], df_sum_relative['time_range']))
    missing_rows = [pair + (0,) for pair in wanted_pairs - present_pairs]
    missing_df = pd.DataFrame(missing_rows, columns = list(df_sum_relative.columns))
    df_sum_relative = pd.concat([df_sum_relative, missing_df], axis=0).reset_index(drop=True)

    df_days_count = count_day_device_id(conn, device_list)

    # Average over the device's active days.
    df = pd.merge(df_sum_relative, df_days_count, how ='left', on ='Device_ID')
    df['relative_part'] = df['sum_relative_part']/df['day_num']

    # Long -> wide; only 'relative_part' is used as values, so the helper columns need no explicit drop.
    df = df.pivot_table(index='Device_ID', columns='time_range', values='relative_part', fill_value=0)
    df['Device_ID'] = df.index
    df.reset_index(inplace=True, drop=True)

    return df

In [11]:
def sum_relative_entrances_timerange(conn, hours_duration, device_list):
    """
    For each Device_ID and day part, sum over all days the share of that day's hits that fell in the part.

    Parameters
    ----------
    conn: connection
        A connection object to the database.
    hours_duration : int
        Length of each day part in hours; a row's part is its hour (strftime '%H') integer-divided by this value.
    device_list:  list
        The Device_IDs to compute the sums for.

    Returns
    -------
    dataframe
        Columns Device_ID, time_range and sum_relative_part; only combinations that occur in the data are returned.
    """
    # The IDs are inlined into the IN (...) clause as a comma-separated list.
    id_csv = ', '.join(map(str, device_list))
    # Innermost query: tag each row with its date and day part.
    # Middle query: per (device, date, part) share of that date's rows.
    # Outer query: sum those shares over dates.
    query = f'''SELECT DISTINCT
                        Device_ID,
                        time_range,
                        SUM (relative_part) OVER (PARTITION BY Device_ID,time_range) AS sum_relative_part
                    FROM(

                            SELECT distinct
                                            Device_ID,
                                            date,
                                            time_range,
                                            CAST(COUNT(*) OVER (PARTITION BY Device_ID,date,time_range) AS REAL) / COUNT(*) OVER (PARTITION BY Device_ID,date) AS relative_part
                                        FROM
                                                    (SELECT
                                                            Device_ID,
                                                            Datetime,
                                                            strftime('%Y-%m-%d', Datetime) AS date,
                                                            (CAST(strftime('%H', Datetime) AS INTEGER) / {hours_duration}) AS time_range
                                                        FROM
                                                            data
                                                        WHERE
                                                            Device_ID IN (''' +id_csv+''')
                                                    ) subquery
                        ) subquery
                        ;'''
    cursor = conn.cursor()
    cursor.execute(query)
    rows = cursor.fetchall()
    return pd.DataFrame(rows, columns = ['Device_ID','time_range', 'sum_relative_part'])

In [12]:
def count_day_device_id(conn, device_list):
    """
    Count, for each Device_ID, the number of distinct calendar dates on which it has at least one row.

    Parameters
    ----------
    conn: connection
        A connection object to the database.
    device_list:  list
        The Device_IDs to count days for.

    Returns
    -------
    dataframe
        Columns Device_ID and day_num (number of distinct dates).
    """
    # The IDs are inlined into the IN (...) clause as a comma-separated list.
    id_csv = ', '.join(map(str, device_list))
    query = f'''
                SELECT
                    Device_ID,
                    COUNT(DISTINCT strftime('%Y-%m-%d', Datetime)) AS day_num
                FROM
                    data
                WHERE
                    Device_ID IN (''' +id_csv+''')
                GROUP BY
                    Device_ID
                ;'''
    cursor = conn.cursor()
    cursor.execute(query)
    rows = cursor.fetchall()
    return pd.DataFrame(rows, columns = ['Device_ID', 'day_num'])

In [13]:
def corresponding_columns_training_set(df_train_col_list, df):
    """
    Align a dataframe's columns with the training-set feature list.

    Columns that are not in df_train_col_list are left out, columns that are missing are
    added as float16 zeros, and the result follows the order of df_train_col_list.
    The input frame is not modified, and the padding is built on df's own index so that
    frames with a non-default index are not misaligned by the concat.

    Parameters
    ----------
    df_train_col_list: list
        List of features from the training set.
    df:  dataframe
        A dataset whose columns will be aligned with df_train_col_list.

    Returns
    -------
    dataframe
        A new dataframe with exactly the training-set columns, in training-set order.
    """
    # De-duplicate while keeping the training order.
    train_cols = list(dict.fromkeys(df_train_col_list))
    present = [col for col in train_cols if col in df.columns]
    missing = [col for col in train_cols if col not in df.columns]

    # Zero padding for features never seen in this dataset; shares df's index so rows line up.
    padding = pd.DataFrame(0, index=df.index, columns=missing).astype('float16')
    aligned = pd.concat([df[present], padding], axis=1)
    return aligned[train_cols]

#Model

##Download the data and split it into training and validation sets.

In [14]:
# gdown is used below to fetch the dataset archives from Google Drive.
!pip install gdown



Select the data for model training.
(Note that running the notebook on the full dataset is not possible in a standard Colab runtime.)

In [15]:
dataset = "mini dataset" #@param ["full dataset", "mini dataset"]

print('You selected', dataset)
# Download and unpack the chosen archive, then open its SQLite database as `conn`;
# the SQL-based feature functions in this notebook all query through this connection.
if dataset == "mini dataset":
    !gdown --fuzzy "https://drive.google.com/file/d/1NdgwhvUGxVxmv14GTEEgfvlzco_N5Ytc/view?usp=sharing"
    !unzip "/content/mini_training_set.zip"
    conn = sqlite3.connect("/content/mini_training_set.db")
else:
    !gdown --fuzzy "https://drive.google.com/file/d/1DdJFPcaOGa3grUdygEti8jaWjrReyeQU/view?usp=sharing"
    !unzip "/content/training_set.zip"
    conn = sqlite3.connect("/content/training_set.db")

You selected mini dataset


In [16]:
#Extract the target variable
target_df = target_extract(conn)

In [17]:
# Stratified 90/10 split of the devices, so both parts keep the Target class proportions.
s = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)

# n_splits=1, so the generator yields exactly one (train, test) pair of positional indexes.
train_index, test_index = next(s.split(target_df.Target.values, target_df.Target.values))

# Take explicit copies: later cells sort, drop and assign on these frames in place,
# which should not operate on slices of target_df.
train_target = target_df.iloc[train_index, :].copy()
test_target = target_df.iloc[test_index, :].copy()

train_device = list(train_target.Device_ID)
test_device = list(test_target.Device_ID)

##Feature engineering:

Three types of features are engineered for each Device_ID:
1. The proportional part of each Domain_Name.
2. The proportional part of each Domain_cls.
3. The proportional part of internet usage by time ranges of a day.

Select duration of day's bins (hours) for features engineering:

In [None]:
# Width (in hours) of each time-of-day bin used for the 'ts' features.
# NOTE: model.py hard-codes self.hours_duration = 3; keep the two values in sync, otherwise
# inference bins the day differently from training.
hours_duration = "3" # @param ["2", "3", "4", "6", "8"]
hours_duration = int(hours_duration)

###Training set

In [19]:
# Per-device Domain_Name proportions, scaled x1000, cast to float16 and prefixed 'Domain_'.
domain_name_feat = relative_domain(conn, train_device)
domain_name_feat = rename_and_16_convert(domain_name_feat,'Domain')

In [20]:
# Per-device domain-class proportions, scaled x1000, cast to float16 and prefixed 'cls_'.
cls_name_feat = cls_proportion(conn, train_device)
cls_name_feat = rename_and_16_convert(cls_name_feat,'cls')

In [None]:
# Per-device average share of hits per day part, scaled x1000, cast to float16 and prefixed 'ts_'.
ts_feat = avg_relative_entrances_device_id(conn, hours_duration, train_device)
ts_feat = rename_and_16_convert(ts_feat,'ts')

In [None]:
# Join the three feature groups on Device_ID. Left joins keep every device present in the
# domain features; a device absent from a later group gets NaN in that group's columns.
df_train = pd.merge(domain_name_feat, cls_name_feat, how ='left', on ='Device_ID')
df_train = pd.merge(df_train, ts_feat, how ='left', on ='Device_ID')

In [23]:
# Snapshot of the training column names (still including Device_ID at this point).
# NOTE(review): no later cell visible here reads this name — the test-set alignment cell
# recomputes list(df_train.columns) — confirm whether it is still needed.
df_train_columns = list(df_train.columns)

###Test set

In [24]:
# Same Domain_Name features as for training, computed on the held-out devices.
domain_name_feat_test = relative_domain(conn, test_device)
domain_name_feat_test = rename_and_16_convert(domain_name_feat_test,'Domain')

In [25]:
# Same domain-class features as for training, computed on the held-out devices.
cls_name_feat_test = cls_proportion(conn, test_device)
cls_name_feat_test = rename_and_16_convert(cls_name_feat_test,'cls')

In [None]:
# Same time-of-day features as for training, computed on the held-out devices.
ts_feat_test = avg_relative_entrances_device_id(conn, hours_duration, test_device)
ts_feat_test = rename_and_16_convert(ts_feat_test,'ts')

In [None]:
# Join the three held-out feature groups on Device_ID, mirroring the training merge.
df_test = pd.merge(domain_name_feat_test, cls_name_feat_test, how ='left', on ='Device_ID')
df_test = pd.merge(df_test, ts_feat_test, how ='left', on ='Device_ID')

In [28]:
# Give the held-out frame exactly the training columns: drop unseen features, zero-fill missing ones.
# This must run before Device_ID is dropped from df_train, so that df_test keeps its Device_ID column.
df_test = corresponding_columns_training_set(list(df_train.columns), df_test)

##Data preparation

Sort the datasets and the labels by Device_ID so their rows line up, then drop the Device_ID column from both.


In [29]:
# Put features and labels in the same Device_ID order so that row i of each refers to the same device.
df_train.sort_values(by = ["Device_ID"], inplace = True)
df_train.reset_index(drop=True, inplace=True)

train_target.sort_values(by = ["Device_ID"], inplace = True)
train_target.reset_index(drop=True, inplace=True)

# The two frames are sorted independently, so verify they describe exactly the same devices
# before the key column is discarded; otherwise labels would silently pair with the wrong rows.
assert df_train["Device_ID"].tolist() == train_target["Device_ID"].tolist(), \
    "training features and labels are not aligned on Device_ID"

# Keep the IDs aside, then remove them so they are not used as a feature / label column.
df_train_Device_ID = df_train["Device_ID"]
df_train.drop(columns = ["Device_ID"], inplace = True)
train_target.drop(columns = ["Device_ID"], inplace = True)

train_target['Target'] = train_target.Target.astype('category')

In [30]:
# Put held-out features and labels in the same Device_ID order so that row i of each refers to the same device.
df_test.sort_values(by = ["Device_ID"], inplace = True)
df_test.reset_index(drop=True, inplace=True)

test_target.sort_values(by = ["Device_ID"], inplace = True)
test_target.reset_index(drop=True, inplace=True)

# The two frames are sorted independently, so verify they describe exactly the same devices
# before the key column is discarded; otherwise the AUC would be computed on mismatched rows.
assert df_test["Device_ID"].tolist() == test_target["Device_ID"].tolist(), \
    "validation features and labels are not aligned on Device_ID"

# Keep the IDs aside, then remove them so they are not used as a feature / label column.
df_test_Device_ID = df_test["Device_ID"]
df_test.drop(columns = ["Device_ID"], inplace = True)
test_target.drop(columns = ["Device_ID"], inplace = True)

test_target['Target'] = test_target.Target.astype('category')

##Feature elimination & Model training

Select features by 'RFE' and use the fitted estimator as the model.

In [31]:
# XGBoost regressor trained with the 'binary:logistic' objective; its predictions are used
# as class-1 scores for the AUC below.
# NOTE(review): eval_metric is presumably only consulted when an eval_set is passed to fit(),
# which does not happen here — confirm whether it has any effect.
xgb_reg = xgboost.XGBRegressor(random_state=0, subsample=0.8, colsample_bytree=0.8, learning_rate= 0.1,
                               n_estimators= 150, max_depth=6, objective ='binary:logistic' ,eval_metric =roc_auc_score)
# Recursive feature elimination down to 1000 features, removing up to 20000 of the
# least important features per elimination round.
selector = RFE(xgb_reg, n_features_to_select=1000, step=20000)
selector = selector.fit(df_train, train_target)

In [32]:
# Names of the features kept by RFE (selector.support_ is a boolean mask over df_train's columns).
best_features = list(df_train.columns[selector.support_])
# selector.estimator_ is the estimator RFE fitted on the selected features, so it is fed only those columns.
test_prediction = selector.estimator_.predict(df_test[best_features])
print(f'The auc for validation set: {round(roc_auc_score(test_target.Target,test_prediction), 3)}')

The auc for validation set: 0.762


###Save model & Best features

In [33]:
# Persist the fitted XGBoost estimator; model.py reloads this file ('XGB_model.json') in load().
selector.estimator_.save_model('/content/XGB_model.json')

In [34]:
# Save the selected feature names; model.py reads 'best_features.json' in load() to build
# the exact column set the estimator was trained on.
with open("/content/best_features.json", "w") as fp:
    json.dump(best_features, fp)

##Prepare submission

Attention!

Full submission includes the following files in a zip archive:


1.   model.py (must) - contains a class named "model". The class must have implementations of "load", "init" and "predict" functions:

    *   init - initialization function of the model class.
    *   load - a function that loads the model and model weights.
    *   predict - a function that receives the data of one Device_ID at a time (as a DataFrame) and returns a single prediction value.
    *   The file may contain other functions (within the class or outside of it)
    *   imports used by the class must be compatible with the permitted python packages.

2.   metadata (must)

    *   contain the command for running the model file - do not change this file
3.  model weights (optional)

    *   in this example, we demonstrate how to save a XGBoost regressor weights. however, these can be any kind of weights as long as they are compatible with the model and the permitted python packages.
    *   if the model depends on these weights, this file is mandatory.
4.  helper_func.py (optional)
    *   This file contains helper functions. The file can have a different name as long as it is compatible with model.py
    *   if the model depends on these helper functions, this file is mandatory.

Running the following cells will generate a zip file with a valid submission for the competition.

In [35]:
%%writefile helper_func.py
import numpy as np
import pandas as pd


def relative_domain(data):
    """
    Feature engineering: share of rows belonging to each Domain_Name.

    Parameters
    ----------
    data: dataframe
        Browsing data ('Domain_Name' column) of a single Device_ID.

    Returns
    -------
    dataframe
        A single-row dataframe (index 0) with one column per Domain_Name holding its proportion.
    """
    shares = data['Domain_Name'].value_counts(normalize=True)
    # Series -> one-row frame with the domains as columns.
    return shares.to_frame().T.reset_index(drop=True)



def cls_proportion(data):
    """
    Feature engineering: share of each domain class among all non-zero class values.

    Parameters
    ----------
    data: dataframe
        Browsing data (the four 'Domain_cls*' columns) of a single Device_ID.

    Returns
    -------
    dataframe
        A single-row dataframe (index 0) with one column per Domain_cls holding its proportion.
    """
    class_columns = ['Domain_cls1', 'Domain_cls2', 'Domain_cls3', 'Domain_cls4']
    # Pool the four class columns into one flat array of labels.
    labels = data[class_columns].to_numpy().ravel()
    # Zero entries are excluded from the counts.
    non_zero = labels[labels != 0]
    shares = pd.Series(non_zero).value_counts(normalize=True)
    return shares.to_frame().T.reset_index(drop=True)


def avg_relative_entrances_device_id(data, hours_duration):
    """
    Feature engineering: average share of daily hits that falls in each part of the day.

    For every calendar date (in UTC) the hits are split into day parts and turned into
    proportions of that date's hits; the proportions are then averaged over all dates present.

    Parameters
    ----------
    data: dataframe
        Browsing data ('Datetime' column) of a single Device_ID.
    hours_duration : int
        Length of each day part in hours (the day is divided into 24/'hours_duration' parts).

    Returns
    -------
    dataframe
        A single-row dataframe with one column per day part holding its average proportion.
    """

    # Parse straight to UTC. Without utc=True, timestamps carrying different UTC offsets
    # (e.g. +02:00 and +03:00 across a DST change) come back as plain objects and the
    # `.dt` accessor fails. The timezone is then stripped to get naive UTC values.
    stamps = pd.to_datetime(data['Datetime'], utc=True).dt.tz_localize(None)
    df = stamps.to_frame()

    part_length = hours_duration
    num_parts = 24 // part_length

    # Assign part of the day to each timestamp
    df['part_of_day'] = df['Datetime'].dt.hour // part_length

    # Rows: dates, columns: day parts, values: share of that date's hits
    date_groups = df.groupby([df['Datetime'].dt.date, 'part_of_day']).size().unstack(fill_value=0)
    date_groups = date_groups.divide(date_groups.sum(axis=1), axis=0)

    # Add day parts in which the device was never active, so the columns are always complete
    for i in range(num_parts):
        if i not in date_groups.columns:
            date_groups[i] = 0
    date_groups = date_groups.sort_index(axis=1)

    # Mean over the dates on which the device has any hits
    average_proportions = date_groups.sum(axis=0)/date_groups.shape[0]
    return average_proportions.to_frame().T


def corresponding_columns_training_set(df_train_col_list, df):
    """
    Align a dataframe's columns with the training-set feature list.

    Columns that are not in df_train_col_list are left out, columns that are missing are
    added as float16 zeros, and the result follows the order of df_train_col_list.
    The input frame is not modified, and the padding is built on df's own index so that
    frames with a non-default index are not misaligned by the concat.

    Parameters
    ----------
    df_train_col_list: list
        List of features from the training set.
    df:  dataframe
        A dataset whose columns will be aligned with df_train_col_list.

    Returns
    -------
    dataframe
        A new dataframe with exactly the training-set columns, in training-set order.
    """
    # De-duplicate while keeping the training order.
    train_cols = list(dict.fromkeys(df_train_col_list))
    present = [col for col in train_cols if col in df.columns]
    missing = [col for col in train_cols if col not in df.columns]

    # Zero padding for features never seen in this dataset; shares df's index so rows line up.
    padding = pd.DataFrame(0, index=df.index, columns=missing).astype('float16')
    aligned = pd.concat([df[present], padding], axis=1)
    return aligned[train_cols]


def rename_and_16_convert(dataset, prefix):
    """
    Shrink the feature columns to float16 and give them unique, prefixed names.

    Every column is multiplied by 1000, cast to float16 and renamed to
    '<prefix>_<original name>'. The input frame is left untouched (the previous
    implementation scaled the caller's frame in place).

    Parameters
    ----------
    dataset: dataframe
        A dataframe whose columns are all features to process (no Device_ID column).
    prefix : str
        A string prepended to every feature name.

    Returns
    -------
    dataframe
        A new dataframe with the scaled, float16, renamed feature columns.
    """
    original_names = list(dataset.columns)
    # Build a new frame instead of `*=` on the argument so the caller's data is not modified.
    # The x1000 scaling is applied before the float16 cast, as in the original code.
    scaled = (dataset * 1000).astype('float16')
    scaled.columns = [f'{prefix}_{name}' for name in original_names]
    return scaled

Writing helper_func.py


In [36]:
%%writefile model.py

import json
import xgboost
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from helper_func import *
import os


class model:
    def __init__(self):
        '''
        Create an untrained XGBoost regressor with the training hyper-parameters and
        set the feature-engineering defaults. Call load() before predict().
        '''
        self.model  = xgboost.XGBRegressor(seed=0, subsample=0.8, colsample_bytree=0.8, learning_rate= 0.1, n_estimators= 150, max_depth=6, objective ='binary:logistic',eval_metric =roc_auc_score )
        # Width of each time-of-day bin, in hours.
        self.hours_duration = 3
        # Filled by load(): names of the feature columns the model expects.
        self.best_features = []


    def load(self, dir_path):
        '''
        Load the trained model and the list of selected features from the submission folder.

        :param dir_path: path of the folder that contains 'XGB_model.json' and 'best_features.json'
        '''
        self.model.load_model(os.path.join(dir_path, 'XGB_model.json'))

        features_path = os.path.join(dir_path, 'best_features.json')
        with open(features_path, "r") as features_file:
            self.best_features = json.load(features_file)

    def predict(self, X):
        '''
        Build the features for one Device_ID and return the model's prediction for it.

        The three feature groups (Domain_Name proportions, domain-class proportions and
        time-of-day proportions) are computed with the helper functions, aligned to the
        saved feature list and passed to the loaded model.

        :param X: is DataFrame with the columns - 'Datetime', 'URL', 'Domain_Name','Domain_cls1', 'Domain_cls2', 'Domain_cls3', 'Domain_cls4'.
        :return: the first (and only) element of the model's prediction array, i.e. the score for class 1.
        '''

        domain_features = rename_and_16_convert(relative_domain(X[['Domain_Name']]), 'Domain')

        class_columns = ['Domain_cls1', 'Domain_cls2', 'Domain_cls3', 'Domain_cls4']
        class_features = rename_and_16_convert(cls_proportion(X[class_columns]), 'cls')

        time_features = rename_and_16_convert(
            avg_relative_entrances_device_id(X[['Datetime']], self.hours_duration), 'ts')

        # Each group is a single-row frame, so a column-wise concat gives one feature row.
        features = pd.concat([domain_features, class_features, time_features], axis=1)

        # Drop features the model was not trained on and zero-fill the ones missing here.
        features = corresponding_columns_training_set(self.best_features, features)

        scores = self.model.predict(features[self.best_features])

        return scores[0]

Writing model.py


In [37]:
%%writefile metadata
command: python3 $program/model.py $input $output

Writing metadata


zip the files to submit

In [38]:
# Bundle the code, the metadata file and the saved model artefacts into the submission archive.
!zip -r submission.zip model.py helper_func.py metadata XGB_model.json best_features.json

  adding: model.py (deflated 63%)
  adding: helper_func.py (deflated 67%)
  adding: metadata (stored 0%)
  adding: XGB_model.json (deflated 73%)
  adding: best_features.json (deflated 78%)


You can use this notebook to save your file, download it, and submit it on CodaLab.

To download the zip file, use the file manager panel.
Use View > Table of contents to show the sidebar then click the Files tab. Right-click the file and select Download.

###Example - Prediction with the submitted model

In this section, we demonstrate how to predict with the submitted model for a single Device_ID.

####Download and read one Device_ID for prediction

In [39]:
# Download the demo CSV holding the rows of a single Device_ID.
# NOTE(review): the output recorded for this cell shows gdown failing to retrieve the file
# (permission / access-count error); the next cell needs /content/demo.csv — confirm the share link.
!gdown --fuzzy "https://drive.google.com/file/d/1v6ibfs73vzgb07c6YsAe44Mp067SYV_x/view?usp=sharing"

  soup = bs4.BeautifulSoup(line, features="html.parser")
Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1v6ibfs73vzgb07c6YsAe44Mp067SYV_x

but Gdown can't. Please check connections and permissions.


In [40]:
# Load the demo rows; this requires the download in the previous cell to have succeeded.
X = pd.read_csv('/content/demo.csv')

In [41]:
# Preview the raw columns that the submitted model receives (plus Device_ID and the saved CSV index).
X.head()

Unnamed: 0,Device_ID,Datetime,URL,Domain_Name,Domain_cls1,Domain_cls2,Domain_cls3,Domain_cls4
0,69811,2023-04-23 01:23:57+03:00,1404944,2133977,368,0,0,0
1,69811,2023-04-23 01:23:57+03:00,408178,107342,332,0,0,0
2,69811,2023-04-23 01:23:57+03:00,173328,970134,669,0,0,0
3,69811,2023-04-23 01:23:57+03:00,270087,2368671,755,799,0,0
4,69811,2023-04-23 01:24:54+03:00,270087,2368671,755,799,0,0


####Create object model, load and predict

Unzip the submission files

In [42]:
# Extract the archive into the working directory (-o overwrites existing files without prompting).
!unzip -o '/content/submission.zip'

Archive:  /content/submission.zip
  inflating: model.py                
  inflating: helper_func.py          
 extracting: metadata                
  inflating: XGB_model.json          
  inflating: best_features.json      


Create model object, load and predict

In [43]:
# Import only the submission class: a wildcard import would also pull in helper_func's
# relative_domain / cls_proportion / ... and overwrite the SQL-based notebook functions
# that share those names.
from model import model

# Create the model and load its artefacts from the current working directory.
M = model()
M.load('')
Y_test=[]
unique_Device_IDs = list(set(X.Device_ID))
for device_id in unique_Device_IDs:
    # Rows of one device, without the ID column; drop() returns a new frame rather than
    # modifying a slice of X in place.
    X_test = X.loc[X['Device_ID'] == device_id].drop(columns='Device_ID')
    Y_test.append(M.predict(X_test))

print(f'Prediction: {Y_test[0]}')

Prediction: 0.006801425013691187
