# Libraries

In [1]:
import gc
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

from colorama import Fore, Style, init
from pprint import pprint

# 🚫 Suppressing warnings 🚫
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import cohen_kappa_score, make_scorer, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [3]:
import os
from tqdm.auto import tqdm 
from concurrent.futures import ThreadPoolExecutor
from joblib import Parallel, delayed
from time import sleep, time
from multiprocessing import cpu_count
import polars as pl
from sklearn.preprocessing import MinMaxScaler
import concurrent.futures

from datetime import datetime, timezone, timedelta

# Load the data

In [4]:
df_test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
len(df_test)

20

In [5]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

In [6]:
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")
len(test_ts)

  0%|          | 0/2 [00:00<?, ?it/s]

2

In [7]:
test_ts['id'].unique()

array(['00115b9f', '001f3379'], dtype=object)

In [8]:
df = pd.read_csv('/kaggle/input/colombian-frenchteam-problematicinternetusage/Dataset_problematic_internet_usage.csv')
len(df)

3960

In [9]:
df.groupby('Train_Test_Label').size()

Train_Test_Label
isnotintrainset    2964
test                200
train               796
dtype: int64

In [10]:
df = df[(df['Train_Test_Label']=='test') | (df['Train_Test_Label']=='train') ]
len(df)

996

In [11]:
df[df['sii']==3].head(3)

Unnamed: 0.1,Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,...,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,Has_actigraphy_data,Train_Test_Label
894,894,35fa2ebf,Summer,13,0,Fall,70.0,Fall,20.404192,61.0,...,5.0,89.0,Fall,37.0,53.0,Summer,3.0,3.0,Yes,train
1502,1502,5e55e9fd,Spring,14,1,,,Spring,18.934295,60.75,...,5.0,81.0,Spring,44.0,62.0,Spring,3.0,3.0,Yes,train
2002,2002,7f44236f,Fall,13,0,Winter,70.0,Winter,25.370016,67.0,...,5.0,87.0,Winter,47.0,66.0,Fall,0.0,3.0,Yes,train


In [12]:
df[df['sii']==0].head(3)

Unnamed: 0.1,Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,...,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,Has_actigraphy_data,Train_Test_Label
22,22,01085eb3,Fall,12,0,Winter,58.0,Fall,34.187282,60.5,...,0.0,30.0,Fall,53.0,73.0,Fall,0.0,0.0,Yes,train
25,25,012cadd8,Spring,9,0,Winter,60.0,Fall,17.089151,53.2,...,0.0,9.0,Summer,27.0,40.0,Spring,0.0,0.0,Yes,train
26,26,012e3869,Summer,6,0,Winter,60.0,Summer,,,...,0.0,0.0,,,,Summer,0.0,0.0,Yes,train


# Plot functions 2

In [13]:
kid_id = '71ee31f8'

In [14]:
def feat_eng(df):

    df['time_of_day_hours'] = (df['time_of_day'] / 1e9 / 3600)  # nanoseconds to hours
    df['time_of_day_half_hours'] = (df['time_of_day'] / 1e9 / 1800)  # nanoseconds to half-hours
    df['time_of_day_half_half_hours'] = (df['time_of_day'] / 1e9 / 900)  # nanoseconds to 15 minutes interval
    df['time_of_day_fivemin_hours'] = (df['time_of_day'] / 1e9 / 300)  # nanoseconds to 5 minutes interval
    df['day_time'] = df['relative_date_PCIAT'] + (df['time_of_day_hours'] / 24)
    
    # Day period assignment
    day_start_hour = 8
    day_end_hour = 21
    df['day_period'] = np.where(
            (df['time_of_day_hours'] >= day_start_hour) &
            (df['time_of_day_hours'] < day_end_hour),
            'day', 'night'
        )
    
    # Initialize the 'which_day' column and day change detection
    df['which_day'] = 0
    day_change = (
            (df['weekday'] != df['weekday'].shift(1)) |
            (df['hour'] < df['hour'].shift(1)) |
            ((df['hour'] == df['hour'].shift(1)) & (df['minute'] < df['minute'].shift(1))) |
            ((df['hour'] == df['hour'].shift(1)) & (df['minute'] == df['minute'].shift(1)) & (df['second'] < df['second'].shift(1)))
        )
    df['which_day'] = day_change.cumsum() + 1
    df['day_period_b'] = np.where(df['day_period'] == 'day', 1, 0)
    df['time_of_day'] = pd.to_timedelta(df['time_of_day'], unit='ns')
    base_date = pd.to_datetime('2024-01-01')
    df['date'] = base_date + pd.to_timedelta(df['which_day'] - 1, unit='D')
    df['timestamp'] = df['date'] + df['time_of_day']
    df['timestamp'] = df['timestamp'].apply(lambda t: t.tz_localize(None))
    df['timestamp_2'] = pd.to_datetime(df['timestamp']).apply(lambda t: t.tz_localize(None))
    df.sort_values(['timestamp_2'], inplace=True)
    df.set_index('timestamp_2', inplace=True)
    
    df["anglez"] = df["anglez"].astype(np.float32)
    df["anglezdiffabs"] = df["anglez"].diff().abs().astype(np.float32)
        
    for col in ['anglezdiffabs']:
            
        # periods in seconds        
        periods = [60] 
            
        for n in periods:
                
            rol_args = {'window':f'{n+5}s', 'min_periods':10, 'center':True}
                
            for agg in ['median']:
                df[f'{col}_{agg}_{n}'] = df[col].rolling(**rol_args).agg(agg).astype(np.float32).values
                gc.collect()
                
            gc.collect()
    
    df.reset_index(inplace=True)
    df.dropna(inplace=True)
    df['large_enmo'] = df['enmo'] > 0.1509000062942505
    df['anglezdiffabs_median_60_norm'] = (df['anglezdiffabs_median_60'] - np.min(df['anglezdiffabs_median_60'])) / (max(df['anglezdiffabs_median_60']) - min(df['anglezdiffabs_median_60']))

    return df

In [15]:
def feat_eng_by_id(idx):
    
    from warnings import simplefilter 
    simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
    
    df = (
        pl.scan_parquet(f'/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/id={idx}/part-0.parquet')
        .with_columns(
            (pl.col("time_of_day").cast(pl.Int64) / 1_000_000_000).alias("total_seconds")
        )
        .with_columns(
            [
                (pl.col("total_seconds") // 3600).alias("hour"),
                ((pl.col("total_seconds") % 3600) // 60).alias("minute"),
                (pl.col("total_seconds") % 60).alias("second"),
            ]
        )
        .collect()
        .to_pandas()
    )

    df = feat_eng(df)
    
    return df



In [16]:
from tqdm.auto import tqdm 
from joblib import Parallel, delayed
from time import sleep, time
from multiprocessing import cpu_count
import gc

In [17]:
from pathlib import Path
working_folder = Path("/kaggle/working/")
images_folder = working_folder/"imagesandannotations"
images_folder.mkdir()

window_properties_batch_file = os.path.join(images_folder, "window_properties.json")
all_events_batch_file = os.path.join(images_folder, "all_events.json")

In [18]:
series_ids = df['id'].unique()
len(series_ids)

996

In [19]:
series_ids = series_ids[760:950]
#series_ids = series_ids[0:2]
series_ids

array(['c3dde859', 'c446e1c5', 'c45068a1', 'c4c2076b', 'c5462e87',
       'c55187b4', 'c5cfaeae', 'c6313b18', 'c6575d38', 'c6cc7467',
       'c6d4b453', 'c6da0d4b', 'c6ead822', 'c731458f', 'c73dd2e4',
       'c73f149c', 'c79039af', 'c7d9e964', 'c8270746', 'c829500d',
       'c8cc2e1b', 'c8fbbc8a', 'c9600298', 'c96b9609', 'c9879365',
       'c9902f48', 'c9dabad5', 'c9e419e7', 'ca3204b0', 'ca33a5e7',
       'cadd4ae5', 'cb2752bc', 'cb3b7c3c', 'cb73fed9', 'cba39753',
       'cbcf8cf2', 'cd144127', 'cd1fff5a', 'cd39e576', 'cd68643b',
       'cd703872', 'cd89d5f1', 'cd8f3c61', 'cda90e9e', 'ce379a6a',
       'ce6eeadf', 'cefdb7fe', 'cf1b9d44', 'cfcf9dc9', 'cfe4cf40',
       'd05f5d71', 'd08806a2', 'd0ac4f1c', 'd19e1025', 'd28f0d44',
       'd2d9b474', 'd3da09e8', 'd445fb74', 'd4d2f272', 'd4d9f7f9',
       'd506f4c7', 'd57f6c9e', 'd5a8d9b0', 'd5b089a4', 'd5fde276',
       'd6251195', 'd661ddf7', 'd6776176', 'd6cca65e', 'd74e4d7c',
       'd77ac332', 'd791703f', 'd8037389', 'd859115e', 'd87e2d

In [20]:
import json
import gc
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
import numpy as np  # Ensure np is imported for handling int64 types

def process_and_plot(series_ids, images_folder):
    
    # Initialize main lists to hold all data
    window_properties = []
    all_events = []
    
    # Batch tracking lists
    batch_size = 50  # Adjust batch size to your memory constraints
    batch_window_properties = []
    batch_all_events = []
    
    # Files for saving batches of data
    window_properties_batch_file = images_folder / "window_properties.json"
    all_events_batch_file = images_folder / "all_events.json"

    # Create files if they don't exist
    window_properties_batch_file.touch(exist_ok=True)
    all_events_batch_file.touch(exist_ok=True)

    # Process the series_ids
    for idx in tqdm(series_ids):
        
        filtered_data = feat_eng_by_id(idx)

        series = filtered_data.reset_index(drop=True)
        series['color'] = ["blue" if large_enmo else "green" for large_enmo in series['large_enmo']]
        series['timestamp'] = pd.to_datetime(series['timestamp'])
        series['timestamp'] = series['timestamp'].apply(lambda x: x if x.tzinfo is not None else x.tz_localize('UTC'))
        series['timestamp_utc'] = series['timestamp'].map(lambda timestamp: timestamp.astimezone(timezone.utc))
        series['anglez_radians'] = (np.pi / 180) * series['anglez']
        series['cos_anglez'] = np.cos(series['anglez_radians'])
        series['enmo'] = np.clip(series['enmo'], 0, 1)
        min_date_utc = series['timestamp_utc'].dt.date.min()
        max_date_utc = series['timestamp_utc'].dt.date.max()
        
        series_24_hour_windows = {}
        upper_bound = datetime(year=min_date_utc.year, month=min_date_utc.month, day=min_date_utc.day, hour=20, minute=30, tzinfo=timezone.utc)
        lower_bound = upper_bound + timedelta(hours=-24) # 8:30pm UTC on the previous day.
        while lower_bound < series['timestamp_utc'].max():
            window_df = series.loc[(series['timestamp_utc'] >= lower_bound) & (series['timestamp_utc'] < upper_bound)].reset_index(drop=True)
            if len(window_df) > 0:
                series_24_hour_windows[upper_bound.isoformat()[:-6]] = window_df
            upper_bound += timedelta(hours=24)
            lower_bound += timedelta(hours=24)
        
        windows = list(series_24_hour_windows.keys())
        num_steps_cumulative = 0
        
        for window_idx, window in enumerate(windows):
            
             if (series_24_hour_windows[window]['non-wear_flag'].mean()<0.5) & (len(series_24_hour_windows[window]) == 17280): 

                day = series_24_hour_windows[window]['which_day'].iloc[0]
                 
                fig = plt.figure(figsize=(14.4, 4))  # (width, height) in inches
                #plt.plot(series_24_hour_windows[window]['timestamp_utc'], series_24_hour_windows[window]['cos_anglez'], color="red")
                plt.plot(series_24_hour_windows[window]['timestamp_utc'],
                             series_24_hour_windows[window]['anglezdiffabs_median_60_norm'],
                             color="red")
                plt.scatter(
                        series_24_hour_windows[window]['timestamp_utc'], 
                        series_24_hour_windows[window]['enmo'], 
                        color=series_24_hour_windows[window]['color'], 
                        s=1
                    )
                plt.scatter(series_24_hour_windows[window]['timestamp_utc'], series_24_hour_windows[window]['non-wear_flag'], label='non_wear_flag', color='red', alpha=0.7, s=10)
        
                plt.fill_between(series_24_hour_windows[window]['timestamp_utc'],
                                 0, max(1,series_24_hour_windows[window]['anglezdiffabs_median_60_norm'].max()), 
                                 where=(series_24_hour_windows[window]['non-wear_flag'] == 1), 
                                 color='red', alpha=0.1, label='Day Period')
                 
                plt.fill_between(series_24_hour_windows[window]['timestamp_utc'],
                                 0, max(1,series_24_hour_windows[window]['anglezdiffabs_median_60_norm'].max()), 
                                 where=(series_24_hour_windows[window]['day_period_b'] == 1), 
                                 color='blue', alpha=0.1, label='Day Period')
                ax = plt.gca()
                ax.spines['top'].set_visible(False)
                ax.spines['right'].set_visible(False)
                ax.spines['bottom'].set_visible(False)
                ax.spines['left'].set_visible(False)
                ax.set_xticks([])
                ax.set_yticks([])
                plt.margins(0, 0)
                plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
        
                # Save image to images_folder
                image_path = images_folder / f"{idx}_{day}.jpg"
                plt.savefig(image_path, dpi=150, bbox_inches="tight", pad_inches=0)
                plt.close(fig)

                # Store data for batch, safely converting where necessary
                window_properties_item = {
                        'series_id': str(idx),  # Ensure series_id is stored as string
                        'image_name': f"{idx}_{day}.jpg", 
                        'idx_in_series': int(day)  # Convert 'day' to int if it's a number
                    }
                window_properties.append(window_properties_item)
                batch_window_properties.append(window_properties_item)  # Add to batch

                # Add to all_events batch
                sii_label = df.loc[df['id'] == idx, 'sii'].values.item()

                # Ensure sii_label is numeric before converting to int
                if isinstance(sii_label, (int, np.integer)):
                    sii_label = int(sii_label)
                else:
                    sii_label = str(sii_label)  # If it's a string, store it as string

                all_events_item = {'series_id': str(idx), 'image_name': f"{idx}_{day}.jpg", 'label': sii_label}
                all_events.append(all_events_item)
                batch_all_events.append(all_events_item)  # Add to batch

                # Save in batches when batch size is reached
                if len(batch_window_properties) >= batch_size:
                    # Save window_properties batch to JSON file
                    with open(window_properties_batch_file, "a") as f:
                        json.dump(batch_window_properties, f, indent=4)
                    batch_window_properties.clear()  # Clear the batch

                    # Save all_events batch to JSON file
                    with open(all_events_batch_file, "a") as f:
                        json.dump(batch_all_events, f, indent=4)
                    batch_all_events.clear()  # Clear the batch

                gc.collect()  # Clean up memory after each batch

    # After the loop, save any remaining data if the final batch wasn't full
    if batch_window_properties:
        with open(window_properties_batch_file, "a") as f:
            json.dump(batch_window_properties, f, indent=4)
    if batch_all_events:
        with open(all_events_batch_file, "a") as f:
            json.dump(batch_all_events, f, indent=4)



In [21]:
process_and_plot(series_ids,images_folder)

100%|██████████| 190/190 [3:16:56<00:00, 62.19s/it] 


In [22]:
import glob

In [23]:
def load_all_data_from_batches(directory_path, file_prefix):
    all_data = []
    
    # Get all filenames matching the prefix and pattern
    batch_files = glob.glob(f"{directory_path}/{file_prefix}*.json")
    
    for batch_file in batch_files:
        with open(batch_file, 'r') as f:
            try:
                data = json.load(f)
                all_data.extend(data)  # Append data from this batch to the overall list
            except json.JSONDecodeError as e:
                print(f"Error loading {batch_file}: {e}")
    
    return all_data
    

In [24]:
# Path to your JSON files
window_properties_path = '/kaggle/working/imagesandannotations/window_properties.json'
all_events_path = '/kaggle/working/imagesandannotations/all_events.json'

# Function to load and fix the JSON files if needed
def load_json_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read()

    # Remove any invalid '][' and ensure the content is wrapped in square brackets
    fixed_content = content.replace("][", ",")  # Fix '][' by replacing with a comma

    # Wrap content in square brackets if it's not already
    if not fixed_content.startswith('['):
        fixed_content = f"[{fixed_content}"
    if not fixed_content.endswith(']'):
        fixed_content = f"{fixed_content}]"

    # Try to load the corrected content
    try:
        data = json.loads(fixed_content)
        print(f"Successfully loaded {file_path}!")
        return data
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in {file_path}: {e}")
        return None

# Load both files
window_properties_data = load_json_file(window_properties_path)
all_events_data = load_json_file(all_events_path)

# Print a sample from each to confirm the data is loaded correctly
print("Sample from window_properties:")
print(window_properties_data[:3])  # Show the first 3 items as a sample

print("\nSample from all_events:")
print(all_events_data[:3])  # Show the first 3 items as a sample

Successfully loaded /kaggle/working/imagesandannotations/window_properties.json!
Successfully loaded /kaggle/working/imagesandannotations/all_events.json!
Sample from window_properties:
[{'series_id': 'c3dde859', 'image_name': 'c3dde859_2.jpg', 'idx_in_series': 2}, {'series_id': 'c3dde859', 'image_name': 'c3dde859_3.jpg', 'idx_in_series': 3}, {'series_id': 'c3dde859', 'image_name': 'c3dde859_4.jpg', 'idx_in_series': 4}]

Sample from all_events:
[{'series_id': 'c3dde859', 'image_name': 'c3dde859_2.jpg', 'label': '0.0'}, {'series_id': 'c3dde859', 'image_name': 'c3dde859_3.jpg', 'label': '0.0'}, {'series_id': 'c3dde859', 'image_name': 'c3dde859_4.jpg', 'label': '0.0'}]


In [25]:
window_properties_df = pd.DataFrame(window_properties_data)
window_properties_df.head(5)

Unnamed: 0,series_id,image_name,idx_in_series
0,c3dde859,c3dde859_2.jpg,2
1,c3dde859,c3dde859_3.jpg,3
2,c3dde859,c3dde859_4.jpg,4
3,c3dde859,c3dde859_5.jpg,5
4,c3dde859,c3dde859_6.jpg,6


In [26]:
annotations_df = pd.DataFrame(all_events_data)
annotations_df.head(5)

Unnamed: 0,series_id,image_name,label
0,c3dde859,c3dde859_2.jpg,0.0
1,c3dde859,c3dde859_3.jpg,0.0
2,c3dde859,c3dde859_4.jpg,0.0
3,c3dde859,c3dde859_5.jpg,0.0
4,c3dde859,c3dde859_6.jpg,0.0
