# Libraries**

In [1]:
import gc
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

from colorama import Fore, Style, init
from pprint import pprint

# 🚫 Suppressing warnings 🚫
import warnings
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.dates as mdates
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

In [3]:
from scipy.stats import entropy
from collections import Counter
import polars as pl

In [4]:
from joblib import Parallel, delayed
from time import sleep, time
from multiprocessing import cpu_count

In [5]:
from tqdm.auto import tqdm 
from concurrent.futures import ThreadPoolExecutor

In [6]:
import os

# test data 

In [7]:
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
len(test)

20

# Reading the train tabular data

In [8]:
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
len(train)

3960

In [9]:
# Group by 'sii' and count occurrences, including NaN as a category
sii_counts = train['sii'].value_counts(dropna=False)

# Display the counts
print(sii_counts)

sii
0.0    1594
NaN    1224
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64


In [10]:
train[(train['sii']==3)].head(5)

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
191,0ac521c8,Spring,14,0,Fall,68.0,,,,,...,5.0,5.0,5.0,81.0,Summer,50.0,69.0,Spring,2.0,3.0
255,0e951124,Summer,14,0,,,Summer,20.371155,61.5,109.6,...,5.0,4.0,3.0,81.0,Summer,35.0,50.0,Summer,3.0,3.0
306,11f7253c,Fall,16,0,,,Fall,46.102914,67.5,298.8,...,5.0,5.0,5.0,89.0,Fall,67.0,91.0,Fall,3.0,3.0
370,157b271f,Spring,15,1,Summer,48.0,Spring,20.900479,63.0,118.0,...,5.0,5.0,5.0,92.0,Spring,53.0,73.0,Spring,3.0,3.0
405,1824417f,Summer,11,0,Spring,60.0,Winter,15.166845,61.5,81.6,...,5.0,4.0,4.0,80.0,Spring,60.0,82.0,Summer,3.0,3.0


# Reading the train actigraphy data (just loading the mean values of each column instead of all the rows for each id)

In [11]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

In [12]:
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
len(train_ts)

  0%|          | 0/996 [00:00<?, ?it/s]

996

In [13]:
train_ts.head(5)

Unnamed: 0,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,...,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95,id
0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,...,5.314874,89.422226,0.0,2626.199951,4187.0,86395000000000.0,7.0,2.0,57.0,0745c390
1,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,...,3.966906,89.08033,1.0,2628.199951,4146.0,86395000000000.0,7.0,2.0,243.0,eaab7a96
2,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,...,5.066334,86.987267,0.0,2618.199951,4183.0,86365000000000.0,7.0,3.0,134.0,8ec2cc63
3,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,...,6.134459,89.976074,0.0,2502.0,6000.0,86395000000000.0,7.0,4.0,72.0,b2987a65
4,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,...,2.774382,89.300034,0.0,1046.800049,4199.0,86015000000000.0,7.0,4.0,76.0,7b8842c3


# Counts based on the the different sets 
* How many data we have to train?
* How many ids have both the actigraphy data and the tabular data?
* Potentially define our train and test sets size, and a seed for both

In [14]:
print("the ids we have in the training set ",len(train),"the ids we have with actigraphy data ",len(train_ts)," a percentage of ",round(len(train_ts)/len(train),2))

the ids we have in the training set  3960 the ids we have with actigraphy data  996  a percentage of  0.25


In [15]:
merged_data = train.merge(train_ts[['id']], on='id', how='inner')
len(merged_data)

996

In [16]:
# Group by 'sii' and count occurrences, including NaN as a category
sii_counts = merged_data['sii'].value_counts(dropna=False)

# Display the counts
print(sii_counts)

sii
0.0    583
1.0    266
2.0    137
3.0     10
Name: count, dtype: int64


# Creates a column to identify if it has or not actigraphy data
# saves the data in a new set, so that each time we dont have to load it.

In [17]:
train_ts['Has_actigraphy_data']='Yes'
merged_data = train.merge(train_ts[['id','Has_actigraphy_data']], on='id', how='left') #left join instead of inner
len(merged_data)

3960

In [18]:
merged_data['Has_actigraphy_data'].fillna('No', inplace=True)
merged_data.groupby('Has_actigraphy_data').size()

Has_actigraphy_data
No     2964
Yes     996
dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
train_data, test_data = train_test_split(
    merged_data[merged_data['Has_actigraphy_data'] == 'Yes'],
    test_size=0.2, 
    random_state=1234
)

merged_data['Train_Test_Label'] = 'isnotintrainset'

merged_data.loc[merged_data['Has_actigraphy_data'] == 'Yes', 'Train_Test_Label'] = \
    merged_data.loc[merged_data['Has_actigraphy_data'] == 'Yes',].apply(
        lambda row: 'train' if row.name in train_data.index else 'test', axis=1)

In [21]:
merged_data.groupby('Train_Test_Label').size()

Train_Test_Label
isnotintrainset    2964
test                200
train               796
dtype: int64

In [22]:
print(f"Training data size: {len(train_data)}")
print(f"Testing data size: {len(test_data)}")

Training data size: 796
Testing data size: 200


In [23]:
merged_data.to_csv('Dataset_problematic_internet_usage.csv')