In [2]:
from pathlib import Path
import pyarrow.parquet as pq
import gc
import pandas as pd
import numpy as np

In [3]:
%%time
# Directory holding the competition files, relative to the notebook's working directory.
ORIGIN = Path('../child-mind-institute-detect-sleep-states/')

# Load the full series table through pyarrow and convert it to pandas; this is the
# slow step of the cell (the recorded run took about 1min 23s of wall time).
df_train = pq.read_table(ORIGIN / 'train_series.parquet').to_pandas()
# Event table read from CSV.
df_train_events = pd.read_csv(ORIGIN / 'train_events.csv')
# Test series and sample submission are not needed here, so their loads stay disabled.
# df_test = pq.read_table(ORIGIN / 'test_series.parquet').to_pandas()
# df_sample = pd.read_csv(ORIGIN / 'sample_submission.csv')

CPU times: user 50 s, sys: 33.3 s, total: 1min 23s
Wall time: 1min 23s


In [None]:
# df_train_events = pd.read_csv(ORIGIN / 'train_events.csv')

In [3]:
# Show column dtypes and non-null counts of the events table.
df_train_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14508 entries, 0 to 14507
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   series_id  14508 non-null  object 
 1   night      14508 non-null  int64  
 2   event      14508 non-null  object 
 3   step       9585 non-null   float64
 4   timestamp  9585 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 566.8+ KB


In [4]:
# Show column dtypes and non-null counts of the frame loaded from train_series.parquet.
# df_train must have been loaded in the current kernel, otherwise this raises NameError.
df_train.info()

NameError: name 'df_train' is not defined

In [None]:
# Count missing values per column in the events table.
df_train_events.isnull().sum()

In [None]:
# Count missing values per column in the series table.
df_train.isnull().sum()

In [5]:
# Drop every event row that has at least one missing value and rebind the name to the
# result. Row labels are not reset, so the index keeps gaps after this step.
df_train_events = df_train_events.dropna()
# Confirm that no nulls remain and see the new row count.
df_train_events.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9585 entries, 0 to 14505
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   series_id  9585 non-null   object 
 1   night      9585 non-null   int64  
 2   event      9585 non-null   object 
 3   step       9585 non-null   float64
 4   timestamp  9585 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 449.3+ KB


In [7]:
def _narrowest_int_dtype(lo, hi, candidates):
    """Return the first dtype in ``candidates`` whose inclusive range holds [lo, hi].

    ``candidates`` must be ordered from narrowest to widest. Returns ``None`` when
    no candidate can hold the range.
    """
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        if bounds.min <= lo and hi <= bounds.max:
            return np.dtype(candidate)
    return None


def _narrowest_float_dtype(series):
    """Return the narrowest float dtype that is safe for ``series``.

    float32 is chosen when the value range fits. float16 is chosen only when
    converting to it and back reproduces every value exactly, because its
    precision is too low to hold most integer-valued or measured data.
    The current dtype is returned when nothing narrower qualifies.
    """
    current = series.dtype
    lo, hi = series.min(), series.max()
    if pd.isna(lo) or pd.isna(hi):
        # Empty or all-NaN column: there is no range to measure, leave it alone.
        return current
    lo, hi = float(lo), float(hi)
    for candidate in (np.float16, np.float32):
        target = np.dtype(candidate)
        if target.itemsize >= current.itemsize:
            continue  # never widen a column
        bounds = np.finfo(candidate)
        if not (float(bounds.min) <= lo and hi <= float(bounds.max)):
            continue
        if candidate is np.float16:
            round_trip = series.astype(candidate).astype(current)
            if not round_trip.equals(series):
                continue  # float16 would change at least one value
        return target
    return current


def reduce_memory_usage(data):
    """Reduce the memory footprint of a DataFrame by narrowing column dtypes.

    The frame is modified in place and also returned, so both
    ``reduce_memory_usage(df)`` and ``df = reduce_memory_usage(df)`` work.

    Per column:
      * signed / unsigned integers are cast to the narrowest integer type of the
        same signedness whose inclusive range holds the column's min and max;
      * floats are cast to float32 when the range fits, or to float16 only when
        that conversion is lossless;
      * object / string columns named 'timestamp' (case-insensitive) are parsed
        with ``pd.to_datetime``; all other object / string columns become
        ``category``;
      * every other dtype (bool, datetime, timedelta, category, nullable
        extension types) is left untouched, which makes repeated calls safe.

    A column is never widened. Memory usage before and after is printed in MB.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to optimise. Mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = data.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in data.columns:
        series = data[col]
        col_type = series.dtype
        # Extension dtypes (category, nullable ints, ...) have no numpy kind.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in ('i', 'u'):
            if series.empty:
                continue
            candidates = signed_candidates if kind == 'i' else unsigned_candidates
            target = _narrowest_int_dtype(int(series.min()), int(series.max()), candidates)
            if target is not None and target.itemsize < col_type.itemsize:
                data[col] = series.astype(target)
        elif kind == 'f':
            target = _narrowest_float_dtype(series)
            if target != col_type:
                data[col] = series.astype(target)
        elif kind == 'O' or isinstance(col_type, pd.StringDtype):
            # str() keeps non-string column labels (e.g. integers) from raising here.
            if str(col).lower() == 'timestamp':
                # NOTE(review): strings with mixed UTC offsets may not parse to a single
                # datetime64 dtype; confirm against the installed pandas version.
                data[col] = pd.to_datetime(series)
            else:
                data[col] = series.astype('category')
        # Anything else is already compact or not safely convertible: keep as is.

    end_mem = data.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return data

In [8]:
# Narrow the dtypes of the events table. reduce_memory_usage modifies the frame it is
# given and returns it, so the name is rebound to the same (now smaller) frame.
df_train_events = reduce_memory_usage(df_train_events)

Memory usage of dataframe is 0.44 MB
Memory usage after optimization is: 0.23 MB
Decreased by 47.7%


In [9]:
# Add each event time as whole seconds since the Unix epoch (fraction truncated by int()).
df_train_events['timestamp_unix'] = df_train_events['timestamp'].map(
    lambda moment: int(moment.timestamp())
)

In [None]:
# Re-inspect the events table after the dtype changes and the added timestamp_unix column.
df_train_events.info()

In [None]:
# Preview the first rows. head must be called: the bare attribute only shows the bound
# method's repr instead of a five-row preview.
df_train_events.head()

In [None]:
import matplotlib.pyplot as plt

# Assuming df_train and df_train_events are already loaded and cleaned

# Select the first series ID from df_train
first_series_id = df_train['series_id'].iloc[0]
df_first_series = df_train[df_train['series_id'] == first_series_id]

# Parse the timestamps for plotting only (df_first_series itself is left unchanged) so
# the x-axis is a time axis that the datetime event annotations can be placed on.
# NOTE(review): assumes these timestamps parse to one datetime dtype - confirm.
first_series_time = pd.to_datetime(df_first_series['timestamp'])

# Plot enmo values over time
plt.figure(figsize=(12, 6))
plt.plot(first_series_time, df_first_series['enmo'], label='enmo', color='blue')

# Annotate awake and sleep time points from df_train_events.
# Events are selected through the 'series_id' column: the frame's index holds row
# labels, so comparing the index with a series id never matches anything.
first_series_events = df_train_events[df_train_events['series_id'] == first_series_id]
for _, row in first_series_events.iterrows():
    label = f"{row['event']} - {row['timestamp']}"
    plt.annotate(label, (row['timestamp'], 0), textcoords="offset points", xytext=(0, 10), ha='center', fontsize=8, color='red')

# Customize plot
plt.title(f'ENMO Time Series for Series ID: {first_series_id}')
plt.xlabel('Timestamp')
plt.ylabel('ENMO Values')
plt.legend()
plt.grid(True)
plt.show()


In [13]:
# Take the series id found in the very first row of df_train, then keep every row
# of df_train that belongs to that series.
first_series_id = df_train['series_id'].iat[0]
df_first_series = df_train.loc[df_train['series_id'] == first_series_id]

NameError: name 'df_train' is not defined

In [None]:
# Summarise the first series. info must be called: the bare attribute only shows the
# bound method's repr instead of the dtype / non-null summary.
df_first_series.info()

In [None]:
# Show the events recorded for the first series.
df_train_events.loc[df_train_events['series_id'] == first_series_id]

In [10]:
import matplotlib.pyplot as plt

def plot_enmo_single_day(series_id, target_date, series_df=None, events_df=None):
    """Plot one day of 'enmo' readings for a series and annotate that day's events.

    Parameters
    ----------
    series_id
        Value of the 'series_id' column identifying the series to plot.
    target_date
        Day to plot; anything ``pd.to_datetime`` accepts (e.g. '2018-09-04').
        Events are matched by checking that ``str(target_date)`` occurs in the
        string form of the event timestamp.
    series_df : pandas.DataFrame, optional
        Frame with 'series_id', 'timestamp' and 'enmo' columns. Defaults to the
        notebook-level ``df_train``.
    events_df : pandas.DataFrame, optional
        Frame with 'series_id', 'event' and 'timestamp' columns. Defaults to the
        notebook-level ``df_train_events``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    if series_df is None:
        series_df = df_train
    if events_df is None:
        events_df = df_train_events

    target_day = pd.to_datetime(target_date).date()

    # Filter to the requested series first and parse only its timestamps. The input
    # frame is not modified, so calling this function has no effect on other cells
    # and does not re-parse the whole table on every call.
    df_series = series_df[series_df['series_id'] == series_id]
    series_time = pd.to_datetime(df_series['timestamp'])

    # Select rows whose timestamp falls on the target date
    on_target_day = series_time.dt.date == target_day
    df_single_day = df_series[on_target_day]

    # Plot 'enmo' values
    plt.figure(figsize=(10, 6))
    plt.plot(series_time[on_target_day], df_single_day['enmo'], label='enmo', color='blue')

    # Annotate awake and sleep time points. Events are selected through the
    # 'series_id' column: the frame's index holds row labels, not series ids.
    df_events_series = events_df[events_df['series_id'] == series_id]
    for _, row in df_events_series.iterrows():
        if str(target_date) in str(row['timestamp']):
            label = f"{row['event']} - {row['timestamp']}"
            plt.annotate(label, (row['timestamp'], 0), textcoords="offset points", xytext=(0, 10), ha='center', fontsize=8, color='red')

    # Customize plot
    plt.title(f'ENMO Time Series for Series ID: {series_id} - Date: {target_date}')
    plt.xlabel('Timestamp')
    plt.ylabel('ENMO Values')
    plt.legend()
    plt.grid(True)
    plt.show()

# Example: plot the series_id found in the first row of df_train for one hard-coded
# date (replace both arguments with your own values).
plot_enmo_single_day(series_id=df_train['series_id'].iloc[0], target_date='2018-09-04')


NameError: name 'df_train' is not defined

In [12]:
# Work on an independent copy so later column assignments do not touch df_first_series.
df = df_first_series.copy()

NameError: name 'df_first_series' is not defined

In [None]:
# Show column dtypes and non-null counts of the working copy.
df.info()

In [None]:
# Parse the 'timestamp' column with pandas so that datetime operations can be applied to it.
df['timestamp'] = pd.to_datetime(df['timestamp'])


In [None]:
# Show the column dtypes again to verify the dtype of 'timestamp'.
df.info()

In [None]:
# (rows, columns) of the working copy.
df.shape

In [None]:
# Keep only the rows of df whose timestamp falls on 2018-08-15, as an independent copy.
df_single_day = df.loc[df['timestamp'].dt.date == pd.Timestamp('2018-08-15').date()].copy()


In [None]:
# Preview the first rows. head must be called: the bare attribute only shows the bound
# method's repr instead of a five-row preview.
df_single_day.head()

In [11]:
# One figure per column of df_single_day. Each gets a title, axis labels and a legend
# so the label= passed to plot() is actually displayed and the figure can be read alone.
plt.figure(figsize=(10, 6))
plt.plot(df_single_day['timestamp'], df_single_day['enmo'], label='enmo', color='blue')
plt.title('enmo over time')
plt.xlabel('Timestamp')
plt.ylabel('enmo')
plt.legend()
plt.grid(True)

plt.figure(figsize=(10, 6))
plt.plot(df_single_day['timestamp'], df_single_day['anglez'], label='anglez', color='magenta')
plt.title('anglez over time')
plt.xlabel('Timestamp')
plt.ylabel('anglez')
plt.legend()
plt.grid(True)

# Render both figures and suppress the Line2D repr of the last expression.
plt.show()

NameError: name 'df_single_day' is not defined

<Figure size 1000x600 with 0 Axes>

# Subset: sample a few series and save them as smaller training files

In [9]:
import random


In [7]:
# Distinct series ids that appear in the events table, as a plain Python list.
series_list = df_train_events['series_id'].unique().tolist()

In [10]:
# Number of series to draw for the training subset.
NBR_TRAINING_SERIES = 20
# Fixed seed so the same subset is drawn on every Restart & Run All.
RANDOM_SEED = 42
full_series = series_list
# random.sample raises ValueError when asked for more items than exist, so cap the
# sample size at the number of available series.
sample_size = min(NBR_TRAINING_SERIES, len(full_series))
# A dedicated Random instance keeps the draw independent of the global random state.
selected_series = random.Random(RANDOM_SEED).sample(full_series, sample_size)

In [11]:
# selected_series_ids = df_train['series_id'].unique()[:20]

# Restrict both tables to the sampled series: rows of df_train and rows of
# df_train_events whose 'series_id' is one of selected_series.
subset_df_train = df_train.loc[df_train['series_id'].isin(selected_series)]
subset_df_train_events = df_train_events.loc[df_train_events['series_id'].isin(selected_series)]


In [12]:
# Write both subset tables as CSV files into the ORIGIN directory, without the index
# column. Existing files with these names are overwritten.
subset_df_train.to_csv(ORIGIN / 'subset_train_series.csv', index=False)
subset_df_train_events.to_csv(ORIGIN / 'subset_train_events.csv', index=False)