In [None]:
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm

import datetime

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# 1. 데이터 확인

## 파일 수 확인

In [None]:
# Collect every entry under the train folder.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to make index-based lookups such as train_csvs[0] reproducible.
train_csvs = sorted(glob.glob('../input/predict-volcanic-eruptions-ingv-oe/train/*'))
len(train_csvs)

In [None]:
# Collect every entry under the test folder.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to keep the processing order stable between runs.
test_csvs = sorted(glob.glob("../input/predict-volcanic-eruptions-ingv-oe/test/*"))
len(test_csvs)

## Train.csv 확인

In [None]:
# Read train.csv into a DataFrame; the bare name on the last line makes the
# notebook render it.
train = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train.csv')
train

## Sample Submission 확인

In [None]:
# Read sample_submission.csv into a DataFrame and render it.
sample_submission = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv')
sample_submission

## Train segment_id 확인

In [None]:
# Show the path of the first entry in train_csvs.
train_csvs[0]

In [None]:
# Load the first train file as a sample to inspect, then render it.
sequence = pd.read_csv(train_csvs[0])
sequence

In [None]:
# Summary statistics for each column of the sample file.
sequence.describe()

## Sensor 값 확인

In [None]:
def show_sensors(df):
    """Plot the columns ``sensor_1`` .. ``sensor_10`` of ``df`` as stacked line charts.

    Creates a 10-row, 1-column matplotlib figure sized 16x10 inches and draws
    the raw values of one sensor column per row, in column-number order.

    Args:
        df: DataFrame that must contain the columns ``sensor_1`` to
            ``sensor_10``; a missing column raises ``KeyError``.

    Returns:
        None. The figure is drawn through the active matplotlib backend.
    """
    fig, axes = plt.subplots(10, 1)
    fig.set_size_inches((16, 10))
    fig.tight_layout()
    # Negative bottom margin stretches the grid downwards so the stacked
    # subplots get more vertical room.
    plt.subplots_adjust(bottom=-0.4)

    for sensor_no, ax in enumerate(axes, start=1):
        ax.plot(df[f'sensor_{sensor_no}'].values)
        ax.set_title('Sensor_' + str(sensor_no))
        ax.set_xlabel('time')

In [None]:
# Plot the ten sensor columns of the sample file as loaded (NaN values left in place).
show_sensors(sequence)

In [None]:
# Same plot with NaN values replaced by 0.
show_sensors(sequence.fillna(0))

# 2. 데이터 분석, EDA
## Train.csv

In [None]:
# Time unit conversion: preview time_to_eruption as timedeltas.
# Vectorized pd.to_timedelta replaces the row-wise apply with a Python lambda
# and yields NaT instead of raising if a value is missing.
# NOTE(review): dividing by 100 presumes the raw values are hundredths of a
# second - confirm against the dataset description.
pd.to_timedelta(train['time_to_eruption'] / 100, unit='s')

In [None]:
# Store time_to_eruption as a timedelta column named 'hhmmss'.
# Vectorized pd.to_timedelta replaces the row-wise apply with a Python lambda
# and yields NaT instead of raising if a value is missing.
# NOTE(review): dividing by 100 presumes the raw values are hundredths of a
# second - confirm against the dataset description.
train['hhmmss'] = pd.to_timedelta(train['time_to_eruption'] / 100, unit='s')
train

In [None]:
# Dividing the timedelta column by a one-hour Timedelta expresses it in hours;
# plot the histogram of those values.
(train['hhmmss']/pd.Timedelta(hours=1)).hist()

In [None]:
# Interactive histogram of the raw time_to_eruption values (nbins=10).
fig = px.histogram(
      train,
      x='time_to_eruption',
      width=800, height=600, nbins=10, title='Time To Eruption Distribution')
fig.show()

In [None]:
# Line plot of time_to_eruption in row order; with only y given, plotly uses
# the DataFrame index as the x axis.
# The title previously repeated the histogram's "Distribution" title, which
# mislabelled this chart.
fig = px.line(
    train,
    y='time_to_eruption',
    width=800,
    height=500,
    title='Time To Eruption by Row Index',
)
fig.show()

## Train, Test Set 분석

### Train 셋

In [None]:
# Scan every train file once and collect structure / missing-value statistics.
sensors = set()          # distinct column counts seen across files
observations = set()     # distinct row counts seen across files
nan_columns = list()     # one entry per (file, column) whose values are all NaN
missed_groups = list()   # per-file lists of all-NaN columns (non-empty lists only)
for_df = list()          # one row per file: [name, flag, NaN fraction per column...]

for item in train_csvs:
    # Keep only the file name (without extension) of the CSV in the train folder.
    name = int(item.split('.')[-2].split('/')[-1])
    frag = pd.read_csv(item)

    # Fraction of missing values in each column, in column order.
    missed_percents = [frag[col].isnull().sum() / len(frag) for col in frag.columns]
    # Columns of this file that contain no value at all.
    missed_group = [col for col in frag.columns if pd.isnull(frag[col]).all()]

    nan_columns.extend(missed_group)
    if missed_group:
        missed_groups.append(missed_group)
    # 1 when at least one column of this file is entirely NaN, otherwise 0.
    at_least_one_missed = 1 if missed_group else 0

    sensors.add(len(frag.columns))
    observations.add(len(frag))
    for_df.append([name, at_least_one_missed] + missed_percents)

print('고유한(Unique) 센서 수: ', sensors)
print('고유한(Unique) 행(row) 수: ', observations)

In [None]:
print('센서값 Null인 센서수:',len(nan_columns))

# Count how many times each column name appears in nan_columns.
absent_sensors = dict()

for item in nan_columns:
    # Default to 0 and add 1 so the first occurrence is counted as well.
    # The previous version initialised a new key to 0, which left every
    # count one lower than the real number of occurrences.
    absent_sensors[item] = absent_sensors.get(item, 0) + 1

In [None]:
# Two-column frame (sensor name, count) built from the absent_sensors dict.
absent_df = pd.DataFrame({
    'Sensor': list(absent_sensors.keys()),
    'Missed sensors': list(absent_sensors.values()),
})

# Bar chart of the count stored for each sensor.
fig = px.bar(
    absent_df,
    x='Sensor',
    y='Missed sensors',
    width=800,
    height=500,
    title='Number of missed sensors in training dataset',
)

fig.show()

### Test Set

In [None]:
# Scan every test file once and collect structure / missing-value statistics.
sensors = set()            # distinct column counts seen across files
observations = set()       # distinct row counts seen across files
nan_columns = list()       # one entry per (file, column) whose values are all NaN
missed_groups = list()     # per-file lists of all-NaN columns (non-empty lists only)
for_test_df = list()       # one row per file: [name, flag, NaN fraction per column...]

for item in test_csvs:
    # Keep only the file name (without extension) of the CSV path.
    name = int(item.split('.')[-2].split('/')[-1])
    frag = pd.read_csv(item)

    # Fraction of missing values in each column, in column order.
    missed_percents = [frag[col].isnull().sum() / len(frag) for col in frag.columns]
    # Columns of this file that contain no value at all.
    missed_group = [col for col in frag.columns if pd.isnull(frag[col]).all()]

    nan_columns.extend(missed_group)
    if missed_group:
        missed_groups.append(missed_group)
    # 1 when at least one column of this file is entirely NaN, otherwise 0.
    at_least_one_missed = 1 if missed_group else 0

    sensors.add(len(frag.columns))
    observations.add(len(frag))
    for_test_df.append([name, at_least_one_missed] + missed_percents)

print('고유한(Unique) 센서 수: ', sensors)
print('고유한(Unique) 행(row) 수: ', observations)

In [None]:
print('값이 비어있는(missed) 센서 수:', len(nan_columns))

# Count how many times each column name appears in nan_columns.
absent_sensors = dict()

for item in nan_columns:
    # Default to 0 and add 1 so the first occurrence is counted as well.
    # The previous version initialised a new key to 0, which left every
    # count one lower than the real number of occurrences.
    absent_sensors[item] = absent_sensors.get(item, 0) + 1

In [None]:
# Two-column frame (sensor name, count) built from the absent_sensors dict.
absent_df = pd.DataFrame({
    'Sensor': list(absent_sensors.keys()),
    'Missed sensors': list(absent_sensors.values()),
})

# Bar chart of the count stored for each sensor.
fig = px.bar(
    absent_df,
    x="Sensor",
    y='Missed sensors',
    width=800,
    height=500,
    title='Number of missed sensors in test dataset',
)

fig.show()