In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the training table (it carries the `time_to_eruption` column used below)
# and the sample submission file from the competition input directory.
train = pd.read_csv(
    filepath_or_buffer="../input/predict-volcanic-eruptions-ingv-oe/train.csv",
)
sample_submission = pd.read_csv(
    filepath_or_buffer="../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv",
)

In [None]:
# Show the training table as the cell's rich output.
train

In [None]:
import plotly.express as px

# Histogram of the `time_to_eruption` column of `train`, split into 100 bins.
histogram_options = dict(
    x="time_to_eruption",
    nbins=100,
    title='Время до извержения',
    width=800,
    height=500,
)
fig = px.histogram(train, **histogram_options)

fig.show()

In [None]:
# Line chart of `time_to_eruption` in the row order of `train`
# (no x column is given, so plotly uses the frame's index).
line_options = dict(
    y="time_to_eruption",
    title='Время до извержения всех вулканов',
    width=800,
    height=500,
)
fig = px.line(train, **line_options)

fig.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the time-to-eruption column.
train.loc[:, 'time_to_eruption'].describe()

In [None]:
# Print a handful of distribution statistics of the time-to-eruption column,
# one per line, in the order listed below.
eruption_times = train['time_to_eruption']
for label, compute in (
    ('Median:', eruption_times.median),
    ('Skew:', eruption_times.skew),
    ('Std:', eruption_times.std),
    ('Kurtosis:', eruption_times.kurtosis),
    ('Mean:', eruption_times.mean),
):
    print(label, compute())

In [None]:
# Show the sample submission table as the cell's rich output.
sample_submission

In [None]:
# Read a single training fragment so its raw columns can be inspected and
# plotted further down in the notebook.
check = pd.read_csv(
    filepath_or_buffer='../input/predict-volcanic-eruptions-ingv-oe/train/2037160701.csv',
)
check

In [None]:
import glob

# Collect the paths of all training fragment files.
# glob.glob yields matches in arbitrary, filesystem-dependent order; sorting
# keeps every list and chart derived from these paths reproducible across runs.
train_frags = sorted(glob.glob("../input/predict-volcanic-eruptions-ingv-oe/train/*"))
len(train_frags)

In [None]:
import os

# Scan every training fragment and collect missing-data statistics.
#   sensors        - distinct column counts seen across fragments
#   observations   - distinct row counts seen across fragments
#   nan_columns    - one entry per (fragment, column) where the column is entirely null
#   missed_groups  - per fragment, the list of entirely-null columns (only if non-empty)
#   for_df         - one row per fragment: [segment id, flag, null share of each column]
sensors = set()
observations = set()
nan_columns = list()
missed_groups = list()
for_df = list()

for item in train_frags:
    # The segment id is the file name without directory and extension; os.path
    # handles any path separator and dots elsewhere in the path.
    name = int(os.path.splitext(os.path.basename(item))[0])
    frag = pd.read_csv(item)

    # One boolean mask per fragment instead of re-testing every column twice.
    null_mask = frag.isnull()
    # Share of null rows per column, in the fragment's own column order (a 0..1 fraction).
    missed_percents = null_mask.mean().tolist()
    # Columns that contain no data at all in this fragment.
    missed_group = [col for col in frag.columns if null_mask[col].all()]
    # 1 when at least one column of the fragment is entirely null, else 0.
    at_least_one_missed = int(bool(missed_group))

    nan_columns.extend(missed_group)
    if missed_group:
        missed_groups.append(missed_group)
    sensors.add(len(frag.columns))
    observations.add(len(frag))
    for_df.append([name, at_least_one_missed] + missed_percents)

In [None]:
# Report the distinct column counts, the distinct row counts, and the total
# number of entirely-null columns found while scanning the fragments.
for caption, value in (
    ('Количество уникальных сенсоров: ', sensors),
    ('Количество уникальных наблюдений: ', observations),
    ('Количество пропущенных сенсоров:', len(nan_columns)),
):
    print(caption, value)

In [None]:
from collections import Counter

# Count in how many fragments each column was entirely null.
# Counter counts the first occurrence as 1, so a column that is missing in a
# single fragment is reported as 1 (not 0).
absent_sensors = Counter(nan_columns)

absent_df = pd.DataFrame(list(absent_sensors.items()), columns=['Сенсор', 'Сколько раз пропущен'])

# Bar chart: one bar per column name, bar length = number of fragments missing it.
fig = px.bar(
    absent_df, 
    x='Сколько раз пропущен',
    y='Сенсор',
    width=800,
    height=600,
    title='Количество пропущенных сенсоров в train датасете'
)

fig.show()

In [None]:
from collections import Counter

# Count how often each exact combination of entirely-null columns occurs.
# Lists are not hashable, so the string form of each group is used as the key.
# Counter counts the first occurrence as 1, so a combination seen once is
# reported as 1 (not 0).
absent_groups = Counter(str(group) for group in missed_groups)

absent_df = pd.DataFrame(list(absent_groups.items()), columns=['Группы', 'Пропущена'])
absent_df = absent_df.sort_values('Пропущена')

# Horizontal bar chart, ordered by how often each combination occurs.
fig = px.bar(
    absent_df, 
    y="Группы",
    x='Пропущена',
    orientation='h',
    width=800,
    height=600,
    title='Количество пропущенных групп сенсоров в train датасете'
)

fig.show()

In [None]:
# Turn the per-fragment rows into a table. Each row is: segment id, the
# "has an entirely-null column" flag, then one null share per fragment column.
# NOTE(review): the ten share labels assume every fragment has ten columns in
# sensor 1..10 order - confirm against the data.
missing_stat_columns = [
    'segment_id',
    'has_missed_sensors',
    'missed_percent_sensor1',
    'missed_percent_sensor2',
    'missed_percent_sensor3',
    'missed_percent_sensor4',
    'missed_percent_sensor5',
    'missed_percent_sensor6',
    'missed_percent_sensor7',
    'missed_percent_sensor8',
    'missed_percent_sensor9',
    'missed_percent_sensor10',
]
for_df = pd.DataFrame(data=for_df, columns=missing_stat_columns)

for_df

In [None]:
# Attach the missing-data statistics to the training table. No key is given,
# so pandas joins on the column names the two frames share (default inner join).
train = train.merge(for_df)
train

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# Side-by-side histograms of time to eruption: rows flagged as having an
# entirely-null column (left) versus rows without one (right).
fig = make_subplots(rows=1, cols=2)
traces = [
    go.Histogram(
        # .loc selects rows and column in one step instead of chained indexing
        x=train.loc[train['has_missed_sensors'] == 1, 'time_to_eruption'],
        nbinsx=200,
        name='Есть пропущенные сенсоры'
    ),
    go.Histogram(
        x=train.loc[train['has_missed_sensors'] == 0, 'time_to_eruption'],
        nbinsx=200,
        name="Нет пропущенных сенсоров"
    )
]

# Figure.append_trace is deprecated; add_trace with row/col is its replacement.
# Traces fill the grid left to right, two per row (grid indices are 1-based).
for i, trace in enumerate(traces):
    fig.add_trace(trace, row=(i // 2) + 1, col=(i % 2) + 1)

fig.update_layout(
    title_text='Время до извержения для частей с пропущенными сенсорами и без',
    height=600,
    width=1200
)
fig.show()

In [None]:
# Collect the paths of all test fragment files.
# glob.glob yields matches in arbitrary, filesystem-dependent order; sorting
# keeps every list and chart derived from these paths reproducible across runs.
test_frags = sorted(glob.glob("../input/predict-volcanic-eruptions-ingv-oe/test/*"))
len(test_frags)

In [None]:
import os

# Scan every test fragment and collect missing-data statistics.
#   sensors        - distinct column counts seen across fragments
#   observations   - distinct row counts seen across fragments
#   nan_columns    - one entry per (fragment, column) where the column is entirely null
#   missed_groups  - per fragment, the list of entirely-null columns (only if non-empty)
#   for_test_df    - one row per fragment: [segment id, flag, null share of each column]
sensors = set()
observations = set()
nan_columns = list()
missed_groups = list()
for_test_df = list()

for item in test_frags:
    # The segment id is the file name without directory and extension; os.path
    # handles any path separator and dots elsewhere in the path.
    name = int(os.path.splitext(os.path.basename(item))[0])
    frag = pd.read_csv(item)

    # One boolean mask per fragment instead of re-testing every column twice.
    null_mask = frag.isnull()
    # Share of null rows per column, in the fragment's own column order (a 0..1 fraction).
    missed_percents = null_mask.mean().tolist()
    # Columns that contain no data at all in this fragment.
    missed_group = [col for col in frag.columns if null_mask[col].all()]
    # 1 when at least one column of the fragment is entirely null, else 0.
    at_least_one_missed = int(bool(missed_group))

    nan_columns.extend(missed_group)
    if missed_group:
        missed_groups.append(missed_group)
    sensors.add(len(frag.columns))
    observations.add(len(frag))
    for_test_df.append([name, at_least_one_missed] + missed_percents)

In [None]:
# Turn the per-fragment rows into a table. Each row is: segment id, the
# "has an entirely-null column" flag, then one null share per fragment column.
# NOTE(review): the ten share labels assume every fragment has ten columns in
# sensor 1..10 order - confirm against the data.
missing_stat_columns = [
    'segment_id',
    'has_missed_sensors',
    'missed_percent_sensor1',
    'missed_percent_sensor2',
    'missed_percent_sensor3',
    'missed_percent_sensor4',
    'missed_percent_sensor5',
    'missed_percent_sensor6',
    'missed_percent_sensor7',
    'missed_percent_sensor8',
    'missed_percent_sensor9',
    'missed_percent_sensor10',
]
for_test_df = pd.DataFrame(data=for_test_df, columns=missing_stat_columns)

for_test_df

In [None]:
# Report the distinct column counts, the distinct row counts, and the total
# number of entirely-null columns found while scanning the fragments.
for caption, value in (
    ('Количество уникальных сенсоров: ', sensors),
    ('Количество уникальных наблюдений: ', observations),
    ('Количество пропущенных сенсоров:', len(nan_columns)),
):
    print(caption, value)

In [None]:
from collections import Counter

# Count in how many fragments each column was entirely null.
# Counter counts the first occurrence as 1, so a column that is missing in a
# single fragment is reported as 1 (not 0).
absent_sensors = Counter(nan_columns)

absent_df = pd.DataFrame(list(absent_sensors.items()), columns=['Сенсор', 'Сколько раз пропущен'])

# Bar chart: one bar per column name, bar length = number of fragments missing it.
fig = px.bar(
    absent_df, 
    x='Сколько раз пропущен',
    y='Сенсор',
    width=800,
    height=600,
    title='Количество пропущенных сенсоров в test датасете'
)

fig.show()

In [None]:
from collections import Counter

# Count how often each exact combination of entirely-null columns occurs.
# Lists are not hashable, so the string form of each group is used as the key.
# Counter counts the first occurrence as 1, so a combination seen once is
# reported as 1 (not 0).
absent_groups = Counter(str(group) for group in missed_groups)

absent_df = pd.DataFrame(list(absent_groups.items()), columns=['Группы', 'Сколько раз пропущена'])
absent_df = absent_df.sort_values('Сколько раз пропущена')

# Horizontal bar chart, ordered by how often each combination occurs.
fig = px.bar(
    absent_df, 
    y='Группы',
    x='Сколько раз пропущена',
    orientation='h',
    width=800,
    height=600,
    title='Количество пропущенных групп сенсоров в test датасете'
)

fig.show()

In [None]:
# One histogram per column of the inspected fragment, laid out two per row.
# The number of grid rows is derived from the column count so the figure still
# builds if the fragment does not have exactly ten columns.
grid_cols = 2
sensor_columns = list(check.columns)
grid_rows = max(1, (len(sensor_columns) + grid_cols - 1) // grid_cols)

fig = make_subplots(rows=grid_rows, cols=grid_cols)
traces = [
    go.Histogram(
        x=check[col], 
        nbinsx=100, 
        name=col
    ) for col in sensor_columns
]

# Figure.append_trace is deprecated; add_trace with row/col is its replacement.
# Traces fill the grid left to right (grid indices are 1-based).
for i, trace in enumerate(traces):
    fig.add_trace(trace, row=(i // grid_cols) + 1, col=(i % grid_cols) + 1)

fig.update_layout(
    title_text='Данные о распределении сенсоров',
    height=800,
    width=1200
)

fig.show()

In [None]:
# One line plot per column of the inspected fragment, laid out two per row.
# The number of grid rows is derived from the column count so the figure still
# builds if the fragment does not have exactly ten columns.
grid_cols = 2
sensor_columns = list(check.columns)
grid_rows = max(1, (len(sensor_columns) + grid_cols - 1) // grid_cols)

# Sample positions 0..n-1 taken from the fragment's real length (instead of a
# fixed constant), built once and shared by every trace.
sample_positions = list(range(len(check)))

fig = make_subplots(rows=grid_rows, cols=grid_cols)
traces = [
    go.Scatter(
        x=sample_positions, 
        y=check[col], 
        mode='lines', 
        name=col
    ) for col in sensor_columns
]

# Figure.append_trace is deprecated; add_trace with row/col is its replacement.
# Traces fill the grid left to right (grid indices are 1-based).
for i, trace in enumerate(traces):
    fig.add_trace(trace, row=(i // grid_cols) + 1, col=(i % grid_cols) + 1)

fig.update_layout(
    title_text='Данные о сенсорах',
    height=800,
    width=1200
)

fig.show()