In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Read the dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    "data.csv",
    index_col=0,
)

In [None]:
# Peek at five random rows; the fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
df.sample(5, random_state=42)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Number of rows for each distinct value of `y`.
df.y.value_counts()

In [None]:
# Parse `Timestamp` into datetimes, then derive the calendar year from it.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['year'] = df['Timestamp'].dt.year

In [None]:
# Count rows per (year, y) pair; `count()` tallies the non-null `TOTUSJH`
# values in each group, and `reset_index()` turns the group keys back
# into ordinary columns.
df_year_count = (
    df.groupby(['year', 'y'])['TOTUSJH']
    .count()
    .reset_index()
)

In [None]:
# Rename the aggregated column so it reads as a count rather than a feature.
df_year_count.columns = ['year', 'y', 'count']
# Keep every (year, y) row whose `y` differs from 0.
df_year_count_except_0 = df_year_count.loc[df_year_count['y'].ne(0)]

In [None]:
# Yearly counts for every `y` value other than 0, one line per value.
fig, ax = plt.subplots()
sns.lineplot(
    x='year',
    y='count',
    hue='y',
    palette="viridis",
    data=df_year_count_except_0,
    ax=ax,
)
ax.set_title("Number of solar flare X over the years")
plt.show()

In [None]:
# Yearly counts restricted to the rows where `y` equals 4.
df_year_count_4 = df_year_count.loc[df_year_count['y'].eq(4)]

In [None]:
# Yearly counts for the `y == 4` rows only.
fig, ax = plt.subplots()
sns.lineplot(
    x='year',
    y='count',
    hue='y',
    palette="viridis",
    data=df_year_count_4,
    ax=ax,
)
ax.set_title("Number of solar flare X over the years")
plt.show()

In [None]:
# Correlation of every numeric column with `y`, drawn as a one-column heatmap.
# The frame also holds non-numeric data (e.g. the datetime `Timestamp`
# column), which `DataFrame.corr()` either warns about or rejects depending
# on the pandas version, so restrict the input to numeric columns explicitly.
fig, ax = plt.subplots(figsize=(10, 10))
corr_with_y = df.select_dtypes(include='number').corr()[['y']]
sns.heatmap(corr_with_y, vmin=-1, vmax=1, annot=True, ax=ax)
ax.set_title("Correlation plot of variable with y")
plt.show()

In [None]:
# Columns examined in detail by the plots below.
features = [
    'TOTUSJH',
    'TOTBSQ',
    'TOTUSJZ',
    'USFLUX',
    'TOTFZ',
    'R_VALUE',
]

In [None]:
# Kernel density estimate of each feature, one panel per value of `y`.
for feature in features:
    # FacetGrid builds its own figure, so no explicit plt.figure() is needed
    # (calling it here would only leave an empty figure per feature).
    g = sns.FacetGrid(df[[feature, 'y']], col='y')
    g.map(sns.kdeplot, feature)
plt.show()

In [None]:
# Box plot of each feature grouped by `y`, laid out on a 2x3 grid.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10), sharex=False, sharey=False)
axes = axes.ravel()
# Draw straight onto the grid axes; opening another figure here would only
# add an empty canvas to the output.
for ax, feature in zip(axes, features):
    sns.boxplot(data=df, x="y", y=feature, ax=ax)
fig.suptitle("Values for each solar flares")
plt.show()

In [None]:
# seaborn's regplot needs a numeric x, so convert each timestamp to its
# proleptic Gregorian ordinal (an integer day number).
df['date_ordinal'] = pd.to_datetime(df['Timestamp']).apply(lambda ts: ts.toordinal())
# Linear trend of each feature over time, one panel per value of `y`.
for feature in features:
    # FacetGrid creates its own figure; no plt.figure() call is required.
    g = sns.FacetGrid(df[[feature, 'y', 'date_ordinal']], col='y')
    g.map(sns.regplot, 'date_ordinal', feature, line_kws={"color": "red"})
plt.show()

In [None]:
# Palette cycled across the per-class panels.
color = ['#264653', '#2a9d8f', '#e9c46a', '#f4a261', '#e76f51']
# Build the panels from the labels actually present in `y` instead of
# assuming they are exactly 0..4, so the grid always matches the data.
classes = sorted(df['y'].dropna().unique())
for feature in features:
    # One row of panels per feature; squeeze=False keeps `axs` two-dimensional
    # even if only a single class is present.
    fig, axs = plt.subplots(1, len(classes), figsize=(24, 4), squeeze=False)
    for i, (ax, label) in enumerate(zip(axs[0], classes)):
        sns.lineplot(
            data=df[df['y'] == label],
            x='Timestamp',
            y=feature,
            ax=ax,
            color=color[i % len(color)],
        )
        ax.set_title(f"y = {label}")
    # Title every figure; outside the loop only the last one would get it.
    fig.suptitle("Visualization of values for each variable in each timestamp")
plt.show()

In [None]:
# Scatter plus fitted regression line of `y` against each feature, on a 2x3 grid.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10), sharex=False, sharey=False)
axes = axes.ravel()
# Plot directly on the grid axes; an extra plt.figure() would only emit an
# empty figure alongside the grid.
for ax, feature in zip(axes, features):
    sns.regplot(
        data=df,
        x=feature,
        y='y',
        ax=ax,
        scatter_kws={"color": "blue"},
        line_kws={"color": "red"},
    )
fig.suptitle("Correlation of each variable with the solar flares")
plt.show()