In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

In [None]:
# Per-segment index table; later cells read its segment_id and
# time_to_eruption columns.
train = pd.read_csv("../input/predict-volcanic-eruptions-ingv-oe/train.csv")
train.head()

In [None]:
# Histogram (100 bins) of the time_to_eruption column of `train`.
target_hist_kwargs = {
    "x": "time_to_eruption",
    "nbins": 100,
    "width": 800,
    "height": 500,
    "title": 'Time to eruption distribution',
}
fig = px.histogram(train, **target_hist_kwargs)

fig.show()

In [None]:
# Line plot of time_to_eruption for every row of `train`; no x column is
# given, so only the y values are specified explicitly.
target_line_kwargs = {
    "y": "time_to_eruption",
    "width": 800,
    "height": 500,
    "title": 'Time to eruption for all volcanos',
}
fig = px.line(train, **target_line_kwargs)

fig.show()

In [None]:
# Summary statistics (count, mean, std, quantiles) of the regression target.
train['time_to_eruption'].describe()

In [None]:
# Directories holding the per-segment CSV files, derived from one shared root
# so the competition path is spelled only once.
competition_root = "../input/predict-volcanic-eruptions-ingv-oe/"
train_dir = competition_root + "train/"
test_dir = competition_root + "test/"

In [None]:
def read_csv(index):
    """Summarise one training segment as a single row of per-column means.

    Parameters
    ----------
    index : int
        Positional row of the module-level ``train`` frame. It selects both
        the segment file to read (through ``segment_id``) and the label
        (``time_to_eruption``) attached to the result.

    Returns
    -------
    pandas.DataFrame
        Exactly one row: the mean of every column of the segment file,
        followed by a ``timetoerupt`` column holding the label. A column
        with no valid readings yields NaN.
    """
    segment_id = train.segment_id.iloc[index]
    segment = pd.read_csv(train_dir + str(segment_id) + ".csv")

    # Reduce directly to one row of means. This replaces overwriting every
    # row with the column mean and then drawing a random row, which produced
    # the same values at far higher cost and with an arbitrary index label.
    summary = segment.mean().to_frame().T

    # Attach the label last so it stays the final column.
    summary['timetoerupt'] = train.time_to_eruption.iloc[index]

    return summary

In [None]:
# Build the feature table: one summary row per training segment.
# Rows are collected first and concatenated once; concatenating inside the
# loop re-copies the growing frame on every iteration (quadratic work).
segment_rows = [read_csv(idx) for idx in range(train.shape[0])]

if segment_rows:
    # Reversed so the row order is "last segment first", exactly what
    # prepending each new row inside a loop yields; this keeps the seeded
    # train/test split downstream unchanged. ignore_index replaces the
    # per-row index labels with a unique 0..n-1 range.
    data = pd.concat(segment_rows[::-1], ignore_index=True)
else:
    # No segments listed: keep an empty frame rather than letting
    # pd.concat raise on an empty list.
    data = pd.DataFrame()

data.head()

In [None]:
# (rows, columns) of the assembled feature table.
data.shape

In [None]:
# Number of missing values in each column of the feature table.
data.isnull().sum()

In [None]:
# Mean imputation: each missing entry is replaced by the mean of its own
# column (a column with no valid values at all is left as NaN).
column_means = data.mean()
data = data.fillna(column_means)

In [None]:
# Missing values per column again, to confirm the effect of the imputation.
data.isnull().sum()

# MODEL BUILDING

In [None]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

import statsmodels.api as sm

from sklearn.model_selection import train_test_split

In [None]:
# 80/20 hold-out split with a fixed seed. The whole `data` frame is passed as
# X, so X_train / X_test still contain the timetoerupt target column at this
# point; it is removed from them in the cells that follow.
X_train, X_test, y_train, y_test = train_test_split(data, data.timetoerupt, test_size=0.2, random_state=42)

In [None]:
# Keep only the predictors; the target is already held separately in y_train.
# The column is dropped by keyword (pandas 2.0 no longer accepts the axis as
# a positional argument) and into a new frame rather than in place, so the
# split result is not mutated. errors='ignore' makes the cell safe to re-run.
X_train = X_train.drop(columns='timetoerupt', errors='ignore')

# Add a constant to get an intercept
X_train_sm = sm.add_constant(X_train)

# train the model
lr = sm.OLS(y_train, X_train_sm).fit()

In [None]:
# Keep only the predictors; the target is already held separately in y_test.
# The column is dropped by keyword (pandas 2.0 no longer accepts the axis as
# a positional argument) and into a new frame rather than in place.
# errors='ignore' makes the cell safe to re-run.
X_test = X_test.drop(columns='timetoerupt', errors='ignore')

# Add a constant to get an intercept
X_test_sm = sm.add_constant(X_test)

# prediction on the held-out test dataset
y_test_pred = lr.predict(X_test_sm)

In [None]:
# Display the predictions produced for the held-out rows.
y_test_pred

In [None]:
# Histogram of the predictions on the held-out split.
# y_test_pred holds only the predicted values and has no column called
# "time_to_eruption", so it is wrapped in a one-column frame first; passing
# it directly with x="time_to_eruption" makes plotly raise a ValueError.
predicted_frame = pd.DataFrame({"time_to_eruption": np.asarray(y_test_pred)})

fig = px.histogram(
    predicted_frame,
    x="time_to_eruption",
    width=800,
    height=500,
    nbins=100,
    title='Time to eruption distribution'
)

fig.show()

In [None]:
from sklearn.metrics import mean_squared_error
import math

In [None]:
# Hold-out metrics. Plain arrays are used so the arithmetic is strictly
# positional and cannot be changed by pandas index alignment.
y_true_arr = np.asarray(y_test, dtype=float)
y_pred_arr = np.asarray(y_test_pred, dtype=float)
residuals = y_true_arr - y_pred_arr

mae = np.mean(np.abs(residuals))
print("Mean Absolute Error:", mae)

# Coefficient of determination, 1 - SS_res / SS_tot. The squared Pearson
# correlation is not used here: it ignores bias and scale errors in the
# predictions and can therefore overstate the fit on held-out data.
ss_res = np.sum(residuals ** 2)
ss_tot = np.sum((y_true_arr - y_true_arr.mean()) ** 2)
R2 = 1 - ss_res / ss_tot
print("R2 Score :", R2)

MSE = np.mean(residuals ** 2)
RMSE = float(np.sqrt(MSE))
print("Root Mean Squared Error :", RMSE)