# Approach:

1) Read a csv file given in "train" folder, take mean of all features and put it into a dataframe.
2) Add "time to erupt" feature from the train.csv file to the dataframe created in step # 1
3) repeat steps # 1 and 2 for all the csv files present in "train" folder

At the end of step 3, we will have a dataframe with one row per segment, holding the mean of the recordings from each sensor. I saved this dataframe to a file and loaded it back from "../input/volcano-eruption-data"; that's why you may find that some code has been commented out to save execution time.

Next, I have built a naive model to predict the "time to erupt" for test data.

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
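# NOTE: `train` is required by the read_csv() helper for the "train" folder;
# uncomment these lines when rebuilding data.csv from the raw segment files.
#train = pd.read_csv("../input/predict-volcanic-eruptions-ingv-oe/train.csv")
#train.head()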

In [None]:
# Directory prefixes for the per-segment CSV files. The read_csv helpers build
# a path as prefix + "<segment_id>.csv", so the trailing "/" must be kept.
train_dir = "../input/predict-volcanic-eruptions-ingv-oe/train/"
test_dir = "../input/predict-volcanic-eruptions-ingv-oe/test/"

### Helper function to read csv files present in the "train" folder

In [None]:
def read_csv(index):
    """Summarise one training segment as a single row of per-column means.

    Reads ``train_dir + "<segment_id>.csv"`` for the segment at position
    ``index`` of the module-level ``train`` frame, averages every column of
    that file, and appends the segment's ``time_to_eruption`` value as a
    ``timetoerupt`` column.

    Parameters
    ----------
    index : int
        Positional row number in ``train`` (looked up with ``.iloc``).

    Returns
    -------
    pandas.DataFrame
        Exactly one row: the mean of each column in the segment file,
        followed by ``timetoerupt``.
    """
    segment_id = train.segment_id.iloc[index]
    segment = pd.read_csv(train_dir + str(segment_id) + ".csv")

    # Collapse the recording to one row of column means directly instead of
    # overwriting every row with the mean and then drawing a random row.
    # This also removes the positional ``axis`` argument to ``drop``, which
    # pandas 2.0 no longer accepts, and makes the returned index deterministic.
    row = segment.mean().to_frame().T
    row['timetoerupt'] = train.time_to_eruption.iloc[index]

    return row

### Read the files and create a dataframe

In [None]:
#data = pd.DataFrame()

#for idx in range(train.shape[0]):
#    df = read_csv(idx)
    
#    data=pd.concat([df,data])

I have already run the above steps, created the dataframe, saved it to a csv file, and loaded it back for further use.
We will load the same file below.

In [None]:
# Load the cached training table (per-segment sensor means plus `timetoerupt`)
# instead of rebuilding it from the raw segment files.
data = pd.read_csv("../input/volcano-eruption-data/data.csv")
data.head()

In [None]:
# Sanity check: the row count should match the number of training segments,
# which confirms whether every segment file was read.
data.shape

In [None]:
# Count the missing values in each column before imputation.
data.isnull().sum()

In [None]:
# Mean-impute missing values one column at a time. Every column of `data` is
# processed, which includes the target column `timetoerupt`.
for column in data.columns:
    column_mean = data[column].mean()
    data[column] = data[column].replace(np.nan, column_mean)

In [None]:
# Re-count missing values after imputation; the counts are expected to be zero
# unless a column was entirely NaN (its mean would then be NaN as well).
data.isnull().sum()

# Model Building

In [None]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

import statsmodels.api as sm

from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation with a fixed seed. The full frame is
# passed as the features, so the target column is still present in X_train and
# X_test after this call and is removed in the cells that follow.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data["timetoerupt"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Remove the target from the feature matrix. The result of drop(columns=...)
# is rebound rather than dropped in place: pandas 2.0 rejects the positional
# `axis` argument, and in-place mutation of a train_test_split slice can
# trigger SettingWithCopyWarning.
X_train = X_train.drop(columns='timetoerupt')

# Add a constant to get an intercept
X_train_sm = sm.add_constant(X_train)

# train the model
lr = sm.OLS(y_train, X_train_sm).fit()

In [None]:
# Print the statsmodels summary of the fitted OLS model `lr`.
print(lr.summary())

In [None]:
# Remove the target from the held-out features. drop(columns=...) replaces the
# positional `axis` argument that pandas 2.0 rejects, and rebinding avoids
# mutating a train_test_split slice in place.
X_test = X_test.drop(columns='timetoerupt')

# Add a constant to get an intercept
X_test_sm = sm.add_constant(X_test)

# prediction on the held-out (validation) split, not the training rows
y_test_pred = lr.predict(X_test_sm)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# r2_score takes (y_true, y_pred). R² is not symmetric in its arguments, so
# the ground truth must be passed first to get the model's actual score.
r_squared = r2_score(y_test, y_test_pred)
r_squared

In [None]:
# Load the sample submission. Its `segment_id` column is used to locate the
# test segment files, and the frame is reused later to hold the predictions.
sub = pd.read_csv("../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv")
sub.head()

### Helper function to read csv files in the test folder

In [None]:
def read_csv(index):
    """Summarise one test segment as a single row of per-column means.

    NOTE: this definition re-uses the name of the training helper defined
    earlier in the notebook and replaces it once this cell has run.

    Reads ``test_dir + "<segment_id>.csv"`` for the segment at position
    ``index`` of the module-level ``sub`` frame and averages every column.

    Parameters
    ----------
    index : int
        Positional row number in ``sub`` (looked up with ``.iloc``).

    Returns
    -------
    pandas.DataFrame
        Exactly one row holding the mean of each column in the segment file.
    """
    segment_id = sub.segment_id.iloc[index]
    segment = pd.read_csv(test_dir + str(segment_id) + ".csv")

    # Compute the column means directly instead of overwriting every row with
    # the mean and drawing a random row; the result is the same values with a
    # deterministic index and no per-column full-length assignment.
    return segment.mean().to_frame().T

In [None]:
#test = pd.DataFrame()

#for idx in range(sub.shape[0]):
#    df = read_csv(idx)
    
#    test = pd.concat([df,test])

In [None]:
# The steps above have already been run and their result saved; load that
# cached per-segment test table here instead of rebuilding it.
test = pd.read_csv("../input/volcano-eruption-data/test.csv")
test.head()

In [None]:
# Sanity check: the row count should match the number of rows in `sub`,
# which confirms whether every test segment file was read.
test.shape

In [None]:
# Count the missing values in each column of the test table.
test.isnull().sum()

In [None]:
# Same treatment as the training data: mean-impute missing values column by
# column, using the means of the test table itself.
for column in test.columns:
    column_mean = test[column].mean()
    test[column] = test[column].replace(np.nan, column_mean)

In [None]:
#test.to_csv('test.csv',index=False)

In [None]:
# Add a constant to get an intercept (the model was fitted with one, so the
# test design matrix needs the same extra column).
test_sm = sm.add_constant(test)

# prediction on test dataset
predictions = lr.predict(test_sm)

# NOTE(review): if `predictions` is a pandas Series this assignment aligns on
# index, and it assumes row i of `test` belongs to row i of `sub`. Confirm
# the row order of the cached test.csv matches sample_submission.csv.
sub['time_to_eruption'] = predictions

In [None]:
# Preview the submission frame with the predicted `time_to_eruption` values.
sub.head()

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv('submission.csv',index=False)