In [None]:
import glob
import os
import pandas as pd 
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

**Step 1:**
* Load the train and test metadata, which contain `segment_id` (the ID code of a data segment; it matches the name of the associated CSV file that contains the sensor readings).


In [None]:
#loading the train metadata and the sample submission (its segment_id column lists the test segments)
DATA_DIR="/kaggle/input/predict-volcanic-eruptions-ingv-oe"
train=pd.read_csv(f"{DATA_DIR}/train.csv")
submission=pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

#each segment_id names a csv file holding the sensor readings of that segment;
#load one of them as an example to inspect its layout
_1136037770=pd.read_csv(f"{DATA_DIR}/train/1136037770.csv")
#print explicitly: a bare expression is only rendered when it is the last
#statement of a cell, so the preview and the shape would otherwise be discarded
print(_1136037770.head())
print(f'example segment shape {_1136037770.shape}')

def col_trans(data,n_raw_cols=10):
    """Collapse one segment's readings into a single row of per-column sums.

    A ``<column>_sum`` value is computed for every column of ``data``. The
    result is one row made of the first row of the original columns followed
    by the sum columns, with the first ``n_raw_cols`` columns sliced off.

    Parameters
    ----------
    data : pandas.DataFrame
        Readings of one segment. It is not modified by this function.
    n_raw_cols : int, default 10
        Number of leading columns to discard from the result. With the
        default, a frame of 10 reading columns followed by ``segment_id``
        yields ``segment_id``, the 10 reading sums and ``segment_id_sum``.

    Returns
    -------
    pandas.DataFrame
        A new single-row frame indexed like the first row of ``data``.
    """
    first_row=data.iloc[0:1]
    #sum column by column (not data.sum()) so every sum keeps its own dtype
    sums=pd.DataFrame(
        {f'{col}_sum':data[col].sum() for col in data.columns},
        index=first_row.index,
    )
    #build a new frame instead of adding the sum columns to the caller's frame
    return pd.concat([first_row,sums],axis=1).iloc[:,n_raw_cols:]


**Step 2:**
* Load the CSV file of every `segment_id` from the train and test directories, compute the per-sensor sums for each file, collect them into a new DataFrame, and merge that DataFrame with the existing one on the `segment_id` column.

**Note: Uncomment and run the cell below the first time only (it builds the CSV files of the merged data and may take some time). Comment it out again after the first run.**



In [None]:
def _load_segment_sums(segment_ids,split):
    """Build one row of per-sensor sums for every segment csv of a split.

    Parameters
    ----------
    segment_ids : iterable
        Segment ids; each one names a csv file inside the split directory.
    split : str
        Sub-directory of the competition data to read from ("train" or "test").

    Returns
    -------
    pandas.DataFrame
        One row per segment holding ``segment_id`` and the sum columns
        produced by ``col_trans``, without ``segment_id_sum``.
    """
    rows=[]
    for seg_id in tqdm(segment_ids):
        raw_path=f'/kaggle/input/predict-volcanic-eruptions-ingv-oe/{split}/{seg_id}.csv'
        #missing readings are replaced by 0 so that they do not contribute to the sums
        seg_df=pd.read_csv(raw_path).fillna(0)
        seg_df["segment_id"]=seg_id
        rows.append(col_trans(seg_df))
    #concatenate once at the end: growing a DataFrame inside the loop
    #re-copies every earlier row on each iteration
    sums=pd.concat(rows,axis=0,ignore_index=True)
    #col_trans sums every column, including segment_id itself; that sum is not a feature
    return sums.drop(columns=["segment_id_sum"])


#one row of sensor sums per train segment
train_df=_load_segment_sums(train["segment_id"],"train")
print(f'train features shape {train_df.shape}')

#one row of sensor sums per test segment
test_df=_load_segment_sums(submission["segment_id"],"test")
print(f'test features shape {test_df.shape}')

#merging the feature frames with train and submission on segment_id;
#feature columns left over from an earlier run of this cell are dropped first,
#otherwise a re-run would produce duplicated *_x / *_y columns
train=train.drop(columns=train_df.columns.difference(["segment_id"]),errors="ignore").merge(train_df,on='segment_id')
submission=submission.drop(columns=test_df.columns.difference(["segment_id"]),errors="ignore").merge(test_df,on='segment_id')

#writing the data to csv files so that we wont be going through this step again and again
train.to_csv("train_final.csv",index=False)
submission.to_csv("test_final.csv",index=False)



In [None]:
#read back the merged train and test tables from the csv files in the working directory
train_df_final,test_df_final=(
    pd.read_csv(csv_path) for csv_path in ("./train_final.csv","./test_final.csv")
)
(train_df_final.shape,test_df_final.shape)

**Step 3:**
* Split the training data into train and validation sets, fit the model on the train set, and check its performance on the validation set (MAE and MSE). Then refit on the full training data, predict on the test set, and write the submission file.

In [None]:
#one seed for the split and the forest so that a re-run reproduces the same numbers
SEED=33

#splitting the data into train and validation
train_split,val=train_test_split(train_df_final,test_size=.20,random_state=SEED)
print(f'train shape {train_split.shape} and validation shape {val.shape}')

#time_to_eruption is the target and segment_id an identifier: neither is used as a feature
non_feature_cols=["time_to_eruption","segment_id"]

#train_full
train_df_final_y=train_df_final["time_to_eruption"]
train_df_final_x=train_df_final.drop(columns=non_feature_cols)

#train splitted
train_split_y=train_split["time_to_eruption"]
train_split_x=train_split.drop(columns=non_feature_cols)

#validation
val_split_y=val["time_to_eruption"]
val_split_x=val.drop(columns=non_feature_cols)

#test
test_split_x=test_df_final.drop(columns=non_feature_cols)

#defining a Random forest regressor; without random_state every run grows a different forest
rfr=RandomForestRegressor(random_state=SEED)
rfr.fit(train_split_x,train_split_y)
predicted=rfr.predict(val_split_x)

#sklearn metrics are documented as metric(y_true, y_pred)
print(f'mean squared error is {mean_squared_error(val_split_y,predicted)}')
print(f'mean absolute error is {mean_absolute_error(val_split_y,predicted)}')

#refit on all labelled rows (train + validation) before predicting the test set
rfr.fit(train_df_final_x,train_df_final_y)
predicted=rfr.predict(test_split_x)
submission=pd.DataFrame({"segment_id":test_df_final["segment_id"],"time_to_eruption":predicted})
submission.to_csv("submission_rfr.csv",index=False)