# Importing libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

#  Loading & Preparing data

In [None]:
# Load train.csv from the competition input directory; later cells read its
# `segment_id` and `time_to_eruption` columns.
train = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train.csv')


In [None]:
train.info()

In [None]:
train

In [None]:
# Work on a copy: make_data() adds feature columns to `data` in place, so the
# originally loaded `train` frame is left as read from disk.
data = train.copy()

In [None]:
# Collapse every per-segment CSV into one row of summary statistics.

# Column suffix -> quantile level for the percentile features, in the order
# the feature columns are emitted for each source column.
_SEGMENT_QUANTILES = (
    ('_20', 0.20),
    ('_40', 0.40),
    ('_50', 0.50),
    ('_60', 0.60),
    ('_80', 0.80),
)


def _segment_features(segment):
    """Return a flat ``{feature_name: value}`` dict of statistics for one segment frame."""
    features = {}
    for column in segment.columns:
        series = segment[column]
        features[column + '_mean'] = series.mean()
        features[column + '_std'] = series.std()
        features[column + '_min'] = series.min()
        for suffix, level in _SEGMENT_QUANTILES:
            features[column + suffix] = series.quantile(level)
        features[column + '_max'] = series.max()
    return features


def make_data(frame=None,
              segment_dir='../input/predict-volcanic-eruptions-ingv-oe/train'):
    """Add per-segment summary-statistic columns to a frame, in place.

    For every value in ``frame.segment_id`` the file
    ``<segment_dir>/<segment_id>.csv`` is read, and for each of its columns
    the mean, std, min, the 20/40/50/60/80 % quantiles and the max are
    computed. The results are stored in columns named
    ``<column>_mean``, ``<column>_std``, ``<column>_min``, ``<column>_20`` ...
    ``<column>_max``. Finally every NaN in the frame is replaced by 0.

    Compared with filling the frame one cell at a time, all statistics are
    collected first and attached column-wise. This guarantees that each row
    only ever receives values from its own segment file, and it works for any
    index, not just a default 0..n-1 one.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with a ``segment_id`` column. Defaults to the module-level
        ``data`` frame, so a bare ``make_data()`` call keeps working.
    segment_dir : str, optional
        Directory holding one ``<segment_id>.csv`` file per segment.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, mutated in place.
    """
    if frame is None:
        frame = data  # module-level frame prepared by the notebook

    rows = [
        _segment_features(
            pd.read_csv('{}/{}.csv'.format(segment_dir, segment_id)))
        for segment_id in frame.segment_id
    ]

    # Rows are aligned with `frame` by position, so reuse its index. A
    # statistic missing for a segment (column absent from that file) stays
    # NaN here and is zero-filled below.
    features = pd.DataFrame(rows, index=frame.index, dtype='float64')
    for name in features.columns:
        frame[name] = features[name]

    frame.fillna(0, inplace=True)
    return frame
make_data()    

In [None]:
# Separate copy for modelling: the following cells drop columns from and
# rescale `data_train`, leaving `data` unchanged.
data_train = data.copy()

In [None]:
data_train.info()

In [None]:
data_train.describe()

# Data Preparation

In [None]:
# Keep the segment identifiers and the regression target aside, then remove
# both columns from the modelling frame.
segment_id, y_train = data_train['segment_id'], data_train['time_to_eruption']
data_train.drop(columns=['time_to_eruption', 'segment_id'], inplace=True)

In [None]:
# Find constant columns: a column whose minimum equals its maximum holds a
# single value in every row.
cols = [
    name
    for name in data_train.columns
    if data_train[name].min() == data_train[name].max()
]
cols

In [None]:
# Remove the ten per-sensor median ('_50') feature columns from the training frame.
data_train.drop(
    columns=[
        'sensor_1_50', 'sensor_2_50', 'sensor_3_50', 'sensor_4_50',
        'sensor_5_50', 'sensor_6_50', 'sensor_7_50', 'sensor_8_50',
        'sensor_9_50', 'sensor_10_50',
    ],
    inplace=True,
)

In [None]:
data_train.shape

In [None]:
# Standardise the training features. The fitted scaler is kept in
# `std_scaler` because the test frame is transformed with it later on.
std_scaler = StandardScaler()
std_scaler.fit(data_train)
std_data = std_scaler.transform(data_train)
data_train = pd.DataFrame(data=std_data, columns=data_train.columns)
data_train

# Training Models

### 1. LinearRegression

In [None]:
# Linear baseline; the printed score is computed on the training data itself.
reg_model = LinearRegression().fit(data_train, y_train)
reg_train_score = reg_model.score(data_train, y_train)
print(reg_train_score)


### 2. RandomForestRegressor

In [None]:
# Random forest with default settings; the printed score is on the training data.
forest_model = RandomForestRegressor().fit(data_train, y_train)
forest_train_score = forest_model.score(data_train, y_train)
print(forest_train_score)


In [None]:
# 20-fold cross-validation. The scorer yields negated MSE values, so the sign
# is flipped before taking the square root to obtain one RMSE per fold.
forest_score = cross_val_score(
    forest_model,
    X=data_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=20,
)
forest_rmse_score = np.sqrt(np.negative(forest_score))
print(np.mean(forest_rmse_score))



### 3. DecisionTreeRegressor

In [None]:
# Single decision tree with default settings; the printed score is on the training data.
tree_model = DecisionTreeRegressor().fit(data_train, y_train)
tree_train_score = tree_model.score(data_train, y_train)
print(tree_train_score)


In [None]:
# 20-fold cross-validation of the decision tree; negated MSE per fold is
# converted to RMSE before averaging.
tree_score = cross_val_score(
    tree_model,
    X=data_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=20,
)
tree_rmse_score = np.sqrt(np.negative(tree_score))
print(np.mean(tree_rmse_score))



### 4. SVR

In [None]:
# Support vector regressor with default settings; the printed score is on the training data.
svr_model = SVR().fit(data_train, y_train)
svr_train_score = svr_model.score(data_train, y_train)
print(svr_train_score)


### 5. GradientBoostingRegressor

In [None]:
# Gradient boosting with default settings; the printed score is on the training data.
gbr_model = GradientBoostingRegressor().fit(data_train, y_train)
gbr_train_score = gbr_model.score(data_train, y_train)
print(gbr_train_score)

In [None]:
# 20-fold cross-validation of the gradient boosting model; negated MSE per
# fold is converted to RMSE before averaging.
gbr_score = cross_val_score(
    gbr_model,
    X=data_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=20,
)
gbr_rmse_score = np.sqrt(np.negative(gbr_score))
print(np.mean(gbr_rmse_score))


### 6. xgb

In [None]:
# XGBoost regressor with default settings; the printed score is on the training data.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X=data_train, y=y_train)
xgb_train_score = xgb_model.score(data_train, y_train)
print(xgb_train_score)

In [None]:
# 20-fold cross-validation of the XGBoost model; negated MSE per fold is
# converted to RMSE before averaging.
xgb_score = cross_val_score(
    xgb_model,
    X=data_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=20,
)
xgb_rmse_score = np.sqrt(np.negative(xgb_score))
print(np.mean(xgb_rmse_score))


### 7. lgb

In [None]:
# LightGBM regressor with default settings; the printed score is on the training data.
lgb_model = lgb.LGBMRegressor()
lgb_model.fit(X=data_train, y=y_train)
lgb_train_score = lgb_model.score(data_train, y_train)
print(lgb_train_score)

In [None]:
# 20-fold cross-validation of the LightGBM model; negated MSE per fold is
# converted to RMSE before averaging.
lgb_score = cross_val_score(
    lgb_model,
    X=data_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=20,
)
lgb_rmse_score = np.sqrt(np.negative(lgb_score))
print(np.mean(lgb_rmse_score))


# FINE TUNE THE MODELS


### 1. RandomForestRegressor

In [None]:
# Two-step pipeline: univariate feature selection, then a seeded random forest.
forest_tune_pipeline = Pipeline(steps=[
    ('selector', SelectKBest(score_func=f_regression)),
    ('model', RandomForestRegressor(random_state=42)),
])

# Grid keys address pipeline steps as `<step>__<parameter>`.
forest_param_grid = {
    'selector__k': [70],
    'model__n_estimators': np.arange(500, 550, 50),
    'model__max_depth': [15],
}

# 20-fold grid search scored by negated MSE, using all available cores.
forest_grid_search = GridSearchCV(
    estimator=forest_tune_pipeline,
    param_grid=forest_param_grid,
    scoring="neg_mean_squared_error",
    cv=20,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the search, then report the winning parameters and the RMSE derived
# from the best (negated MSE) cross-validation score.
forest_grid_search.fit(data_train, y_train)
forest_best_rmse = np.sqrt(-forest_grid_search.best_score_)
print('the best parameters : ', forest_grid_search.best_params_)
print('the best score = ', forest_best_rmse)

### 2. DecisionTreeRegressor

In [None]:
# Two-step pipeline: univariate feature selection, then a seeded decision tree.
tree_tune_pipeline = Pipeline(steps=[
    ('selector', SelectKBest(score_func=f_regression)),
    ('model', DecisionTreeRegressor(random_state=42)),
])

# Grid keys address pipeline steps as `<step>__<parameter>`.
tree_param_grid = {
    'selector__k': [70, 80],
    'model__max_depth': [12, 13, 15],
}

# 20-fold grid search scored by negated MSE, using all available cores.
tree_grid_search = GridSearchCV(
    estimator=tree_tune_pipeline,
    param_grid=tree_param_grid,
    scoring="neg_mean_squared_error",
    cv=20,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the search, then report the winning parameters and the RMSE derived
# from the best (negated MSE) cross-validation score.
tree_grid_search.fit(data_train, y_train)
tree_best_rmse = np.sqrt(-tree_grid_search.best_score_)
print('the best parameters : ', tree_grid_search.best_params_)
print('the best score = ', tree_best_rmse)

### 3. xgb

In [None]:
# Two-step pipeline: univariate feature selection, then a seeded XGBoost regressor.
xgb_tune_pipeline = Pipeline(steps=[
    ('selector', SelectKBest(score_func=f_regression)),
    ('model', xgb.XGBRegressor(random_state=42)),
])

# Grid keys address pipeline steps as `<step>__<parameter>`; only the number
# of estimators has more than one candidate value.
xgb_param_grid = {
    'selector__k': [70],
    'model__learning_rate': [0.05],
    'model__n_estimators': [3000, 5000],
    'model__max_depth': [10],
    'model__colsample_bytree': [0.3],
}

# 20-fold grid search scored by negated MSE, using all available cores.
xgb_grid_search = GridSearchCV(
    estimator=xgb_tune_pipeline,
    param_grid=xgb_param_grid,
    scoring="neg_mean_squared_error",
    cv=20,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the search, then report the winning parameters and the RMSE derived
# from the best (negated MSE) cross-validation score.
xgb_grid_search.fit(data_train, y_train)
xgb_best_rmse = np.sqrt(-xgb_grid_search.best_score_)
print('the best parameters : ', xgb_grid_search.best_params_)
print('the best score = ', xgb_best_rmse)

### 4. lgb

In [None]:
# LightGBM regressor with fixed bagging / feature-sampling settings and seeds.
lgb_base_model = lgb.LGBMRegressor(
    random_state=42,
    objective='regression',
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

# Two-step pipeline: univariate feature selection, then the LightGBM model.
lgb_tune_pipeline = Pipeline(steps=[
    ('selector', SelectKBest(score_func=f_regression)),
    ('model', lgb_base_model),
])

# Grid keys address pipeline steps as `<step>__<parameter>`; only the number
# of leaves has more than one candidate value.
# NOTE(review): both `num_iterations` and `n_estimators` are set here; the
# LightGBM parameter docs list them as aliases of the same setting - confirm
# which value (10000 or 500) is the intended number of boosting rounds.
lgb_param_grid = {
    'selector__k': [70],
    'model__learning_rate': [0.01],
    'model__num_iterations': [10000],
    'model__n_estimators': [500],
    'model__max_bin': [100],
    'model__num_leaves': [40, 50],
}

# 20-fold grid search scored by negated MSE, using all available cores.
lgb_grid_search = GridSearchCV(
    estimator=lgb_tune_pipeline,
    param_grid=lgb_param_grid,
    scoring="neg_mean_squared_error",
    cv=20,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the search, then report the winning parameters and the RMSE derived
# from the best (negated MSE) cross-validation score.
lgb_grid_search.fit(data_train, y_train)
lgb_best_rmse = np.sqrt(-lgb_grid_search.best_score_)
print('the best parameters : ', lgb_grid_search.best_params_)
print('the best score = ', lgb_best_rmse)



### Summary
The best-performing algorithm was **lgb**, followed by **xgb**, then **RandomForestRegressor**, then **DecisionTreeRegressor**.

# Final Model for Test set



### Working on Test data processing


In [None]:
# Use sample_submission.csv as the test frame: its `segment_id` column lists
# the test segments to featurise, and its `time_to_eruption` column is
# dropped again before prediction.
test = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv')
test

In [None]:
# Column suffix -> quantile level for the percentile features, in the order
# the feature columns are emitted for each source column.
_SEGMENT_QUANTILES = (
    ('_20', 0.20),
    ('_40', 0.40),
    ('_50', 0.50),
    ('_60', 0.60),
    ('_80', 0.80),
)


def _segment_features(segment):
    """Return a flat ``{feature_name: value}`` dict of statistics for one segment frame."""
    features = {}
    for column in segment.columns:
        series = segment[column]
        features[column + '_mean'] = series.mean()
        features[column + '_std'] = series.std()
        features[column + '_min'] = series.min()
        for suffix, level in _SEGMENT_QUANTILES:
            features[column + suffix] = series.quantile(level)
        features[column + '_max'] = series.max()
    return features


def make_data(frame=None,
              segment_dir='../input/predict-volcanic-eruptions-ingv-oe/test'):
    """Add per-segment summary-statistic columns to the test frame, in place.

    For every value in ``frame.segment_id`` the file
    ``<segment_dir>/<segment_id>.csv`` is read, and for each of its columns
    the mean, std, min, the 20/40/50/60/80 % quantiles and the max are
    computed. The results are stored in columns named
    ``<column>_mean``, ``<column>_std``, ``<column>_min``, ``<column>_20`` ...
    ``<column>_max``. Finally every NaN in the frame is replaced by 0.

    Compared with filling the frame one cell at a time, all statistics are
    collected first and attached column-wise. This guarantees that each row
    only ever receives values from its own segment file, and it works for any
    index, not just a default 0..n-1 one.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with a ``segment_id`` column. Defaults to the module-level
        ``test`` frame, so a bare ``make_data()`` call keeps working.
    segment_dir : str, optional
        Directory holding one ``<segment_id>.csv`` file per segment.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, mutated in place.
    """
    if frame is None:
        frame = test  # module-level frame prepared by the notebook

    rows = [
        _segment_features(
            pd.read_csv('{}/{}.csv'.format(segment_dir, segment_id)))
        for segment_id in frame.segment_id
    ]

    # Rows are aligned with `frame` by position, so reuse its index. A
    # statistic missing for a segment (column absent from that file) stays
    # NaN here and is zero-filled below.
    features = pd.DataFrame(rows, index=frame.index, dtype='float64')
    for name in features.columns:
        frame[name] = features[name]

    frame.fillna(0, inplace=True)
    return frame
make_data()    

In [None]:
# Keep the test segment identifiers for the submission files, then remove the
# identifier and placeholder target columns from the test frame.
segment_id_test = test['segment_id']
test.drop(columns=['time_to_eruption', 'segment_id'], inplace=True)

In [None]:
# Remove the same ten per-sensor median ('_50') columns that were dropped
# from the training frame.
test.drop(
    columns=[
        'sensor_1_50', 'sensor_2_50', 'sensor_3_50', 'sensor_4_50',
        'sensor_5_50', 'sensor_6_50', 'sensor_7_50', 'sensor_8_50',
        'sensor_9_50', 'sensor_10_50',
    ],
    inplace=True,
)

In [None]:
# Scale the test features with the scaler fitted on the training data, then
# wrap the result back into a frame with the original column labels.
test_columns = test.columns
std_test = std_scaler.transform(test)
test = pd.DataFrame(data=std_test, columns=test_columns)
test

### Final Model 

### We will try *lgb* alone, then the ensemble *[lgb+xgb]*, and finally the ensemble *[lgb+xgb+RandomForestRegressor]*

In [None]:
# Take the best pipeline found by each grid search.
lgb_final_model, xgb_final_model, forest_final_model = (
    search.best_estimator_
    for search in (lgb_grid_search, xgb_grid_search, forest_grid_search)
)


In [None]:
# Predict on the scaled test frame with each tuned pipeline.
lgb_y_pred, xgb_y_pred, forest_y_pred = (
    model.predict(test)
    for model in (lgb_final_model, xgb_final_model, forest_final_model)
)

### Submission

In [None]:
# Submission 1: LightGBM predictions only.
sub = pd.DataFrame({
    'segment_id': segment_id_test,
    'time_to_eruption': lgb_y_pred,
})
sub.to_csv('submission_lgb.csv', index=False)

In [None]:
# Submission 2: weighted blend of LightGBM (0.6) and XGBoost (0.4) predictions.
lgb_xgb_blend = 0.6 * lgb_y_pred + 0.4 * xgb_y_pred
sub_ensemble = pd.DataFrame({
    'segment_id': segment_id_test,
    'time_to_eruption': lgb_xgb_blend,
})
sub_ensemble.to_csv('submission_ensemble_1.csv', index=False)

In [None]:
# Submission 3: weighted blend of LightGBM (0.60), XGBoost (0.30) and
# random forest (0.10) predictions.
three_model_blend = 0.60 * lgb_y_pred + 0.30 * xgb_y_pred + 0.10 * forest_y_pred
sub_ensemble_ = pd.DataFrame({
    'segment_id': segment_id_test,
    'time_to_eruption': three_model_blend,
})
sub_ensemble_.to_csv('submission_ensemble_2.csv', index=False)