In [None]:
# Work with Data - the main Python libraries
import numpy as np
import pandas as pd
import pandas_profiling as pp
from collections import defaultdict
# Visualization
import matplotlib.pyplot as plt
import plotly
import plotly.offline as pyo
import plotly.graph_objs as go
# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, ShuffleSplit, GridSearchCV


# Modeling
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.datasets import make_regression
from lightgbm import LGBMRegressor
import lightgbm as lgb
# Metrics
from sklearn.metrics import r2_score


import os
# List the files of the dataset this notebook actually reads
# (the previous path pointed at an unrelated dataset directory).
for dirname, _, filenames in os.walk('../input/wq-southern-bug-river-01052021'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import warnings
warnings.simplefilter('ignore')

In [None]:
# Show up to 200 characters per cell when displaying frames.
# The fully-qualified option name is used so the lookup cannot become
# ambiguous if another option containing "max_colwidth" is ever added.
pd.set_option('display.max_colwidth', 200)

In [None]:
# Load the observations table (semicolon-separated CSV, first row is the header)
data = pd.read_csv(
    '../input/wq-southern-bug-river-01052021/PB_All_2000_2021.csv',
    sep=';',
    header=0,
)
data

In [None]:
# Load the monitoring-station table (cp1251-encoded, semicolon-separated CSV)
# and display it ordered by the `length` column, largest value first
data_about = pd.read_csv(
    '../input/wq-southern-bug-river-01052021/PB_stations.csv',
    sep=';',
    header=0,
    encoding='cp1251',
)
data_about.sort_values(by=['length'], ascending=False)

In [None]:
data['id'].value_counts().sort_values().plot(kind='barh')

In [None]:
# Derive a parsed timestamp column `ds` and the calendar `year` from it.
# NOTE(review): no explicit format/dayfirst is given — confirm that the `date`
# strings are not day-first, otherwise day and month may be swapped silently.
data = data.assign(
    ds=lambda frame: pd.to_datetime(frame['date']),
    year=lambda frame: frame['ds'].dt.year,
)
data.info()

In [None]:
data[['id', 'year']].groupby(by=['id']).min().sort_values(by=['year'], ascending=False)

In [None]:
data[['id', 'year']].groupby(by=['id']).max().sort_values(by=['year'], ascending=False)

In [None]:

# Ids of the monitoring stations kept for the rest of the notebook.
# NOTE(review): the selection criterion is not stated — presumably based on
# the per-station year ranges shown above; confirm.
stations_good = [14,15,16]
# Show the station-table rows for the selected ids
data_about[data_about['id'].isin(stations_good)]

In [None]:
# Indicator that will be predicted
target_data_name = 'Suspended'

# Indicators pulled from the raw table. Alternative sets tried earlier:
#   ['NH4', 'BSK5', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']
#   ['Suspended', 'BSK5', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']
#   ['Suspended', 'BSK5', 'NO2']
feature_target_all = ['Suspended', 'BSK5']

# Second name for the very same list object (the target indicator is already
# included, so nothing is appended)
feature_data_all = feature_target_all
feature_data_all

In [None]:
# Keep station id, timestamp and the selected indicators for the chosen
# stations only; rows with any missing value are dropped and the index rebuilt
selected_columns = ['id', 'ds'] + feature_data_all
in_good_station = data['id'].isin(stations_good)
df_indicator = (
    data.loc[in_good_station, selected_columns]
    .dropna()
    .reset_index(drop=True)
)
df_indicator

In [None]:
# "<station>_<indicator>" labels, station-major: all indicators of the first
# station, then all indicators of the next one, and so on
cols = [
    f"{station}_{feature}"
    for station in stations_good
    for feature in feature_data_all
]
cols

In [None]:
# Reshape to one row per timestamp and one column per (indicator, station) pair;
# duplicate (ds, id) readings are averaged (pivot_table's default aggfunc) and
# timestamps lacking any station/indicator value are dropped.
df = pd.pivot_table(df_indicator, index=["ds"], columns=["id"], values=feature_data_all).dropna()

# pivot_table returns a column MultiIndex sorted by indicator name first and
# station id second. That order differs from the station-major `cols` list, so
# assigning `cols` positionally would attach wrong labels (e.g. BSK5 data named
# "14_Suspended"). Build every label from the column's own (indicator, station)
# key instead. int() keeps the label "14_..." even if the id column was read as float.
df.columns = [f"{int(station)}_{feature}" for feature, station in df.columns]
df

In [None]:
df.info()

In [None]:
df.plot(figsize=(12,10))

In [None]:
df.describe([.05, .5, .96])

In [None]:
# Columns screened for anomalies: every column of the pivoted table
# (the target could be excluded with cols_anomal.remove(target_name))
cols_anomal = list(df.columns)
print(cols_anomal)

In [None]:
# Drop the rows whose value exceeds the 96th percentile in any screened column.
# The percentile is recomputed on the already-filtered frame for each column,
# so the columns are processed sequentially, not independently.
df_len0 = len(df)
for col in cols_anomal:
    # Scalar quantile of the column; avoids converting a one-element Series
    # with float(), which pandas has deprecated.
    upper_bound = df[col].quantile(0.96)
    df = df[df[col] <= upper_bound]
# The timestamp index is discarded here; rows are renumbered from 0
df = df.reset_index(drop=True)
print(f"The number of observational data before filtering anomalies - {df_len0} and the number after - {len(df)}")
print(f"It is desirable that after filtering the anomalies there is enough data: at least 65% - {int(0.65*df_len0)}")
df.describe()

In [None]:
df.plot(figsize=(12,10))

In [None]:
#pp.ProfileReport(df)

In [None]:
# Target: the chosen indicator at station 14.
# pop() removes the column from `df` in place, so `df` holds only features
# afterwards — and re-running this cell raises KeyError.
target_data_name = 'Suspended'
target_name = f'14_{target_data_name}'
target_data = df.pop(target_name)
target_data

In [None]:
# Hold out 25% of the rows as the test set (fixed seed for repeatability)
train, test, target, target_test = train_test_split(
    df,
    target_data,
    test_size=0.25,
    random_state=0,
)
print(train.shape, test.shape)

In [None]:
# Standardize the features; the scaler is fitted on the training rows only.
scaler = StandardScaler()
# Keep the original row labels: without index=..., the frame gets a fresh
# 0..n-1 index while `target` keeps the shuffled labels from the split, and
# any index-aligned operation between the two would silently pair wrong rows.
train = pd.DataFrame(scaler.fit_transform(train), columns=train.columns, index=train.index)

# Display training data
train

In [None]:
# Apply the scaler fitted on the training rows to the test rows. The row labels
# are kept so that `test` stays aligned with `target_test` by index as well as
# by position.
test = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)

In [None]:
# Keep a copy of the full training portion, then carve 20% of it out as the
# validation set. `train` is rebound to the smaller part, so re-running this
# cell would shrink it again.
train_all, target_all = train.copy(), target.copy()
train, valid, target_train, target_valid = train_test_split(
    train_all,
    target_all,
    test_size=0.2,
    random_state=0,
)

In [None]:
#cv_train = ShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
cv_train = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
cv_train

In [None]:
# Score table: one row per model, filled in by the modelling cells below.
# The score columns start as floats (0.0) because R^2 values are assigned
# later; integer columns would force a dtype-incompatible assignment.
result = pd.DataFrame({'model' : ['Linear Regression', 'Random Forest Regressor',
                                  'XGBoost Regressor', 'LGB', 'Average prediction'],
                       'train_score': 0.0, 'valid_score': 0.0})
result

Modeling

In [None]:
# Linear Regression baseline (fit() returns the fitted estimator itself)
lr = LinearRegression().fit(train, target_train)

# Predictions on the rows the model was trained on
y_train_lr = lr.predict(train)

# R^2 on the training rows, rounded to 2 decimals
r2_score_acc = round(r2_score(target_train, y_train_lr), 2)
print(f'Accuracy of Linear Regression model training is {r2_score_acc}')

# Store the score in the model's row of the result table
is_lr_row = result['model'] == 'Linear Regression'
result.loc[is_lr_row, 'train_score'] = r2_score_acc

In [None]:
# Score the linear model on the validation rows (R^2 rounded to 2 decimals),
# report it and store it in the result table
y_val_lr = lr.predict(valid)
r2_score_acc_valid = round(r2_score(target_valid, y_val_lr), 2)
print(f'Accuracy of Linear Regression model prediction for valid dataset is {r2_score_acc_valid}')
is_lr_row = result['model'] == 'Linear Regression'
result.loc[is_lr_row, 'valid_score'] = r2_score_acc_valid

LGBM

In [None]:
# LightGBM datasets: training rows, and validation rows bound to the training
# dataset through `reference`
lgb_train = lgb.Dataset(train, target_train)
lgb_eval = lgb.Dataset(valid, target_valid,  reference=lgb_train)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (not a set) keeps the metric order identical between runs
    'metric': ['l2', 'l1'],
    'num_leaves': 5,
    'learning_rate': 0.05,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.4,
    'bagging_freq': 5,
    'verbose': 0
}
# Early stopping is requested through the callback: the
# `early_stopping_rounds` keyword of lgb.train() no longer exists in
# LightGBM 4, while the callback is accepted by old and new releases alike.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=5,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=3)])

y_train_lgb = gbm.predict(train)
# R^2 on the training rows, rounded to 5 decimals
r2_score_acc = round(r2_score(target_train, y_train_lgb), 5)
# The message previously named "Linear Regression" by mistake
print(f'Accuracy of LGB model training is {r2_score_acc}')

# Save to result dataframe
result.loc[result['model'] == 'LGB', 'train_score'] = r2_score_acc

In [None]:
# Score the LightGBM model on the validation rows (R^2 rounded to 5 decimals),
# store it in the result table and report it
y_val_lgb = gbm.predict(valid)
r2_score_acc_valid = round(r2_score(target_valid, y_val_lgb), 5)
is_lgb_row = result['model'] == 'LGB'
result.loc[is_lgb_row, 'valid_score'] = r2_score_acc_valid
print(f'Accuracy of LGB model prediction for valid dataset is {r2_score_acc_valid}')

In [None]:
ax = lgb.plot_importance(gbm, max_num_features=40, figsize=(15,15))
plt.show()

In [None]:
%%time
# XGBoost Regressor tuned by an exhaustive grid search over a small grid
xgbr = xgb.XGBRegressor()

# A wider grid that was tried earlier:
#   n_estimators: [60, 70, 80, 90, 95, 100, 105, 110, 120, 130, 140]
#   learning_rate: [0.005, 0.01, 0.05, 0.075, 0.1]
#   max_depth: [3, 5, 7, 9]
#   reg_lambda: [0.1, 0.3, 0.5]
parameters = dict(
    n_estimators=[60, 75],
    learning_rate=[0.02, 0.05],
    max_depth=[4, 8],
    reg_lambda=[0.1, 0.3],
)

# Cross-validated search using the shared `cv_train` splitter, all CPU cores
xgb_CV = GridSearchCV(estimator=xgbr, param_grid=parameters, cv=cv_train, n_jobs=-1)
xgb_CV.fit(train, target_train)
print("Best score: %0.3f" % xgb_CV.best_score_)
print("Best parameters set:", xgb_CV.best_params_)

# Predictions of the search object on the training rows and their R^2
y_train_xgb = xgb_CV.predict(train)
r2_score_acc = round(r2_score(target_train, y_train_xgb), 2)
print(f'Accuracy of XGBoost Regressor model training is {r2_score_acc}')

# Store the score in the model's row of the result table
is_xgb_row = result['model'] == 'XGBoost Regressor'
result.loc[is_xgb_row, 'train_score'] = r2_score_acc

In [None]:
# Score the tuned XGBoost model on the validation rows (R^2, 2 decimals)
y_val_xgb = xgb_CV.predict(valid)
r2_score_acc_valid = round(r2_score(target_valid, y_val_xgb), 2)
is_xgb_row = result['model'] == 'XGBoost Regressor'
result.loc[is_xgb_row, 'valid_score'] = r2_score_acc_valid
print(f'Accuracy of XGBoost Regressor model prediction for valid dataset is {r2_score_acc_valid}')

In [None]:
# Fit a fresh XGBoost model with the best parameters found by the grid search
# and draw its feature-importance chart
xgbr = xgb.XGBRegressor(**xgb_CV.best_params_)
xgbr.fit(train, target_train)

fig, axes = plt.subplots(figsize=(10, 8))
xgb.plot_importance(xgbr, ax=axes, height=0.5)
plt.show()
plt.close()

In [None]:
%%time
# Random Forest Regressor tuned by grid search.
# A fixed seed makes the fitted forest repeatable between runs.
rf = RandomForestRegressor(random_state=0)
# `criterion` is left at its default (squared error) and max_features=None
# means "use all features": both match the former 'mse' / 'auto' settings,
# whose string values were removed from scikit-learn, and both are accepted
# by old and new releases.
# NOTE(review): with bootstrap=False and all features considered at every
# split, the trees of the forest are presumably near-identical — confirm
# that this is intended.
param_grid = {'n_estimators': [15, 20, 25], 'min_samples_leaf': list(range(4, 7)),
              'max_features': [None], 'max_depth': list(range(3, 6)),
              'bootstrap': [False]}

# Training model
rf_CV = GridSearchCV(rf, param_grid=param_grid, cv=cv_train, verbose=False)
rf_CV.fit(train, target_train)
print(rf_CV.best_params_)

# Prediction for training data
y_train_rf = rf_CV.predict(train)

# R^2 on the training rows, rounded to 2 decimals
r2_score_acc = round(r2_score(target_train, y_train_rf),2)
print(f'Accuracy of RandomForestRegressor model training is {r2_score_acc}')

# Save to result dataframe
result.loc[result['model'] == 'Random Forest Regressor', 'train_score'] = r2_score_acc

In [None]:
# Score the tuned Random Forest on the validation rows (R^2, 2 decimals)
y_val_rf = rf_CV.predict(valid)
r2_score_acc_valid = round(r2_score(target_valid, y_val_rf), 2)
is_rf_row = result['model'] == 'Random Forest Regressor'
result.loc[is_rf_row, 'valid_score'] = r2_score_acc_valid
print(f'Accuracy of RandomForestRegressor model prediction for valid dataset is {r2_score_acc_valid}')

In [None]:
def agg(x1, x2, x3, x4):
    """Combine four prediction vectors into one by taking the element-wise minimum.

    Parameters
    ----------
    x1, x2, x3, x4 : array-like
        Prediction vectors of equal length.

    Returns
    -------
    list
        For every position, the smallest of the four input values.
    """
    # np.minimum is a binary ufunc (a third positional argument is the `out`
    # buffer and a fourth is rejected), so the four inputs are folded with
    # ufunc.reduce along the first axis instead.
    # Alternatives tried earlier: the mean, or the element-wise maximum.
    stacked = [np.asarray(x1), np.asarray(x2), np.asarray(x3), np.asarray(x4)]
    return list(np.minimum.reduce(stacked))

In [None]:
# Ensemble prediction for the training rows: the four models' predictions
# combined by agg()
y_train = agg(y_train_lr, y_train_rf, y_train_xgb, y_train_lgb)

# R^2 of the combined prediction, rounded to 2 decimals
r2_score_acc = round(r2_score(target_train, y_train), 2)
print(f'Accuracy of Average prediction is {r2_score_acc}')

# Store the score in the ensemble's row of the result table
is_ensemble_row = result['model'] == 'Average prediction'
result.loc[is_ensemble_row, 'train_score'] = r2_score_acc

In [None]:
# Ensemble prediction for the validation rows, combined by agg(), with its
# R^2 (2 decimals) stored in the result table
y_val = agg(y_val_lr, y_val_rf, y_val_xgb, y_val_lgb)
r2_score_acc_valid = round(r2_score(target_valid, y_val), 2)
is_ensemble_row = result['model'] == 'Average prediction'
result.loc[is_ensemble_row, 'valid_score'] = r2_score_acc_valid
print(f'Accuracy of Average prediction for valid dataset is {r2_score_acc_valid}')

In [None]:
# Predict the held-out test rows with every fitted model ...
y_test_lr, y_test_rf = lr.predict(test), rf_CV.predict(test)
y_test_xgb, y_test_lgb = xgb_CV.predict(test), gbm.predict(test)
# ... and combine the four prediction vectors with agg()
y_test = agg(y_test_lr, y_test_rf, y_test_xgb, y_test_lgb)

In [None]:
# Scatter the observed target and every model's prediction over the training rows
x = np.arange(len(train))
plt.figure(figsize=(16,10))
plt.scatter(x, target_train, label = "Target training data", color = 'k')
plt.scatter(x, y_train_lr, label = "Linear Regression prediction", color = 'b')
plt.scatter(x, y_train_rf, label = "Random Forest prediction", color = 'y')
plt.scatter(x, y_train_xgb, label = "XGBoost Regressor prediction", color = 'brown')
# The LightGBM predictions are stored in y_train_lgb (y_train_gbm was never defined)
plt.scatter(x, y_train_lgb, label = "LGBM", color = 'pink')
plt.scatter(x, y_train, label = "Average prediction", color = 'g')
# Horizontal reference line at 15
plt.plot(x, np.full(len(train), 15), label = "Maximum allowable value", color = 'r')
plt.title('Prediction for the training data')
plt.legend(loc='best')
plt.grid(True)

In [None]:
# Scatter the observed target and every model's prediction over the validation rows
x = np.arange(len(valid))
plt.figure(figsize=(16,10))
plt.scatter(x, target_valid, label = "Target valid data", color = 'k')
plt.scatter(x, y_val_lr, label = "Linear Regression prediction", color = 'b')
plt.scatter(x, y_val_rf, label = "Random Forest prediction", color = 'y')
plt.scatter(x, y_val_xgb, label = "XGBoost Regressor prediction", color = 'brown')
# The LightGBM predictions are stored in y_val_lgb (y_val_gbm was never defined)
plt.scatter(x, y_val_lgb, label = "LGBM", color = 'pink')
plt.scatter(x, y_val, label = "Average prediction", color = 'g')
# NOTE(review): the reference line is drawn at 0.5 here but at 15 on the
# training-data plot for the same target — confirm which limit is correct.
plt.plot(x, np.full(len(valid), 0.5), label = "Maximum allowable value", color = 'r')
plt.title('Prediction for the valid data')
plt.legend(loc='best')
plt.grid(True)

In [None]:
# Scatter the observed target and every model's prediction over the test rows
x = np.arange(len(test))
plt.figure(figsize=(16,10))
plt.scatter(x, target_test, label = "Target test data", color = 'k')
plt.scatter(x, y_test_lr, label = "Linear Regression prediction", color = 'b')
plt.scatter(x, y_test_rf, label = "Random Forest prediction", color = 'y')
plt.scatter(x, y_test_xgb, label = "XGBoost Regressor prediction", color = 'brown')
# The LightGBM predictions are stored in y_test_lgb (y_test_gbm was never defined)
plt.scatter(x, y_test_lgb, label = "LGBM", color = 'pink')
plt.scatter(x, y_test, label = "Average prediction", color = 'g')
# NOTE(review): the reference line is drawn at 0.5 here but at 15 on the
# training-data plot for the same target — confirm which limit is correct.
plt.plot(x, np.full(len(test), 0.5), label = "Maximum allowable value", color = 'r')
plt.title('Prediction for the test data')
plt.legend(loc='best')
plt.grid(True)

In [None]:

result.sort_values(by=['valid_score', 'train_score'], ascending=False)

In [None]:
# Keep the models whose train and validation scores differ by less than 0.05
# (small gap = little overfitting), best validation score first
score_gap = (result['train_score'] - result['valid_score']).abs()
result_best = result[score_gap < 0.05]
result_best.sort_values(by=['valid_score', 'train_score'], ascending=False)

In [None]:
# Select the best model
result_best.nlargest(1, 'valid_score')

In [None]:
# Name of the model with the highest validation score.
# idxmax() takes no value argument: passing the maximum itself was read as the
# `axis` parameter, which current pandas rejects.
# If no model passed the overfitting filter, fall back to the full score table
# instead of failing on an empty selection.
if result_best.empty:
    print('No model passed the overfitting filter; selecting among all models')
    candidates = result
else:
    candidates = result_best
best_model_name = candidates.loc[candidates['valid_score'].idxmax(), 'model']


In [None]:
print(f'The best model is "{best_model_name}"')