In [None]:
# Loading of libraries 
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the training CSV of the dissolved-oxygen data set into `d`.
d = pd.read_csv("../input/dissolved-oxygen-prediction-in-river-water/train.csv")
# Preview the first five rows.
d.head(5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of `d`.
d.describe()

## Data Exploration

In [None]:
# Number of missing values in each column of the raw data.
d.isna().sum()

###### There are high rates of null values from stations 3 - 7 within the entire data set.

In [None]:
# Total number of rows and columns, as a (rows, columns) tuple.
d.shape

##### For predicting O2 levels at the target station, we will use the O2 readings from stations 1 and 2, which have significantly less missing data, compared with over 50% missing in the other columns.

In [None]:
# Keep only the target and the two station columns used for modelling.
d1 = d.loc[:, ["target", "O2_1", "O2_2"]]
d1

In [None]:
# Missing values remaining in the three selected columns.
d1.isna().sum()

In [None]:
# Drop every row that has a missing value in any of the selected columns.
d2 = d1.dropna(axis=0, how="any")
d2

In [None]:
# Sanity check: count the missing values left in each column after dropna().
d2.isna().sum()

## Data Visualization

In [None]:
# Pairwise scatter plots (with per-column histograms on the diagonal)
# to get a first look at how the three columns are distributed.
scatter_matrix(frame=d2, figsize=(10, 5))

In [None]:
# Per-column histograms with a KDE overlay.
# sns.distplot is deprecated, and when handed a DataFrame it flattens all
# columns into one pooled distribution; histplot keeps one distribution per
# column. stat="density" with common_norm=False normalises each column on its own.
a1 = sns.histplot(data=d2, kde=True, stat="density", common_norm=False)

In [None]:
# Box plot of each column to make the extreme readings visible.
d2.plot.box()


###### A few single, widely dispersed outliers can be observed in the scatter plots and are further confirmed by the histogram.
###### In order to get a better picture, a line graph will now be created.

In [None]:
# Line plot of all three columns against the row index.
d2.plot.line(figsize=(8, 4))

###### There seems to be an abnormal spike in O2 levels at two of the stations. Let's have a closer look.

In [None]:
# Same line plot restricted to the two station columns.
d2.loc[:, ['O2_1', 'O2_2']].plot.line(figsize=(8, 4))

In [None]:
# Summary statistics of the NaN-free subset; the max row shows the extreme station readings.
d2.describe()

###### Concentrations of approximately 46.95 and 40.9 are observed at these stations, which is clearly abnormal. From further [investigations](https://www.fondriest.com/environmental-measurements/parameters/water-quality/dissolved-oxygen/#:~:text=As%20oxygen%20in%20the%20atmosphere,100%25%20air%20saturation%20at%20equilibrium.), such high levels of O2 in river water, especially in this data set, might be an error.
###### These values will be removed and replaced with the mean of the respective column.

In [None]:
# Replace the largest reading of each station column (treated as an outlier)
# with the mean of that same column.
# Series.replace is applied column by column on purpose: DataFrame.replace
# would rewrite the same numeric value wherever it occurs, including in
# `target` or in the other station's column.
d3 = d2.assign(O2_1=d2["O2_1"].replace(d2["O2_1"].max(), d2["O2_1"].mean()))
d4 = d3.assign(O2_2=d3["O2_2"].replace(d2["O2_2"].max(), d2["O2_2"].mean()))

# Re-plot to confirm the spikes are gone.
d4[['target','O2_1' ,'O2_2']].plot(kind="line", figsize=(7,4))

In [None]:
# Per-column histograms with a KDE overlay for the cleaned data.
# sns.distplot is deprecated, and when handed a DataFrame it flattens all
# columns into one pooled distribution; histplot keeps one distribution per
# column. stat="density" with common_norm=False normalises each column on its own.
a = sns.histplot(data=d4, kde=True, stat="density", common_norm=False)

###### With the outliers removed, we can move on to the next step.

### Data Modeling

In [None]:
# Creating the target and feature sets

from sklearn.model_selection import train_test_split
from sklearn import metrics

# Build the modelling data from d4 (station outliers replaced), not d2:
# using d2 here would silently discard the outlier cleaning done above and
# train every model on the raw spikes.
X = d4.drop('target', axis=1)   # features: O2_1 and O2_2
y = d4[['target']]              # target kept as a one-column DataFrame


In [None]:
# Creation of the train test split sets as well as standardisation of the data.

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=35)

# Fit the scaler on the training split only and reuse it for the test split.
# Fitting a second scaler on x_test would standardise the two splits with
# different means/variances, so the model would be evaluated on features
# that are not on the scale it was trained on.
scaler = preprocessing.StandardScaler().fit(x_train)
X_train = scaler.transform(x_train)
X_test = scaler.transform(x_test)

# The estimators expect 1-D targets, so flatten the one-column frames.
y_train = np.array(y_train).ravel()
y_test = np.array(y_test).ravel()


### Testing various regression models

In [None]:
# Baseline: ordinary least-squares linear regression on the scaled features

from sklearn.metrics import mean_squared_error
from sklearn import linear_model

reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train, y_train)

# Hold-out R2 first, then the root-mean-squared error of the predictions.
r2_linear = reg.score(X_test, y_test)
print('R2 model score:  ', r2_linear)
rmse_linear = np.sqrt(mean_squared_error(reg.predict(X_test), y_test))
print('RMSE    :  ', rmse_linear)

In [None]:
# Helper used to evaluate several regression models the same way

def model_result(m_odel):
    """Fit ``m_odel`` on the training split and print its hold-out R2 and RMSE.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; the estimator passed in is fitted in place.
    """
    m_odel.fit(X_train, y_train)
    r2 = m_odel.score(X_test, y_test)
    print('R2 model score:  ', r2)
    rmse = np.sqrt(mean_squared_error(m_odel.predict(X_test), y_test))
    print('RMSE    :  ', rmse)
    

In [None]:
# Ridge regression with alpha = 1
from sklearn.linear_model import Ridge

model_result(Ridge(random_state=42, alpha=1))

In [None]:
# Lasso regression with alpha = 1
from sklearn.linear_model import Lasso

model_result(Lasso(random_state=42, alpha=1))

In [None]:
# Huber regression with its default settings
from sklearn.linear_model import HuberRegressor

model_result(HuberRegressor())

In [None]:
# Elastic net regression with alpha = 1
from sklearn.linear_model import ElasticNet

model_result(ElasticNet(random_state=42, alpha=1))

##### From our observations so far, the Ridge regression model had the highest accuracy (R² score on the test split) at **67.38%**

## Testing a support vector model

In [None]:
# Linear support vector regressor

from sklearn.svm import LinearSVR

# fit() returns the estimator itself, so construction and fitting can be chained.
svm_reg = LinearSVR(random_state=42, epsilon=2.5543).fit(X_train, y_train)
# R2 on the hold-out split.
svm_reg.score(X_test, y_test)

###### The accuracy is not as high as with the linear model

## Testing a Random forest model

In [None]:
# Random forest regressor limited to depth-2 trees

from sklearn.ensemble import RandomForestRegressor

ftree = RandomForestRegressor(random_state=42, max_depth=2)
ftree.fit(X_train, y_train)

In [None]:
# Hold-out evaluation of the random forest: RMSE and R2.
pre = ftree.predict(X_test)
f = mean_squared_error(y_test, pre)
Fs = np.sqrt(f)
forest_r2 = ftree.score(X_test, y_test)

print("RMSE",Fs)
print("Accuracy", forest_r2)

###### We also note that the random forest performs even more poorly.

### In this step we try using polynomial features to see the effect on the accuracy. This involves:

##### - importing polynomial features
##### - transforming features
##### - creating new train test splits

In [None]:

# Polynomial regression set-up: expand the features, then split again

from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion without the constant (bias) column.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit(X).transform(X)

# Same test_size and random_state as the split used for the linear features.
Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=35
)

# Flatten the one-column targets to 1-D arrays.
yp_train = np.array(yp_train).ravel()
yp_test = np.array(yp_test).ravel()


In [None]:
# Helper used to evaluate models on the polynomial features

def model_resultp(m_odelp):
    """Fit ``m_odelp`` on the polynomial training split and print its hold-out R2 and RMSE.

    Reads the notebook-level ``Xp_train``, ``yp_train``, ``Xp_test`` and
    ``yp_test``; the estimator passed in is fitted in place.
    """
    m_odelp.fit(Xp_train, yp_train)
    r2 = m_odelp.score(Xp_test, yp_test)
    print('R2 model score =',r2)
    rmse = np.sqrt(mean_squared_error(m_odelp.predict(Xp_test), yp_test))
    print('RMSE =',rmse)
  

In [None]:
# Linear regression on the polynomial features

model_resultp(linear_model.LinearRegression(fit_intercept=True))

In [None]:
# Ridge regression (alpha = 1) on the polynomial features

model_resultp(Ridge(random_state=42, alpha=1))

In [None]:
# Lasso regression (alpha = 1) on the polynomial features

model_resultp(Lasso(random_state=42, alpha=1))

In [None]:
# Huber regression (default settings) on the polynomial features

model_resultp(HuberRegressor())

In [None]:
# Elastic net regression (alpha = 1) on the polynomial features

model_resultp(ElasticNet(random_state=42, alpha=1))

##### There is a huge improvement in model accuracy when using polynomial features. Results all range between **71** and **72%!**

## Model Fine Tuning.

##### Now that we have isolated the best performing models, we will tweak the parameters in order to get the highest accuracy.

##### First we will tune the hyper parameters for the **ridge regression model with linear features**.

In [None]:
# Tuning for ridge regression (linear features)

# Exhaustive grid: 1000 alphas x 2 intercept settings x 6 solvers.
param_grid2 = [
    {
        'alpha': np.logspace(-1, 0.00001, 1000),
        'max_iter': [1000],
        "fit_intercept": [True, False],
        "solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    }
]
model2 = Ridge(random_state=42)
# n_jobs=-1 runs the fits in parallel, matching the polynomial-feature search;
# it changes only the run time, not the selected model.
grid_search2 = GridSearchCV(
    model2,
    param_grid2,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)
grid_search2.fit(X_train, y_train)

# best_estimator_ is already refitted on the whole training split (refit=True default).
RRg = grid_search2.best_estimator_
FL1 = RRg.predict(X_test)

rmse3 = np.sqrt(mean_squared_error(y_test, FL1))

# The value is a root-mean-squared error, so label it RMSE (was "RSE").
print("RMSE",rmse3)
ridge_score = RRg.score(X_test, y_test)
print("accuracy score", ridge_score)

##### After tuning, the ridge regression model improved slightly, from **67.38%** to **67.40%**.

##### Now cross-validation will be used to find the best model with **polynomial features**.

In [None]:
# Grid search with 5-fold cross validation on the polynomial features

modelsT = [Ridge, Lasso, ElasticNet, HuberRegressor]
model_names = ['ridge', 'lasso', 'elasticnet', 'huber']

# The search space is identical for every estimator, so build it once
# instead of regenerating the 2500-point alpha grid on each iteration.
param_grid = {'alpha' : np.logspace(-1,0.009,2500),
              'max_iter' : [1000]}

for model_name, model_cls in zip(model_names, modelsT):
    print(model_name)

    lin_model = model_cls()
    model_cv = GridSearchCV(estimator  = lin_model,
                            param_grid = [param_grid],
                            cv = 5,
                            scoring='neg_mean_squared_error',
                            n_jobs = -1,
                            verbose = 1)
    model_cv.fit(Xp_train, yp_train)

    # With the default refit=True, best_estimator_ has already been refitted
    # on all of Xp_train, so no second fit is needed before scoring.
    best_model = model_cv.best_estimator_
    print(best_model)
    print('R2 score: ', best_model.score(Xp_test, yp_test))

    y_pred = best_model.predict(Xp_test)
    rmse   = np.sqrt(mean_squared_error(yp_test, y_pred))
    print('Test RMSE : ', rmse)
    print("**********************************")

#### The best performing model turned out to be the Lasso regression with alpha = 0.254. An accuracy of **72.69%** was achieved!

In [None]:
# Final model: Lasso with the alpha selected by the grid search.
# Only the non-default alpha (and max_iter, as used in the search) are passed.
# The `normalize` argument is intentionally not passed: it was removed from
# Lasso in scikit-learn 1.2 and passing it raises a TypeError there.
my_model = Lasso(alpha=0.25455422642263903, max_iter=1000)

my_model.fit(Xp_train, yp_train)
# R2 on the hold-out polynomial split.
my_model.score(Xp_test, yp_test)

In [None]:
# Predicting the target O2 concentrations for the hold-out rows (Xp_test) split off the training data.
my_model.predict(Xp_test)

## Predicting the target station from test data

In [None]:
# Load the test CSV from the same data set into `Test`.
Test = pd.read_csv("../input/dissolved-oxygen-prediction-in-river-water/test.csv")
# Display the loaded frame.
Test

In [None]:
# Summary statistics of the test data, used to look for extreme readings.
Test.describe()

In [None]:
# We notice there is an outlier in O2_2 measurements so we replace it with the mean.
# The replacement is restricted to the O2_2 column: DataFrame.replace would
# rewrite the same numeric value in every column where it happens to occur.
# NOTE: this cell rebinds `Test`, so re-running it replaces the next-largest
# O2_2 value as well; re-run the loading cell first if you need to repeat it.
Test = Test.assign(O2_2=Test["O2_2"].replace(Test["O2_2"].max(), Test["O2_2"].mean()))

###### Creating our data set with O2 measures

In [None]:
# Feature frame for prediction: the two station columns the model was trained on.
Test_X = Test.loc[:, ["O2_1", "O2_2"]]
Test_X

In [None]:
# Creating and fitting polynomial features for the test set
# (same degree and bias settings as the expansion used for training).

poly_features_X = PolynomialFeatures(include_bias=False, degree=2)
poly_features_X.fit(Test_X)

Tes_X_poly = poly_features_X.transform(Test_X)

##### Test set O2 predictions

In [None]:
# Here we predict the target station O2 concentration for the test set,
# using the polynomial expansion of its O2_1 and O2_2 columns.

my_model.predict(Tes_X_poly)

### Conclusion


##### There were high amounts of null values in the data set and, as a result, the model had to be built on 2 feature sets - stations 1 and 2. Furthermore, there were outlier values not only in the O2 data but within the entire data set. The final, best accuracy score which I was able to get, given these constraints, was 72%.