In [None]:
# Import libraries
import numpy as np # For math operations
import pandas as pd # For the import data
from pandas.plotting import scatter_matrix
import matplotlib as mp
import seaborn as sns
import matplotlib.pyplot as plt
# For the scaling data


from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

In [None]:
data = pd.read_csv('../input/dissolved-oxygen-prediction-in-river-water/train.csv')

In [None]:
data.head(10)

In [None]:
data.info()

# **Data Exploration**

In [None]:
data.isna().sum()

**To make the analysis more accurate, drop every column that has more than 100 missing values.**

In [None]:
# Count missing values per column and collect the names of the columns whose
# count exceeds 100; those columns are then removed from `data`.
missing_per_column = data.isna().sum()
null_columns = missing_per_column[missing_per_column > 100].index.tolist()
data = data.drop(columns=null_columns)

In [None]:
data

In [None]:
data.isna().sum()

In [None]:
# Boolean mask of missing cells, reused for both the column-wise and the
# row-wise count.
missing_mask = data.isna()
print("Columns with missing values:", missing_mask.any(axis=0).sum())
print("Rows with missing values:", missing_mask.any(axis=1).sum())

In [None]:
# Discard every row that still contains at least one missing value.
data = data.dropna(axis=0, how="any")

In [None]:
data.isna().sum().sum()

In [None]:
data

In [None]:
data.describe()

**Because we work only with oxygen concentration, we build a smaller data sample that contains just the columns "target", "O2_1" and "O2_2".**

In [None]:
d1 = data[['target',"O2_1", "O2_2"]]
d1

# **Data Visualization**

In [None]:
d1.plot(kind='box')

In [None]:
scatter_matrix(d1, figsize=(10,7))

In [None]:
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation is its documented replacement. One distribution is drawn per
# column of `d1`.
a1 = sns.histplot(d1, kde=True, stat="density")

**To create a better picture of events, create a line graph**

In [None]:
d1[['O2_1', 'O2_2']].plot(kind="line", figsize=(8, 4))

In [None]:
d1.describe()

The summary statistics show maximum oxygen concentrations of about 46.95 and 40.9 in the `O2_1` and `O2_2` columns. These readings are highly questionable and are probably erroneous. To keep them from distorting the analysis, each one is replaced by the mean value of its column.


In [None]:
# Replace the suspicious maximum reading of each station column with that
# column's mean. The substitution is restricted to the column the maximum was
# taken from: a frame-wide `DataFrame.replace` would also overwrite any equal
# value that happens to occur in another column (including `target`).
o2_1_max, o2_1_mean = d1["O2_1"].max(), d1["O2_1"].mean()
o2_2_max, o2_2_mean = d1["O2_2"].max(), d1["O2_2"].mean()

# `Series.where` keeps values where the condition holds and substitutes the
# mean elsewhere, i.e. exactly at the rows equal to the column maximum.
d2 = d1.assign(O2_1=d1["O2_1"].where(d1["O2_1"] != o2_1_max, o2_1_mean))
d3 = d2.assign(O2_2=d2["O2_2"].where(d2["O2_2"] != o2_2_max, o2_2_mean))

d3[['target','O2_1' ,'O2_2']].plot(kind="line", figsize=(7,4))

In [None]:
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation is its documented replacement. One distribution is drawn per
# column of the cleaned frame `d3`.
a = sns.histplot(d3, kde=True, stat="density")

# **Data Modeling**

In [None]:
# Build the feature matrix and target from the cleaned frame `d3` (maximum
# readings replaced by the column mean) rather than from the raw `d1`:
# the test set receives the same kind of replacement before prediction, so
# training on `d1` would fit the model on data prepared differently from the
# data it is later asked to predict on.
# NOTE(review): scores quoted in the markdown were presumably produced with
# `d1`; re-run the notebook to refresh them.
X = d3.drop('target', axis=1)
y = d3[['target']]

In [None]:
# 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=35)

# Fit the scaler on the training split ONLY and reuse it for the test split.
# Fitting a second scaler on the test data would standardise the two splits
# with different means/variances and leak test statistics into evaluation.
scaler = preprocessing.StandardScaler().fit(x_train)
X_train = scaler.transform(x_train)
X_test = scaler.transform(x_test)

## change to 1d array
y_train = np.array(y_train).ravel()
y_test = np.array(y_test).ravel()

**Regression**

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

In [None]:
# 70/30 split used by the regression cells that read X_train / X_test /
# y_train / y_test. The seed makes the reported scores reproducible across
# kernel restarts.
# NOTE: this rebinds X_train, X_test, y_train and y_test, so any earlier
# values held under those names are replaced by this unscaled split.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=35)

In [None]:
def model_result(m_odel):
    """Fit a regressor on the training split and print its test R2 and RMSE.

    The function reads the notebook-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``; they must be defined before it is called.

    Parameters
    ----------
    m_odel : estimator
        An object exposing ``fit``, ``score`` and ``predict`` (for example
        ``Ridge`` or ``Lasso``). It is fitted in place.

    Returns
    -------
    None
        The two metrics are printed, not returned.
    """
    m_odel.fit(X_train, y_train)
    y_pred = m_odel.predict(X_test)
    print('R2 model score:  ', m_odel.score(X_test, y_test))
    # mean_squared_error expects (y_true, y_pred) in that order.
    print('RMSE    :  ', np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
#testing ridge regression
from sklearn.linear_model import Ridge
model_result(m_odel=Ridge(alpha = 1, random_state = 42))

In [None]:
#testing Lasso regression
from sklearn.linear_model import Lasso
model_result(m_odel=Lasso(alpha = 1, random_state = 42))

In [None]:
#polynomial reg
from sklearn.preprocessing import PolynomialFeatures

# Expand the feature columns of X into all polynomial terms up to degree 2
# (no constant/bias column).
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)

# 80/20 split of the expanded features with a fixed seed.
Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=35
)

# Flatten the single-column targets into 1-D arrays.
yp_train = np.array(yp_train).ravel()
yp_test = np.array(yp_test).ravel()
def model_resultp(m_odelp):
    """Fit a regressor on the polynomial training split and print R2 and RMSE.

    The function reads the notebook-level variables ``Xp_train``,
    ``yp_train``, ``Xp_test`` and ``yp_test``; they must be defined before it
    is called.

    Parameters
    ----------
    m_odelp : estimator
        An object exposing ``fit``, ``score`` and ``predict``. It is fitted
        in place.

    Returns
    -------
    None
        The two metrics are printed, not returned.
    """
    # Local name deliberately not `mp`, which is the notebook's alias for
    # the matplotlib package.
    estimator = m_odelp
    estimator.fit(Xp_train, yp_train)
    y_pred = estimator.predict(Xp_test)
    print('R2 model score =',estimator.score(Xp_test, yp_test))
    # mean_squared_error expects (y_true, y_pred) in that order.
    print('RMSE =',np.sqrt(mean_squared_error(yp_test, y_pred)))
model_resultp(m_odelp=linear_model.LinearRegression(fit_intercept=True))

**Vector Model**

In [None]:
##support vector regressor

from sklearn.svm import LinearSVR

# NOTE(review): epsilon=2.5543 is an unexplained constant — presumably tuned
# by hand; confirm and document its origin.
svm_reg = LinearSVR(epsilon=2.5543, random_state=42)

# The estimator expects a 1-D target. `np.ravel` flattens a one-column
# frame/array and leaves an already 1-D target unchanged, which avoids the
# column-vector conversion warning.
svm_reg.fit(X_train, np.ravel(y_train))
svm_reg.score(X_test, np.ravel(y_test))

In [None]:

from sklearn.preprocessing import PolynomialFeatures

# Rebuild the degree-2 polynomial features and the seeded 80/20 split.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)
Xp_train, Xp_test, yp_train, yp_test = train_test_split(X_poly, y, test_size=0.2, random_state=35)

# The `normalize` argument was removed from Lasso in scikit-learn 1.2 and
# raises TypeError there; omitting it keeps the old `normalize=False`
# behaviour on every version.
# NOTE(review): alpha presumably comes from the grid search in the
# cross-validation cell — confirm.
my_model = Lasso(alpha=0.25455422642263903, copy_X=True, fit_intercept=True, max_iter=1000,
      positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

my_model.fit(Xp_train, yp_train)
my_model.score(Xp_test, yp_test)

In [None]:
#cross validation on polynomial features

modelsT = [Ridge, Lasso]
model_names = ['ridge', 'lasso']

# The search grid is identical for both estimators, so it is built once
# outside the loop: 2500 log-spaced alpha values and a fixed iteration cap.
param_grid = {'alpha' : np.logspace(-1,0.009,2500),
              'max_iter' : [1000]}

for model_name, model_class in zip(model_names, modelsT):
    print(model_name)

    # 5-fold grid search scored by negative MSE, using all available cores.
    model_cv = GridSearchCV(estimator  = model_class(),
                            param_grid = param_grid,
                            cv = 5,
                            scoring='neg_mean_squared_error',
                            n_jobs = -1,
                            verbose = 1)
    model_cv.fit(Xp_train, yp_train)

    # With the default refit=True, best_estimator_ has already been refitted
    # on the full training split, so no further fit call is needed.
    best_model = model_cv.best_estimator_
    print(best_model)
    print('R2 score: ', best_model.score(Xp_test, yp_test))

    y_pred = best_model.predict(Xp_test)
    # mean_squared_error expects (y_true, y_pred) in that order.
    rmse   = np.sqrt(mean_squared_error(yp_test, y_pred))
    print('Test RMSE : ', rmse)
    print("**********************************")

The best-performing model turned out to be Lasso regression with alpha ≈ 0.254. It reached an R² score of 0.7269 on the held-out test split (R² is the coefficient of determination, not a classification accuracy).

In [None]:
# Final model: Lasso with the chosen alpha, fitted on the polynomial
# training split.
# The `normalize` argument was removed from Lasso in scikit-learn 1.2 and
# raises TypeError there; omitting it keeps the old `normalize=False`
# behaviour on every version.
my_model = Lasso(alpha=0.25455422642263903, copy_X=True, fit_intercept=True, max_iter=1000,
      positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

my_model.fit(Xp_train, yp_train)
my_model.score(Xp_test, yp_test)

In [None]:
my_model.predict(Xp_test)

# **Predicting the target station from test data**

In [None]:
Test = pd.read_csv("../input/dissolved-oxygen-prediction-in-river-water/test.csv")
Test

In [None]:
Test.describe()

In [None]:
# Replace the largest O2_2 reading of the test set with the O2_2 mean.
# The substitution is restricted to the O2_2 column: a frame-wide
# `DataFrame.replace` would also overwrite any equal value that happens to
# occur in another column.
# NOTE(review): this assumes the test-set maximum is an erroneous reading —
# confirm against Test.describe() before relying on it.
o2_2_max = Test["O2_2"].max()
o2_2_mean = Test["O2_2"].mean()
Test = Test.assign(O2_2=Test["O2_2"].where(Test["O2_2"] != o2_2_max, o2_2_mean))

# Keep only the two station columns the model was trained on.
Test_X = Test[["O2_1", "O2_2"]]
Test_X

In [None]:
# Expand the two test-set station columns with the same settings used for
# the training features (degree 2, no bias column).
poly_features_X = PolynomialFeatures(degree=2, include_bias=False)
poly_features_X.fit(Test_X)

Tes_X_poly = poly_features_X.transform(Test_X)


In [None]:
my_model.predict(Tes_X_poly)

**Conclusion**

The provided data set contained a large number of gaps, so the model was built on only two features: the oxygen readings from stations 1 and 2 (`O2_1` and `O2_2`). The best result obtained in this analysis was an R² score of about 0.72.
