# Data Preparation & Basic Analysis:

In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')
# Core numerical / tabular libraries
import numpy as np
import pandas as pd

# Data Visualisation
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Load the first dataset from a CSV file given by a relative path.
# NOTE(review): the original note mentioned Google Colab / GitHub, but no URL is used here —
# the file must already be present in the working directory.
df1 = pd.read_csv("Mercury_Predictions_1.csv")

In [None]:
# Preview the first rows of the first dataset (head() defaults to 5 rows).
df1.head()

In [None]:

# Load the second dataset from a CSV file given by a relative path.
df2 = pd.read_csv("Mercury_Predictions_2.csv")

In [None]:
# Preview the first rows of the second dataset (head() defaults to 5 rows).
df2.head()

In [None]:
# Column dtypes of the first dataset.
df1.dtypes

In [None]:
# Column dtypes of the second dataset.
df2.dtypes

In [None]:
# Join the two datasets. No key is given, so pandas performs an inner join
# on every column name the two frames have in common.
# NOTE(review): consider passing on=... explicitly once the intended key is confirmed.
df3 = df1.merge(df2)
df3.head()

In [None]:
# (rows, columns) of the merged frame.
df3.shape

In [None]:
# Number of attributes (columns) in the merged frame.
# Counting the column index directly states the intent; set(df3) iterated the
# column labels and would silently collapse any duplicated label.
len(df3.columns)

In [None]:
# Summary statistics of the merged frame (describe() defaults decide which columns are included).
df3.describe()

In [None]:
# Per-column dtype and non-null count of the merged frame.
df3.info()

In [None]:
# Keep only the two columns used by the overview plot.
overview_columns = ['Reservoir', 'Mercury']
data = df3[overview_columns]

In [None]:
# Line plot of the Mercury column. Only y-values are passed, so they are drawn
# against the Series index rather than against the reservoir names.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(data['Mercury'])
ax.set_title('Mercury level among Fish')
ax.set_ylabel('Mercury')
# NOTE(review): the x-values are the row index, not reservoir names — confirm this label.
ax.set_xlabel('Reservoir')
plt.show()

# Data Cleaning & Train Test Split:

In [None]:
# Missing-value count per column before any cleaning.
df3.isna().sum()

In [None]:
# Fill gaps in the listed columns with each column's own median.
median_fill_cols = ['Drainage Area', 'RF', 'FR']
column_medians = df3[median_fill_cols].median()
df3[median_fill_cols] = df3[median_fill_cols].fillna(column_medians)

In [None]:
# Missing-value count per column after the median fill.
df3.isna().sum()

In [None]:
# Drop every row that still contains a missing value in any column.
df3 = df3.dropna()

In [None]:
# Missing-value count per column after dropping incomplete rows.
df3.isna().sum()

In [None]:
# (rows, columns) of the merged frame after cleaning.
df3.shape

In [None]:
# Features: every column except the reservoir identifier and the Mercury target.
feature = df3.drop(columns=['Reservoir', 'Mercury'])

In [None]:
# (rows, columns) of the feature matrix.
feature.shape

In [None]:
# Label: the Mercury column is the regression target.
label = df3['Mercury']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(feature, label, test_size=0.2, random_state=42)
train_x, test_x, train_y, test_y = split_parts

In [None]:
# Shapes of the four pieces produced by the split.
train_x.shape, train_y.shape, test_x.shape, test_y.shape

# Feature Engineering:

In [None]:
import statsmodels.api as sm

# Add an explicit constant column to the training features for the statsmodels fit.
train_x_sm = sm.add_constant(train_x)
train_x_sm.head()

In [None]:
# Ordinary least squares on the training data, used to read off per-feature p-values.
# The design matrix must be train_x_sm (features plus the constant column):
# sm.OLS does not add an intercept on its own, so fitting on plain train_x
# forces the regression through the origin and distorts the coefficients/p-values.
lr = sm.OLS(train_y, train_x_sm).fit()
lr.summary()

Only the columns whose p-value is below 0.5 are kept.

In [None]:
# Preview the first rows of the training features (head() defaults to 5 rows).
train_x.head()

In [None]:
# Preview the first five training targets.
train_y.head(5)

Next, a new dataset is extracted from the previous one, keeping only the attributes whose p-value is below 0.5.

In [None]:
# Target plus the hand-picked feature columns kept for modelling.
selected_columns = ['Mercury', 'Elevation', 'Max Depth', 'FR', 'RS', 'LONGITUDE_SECONDS']
df_main = df3[selected_columns]

In [None]:
# Preview the first rows of the reduced dataset (head() defaults to 5 rows).
df_main.head()

Repeat the train/test split for the extracted dataset, `df_main`.

In [None]:
# Features: every column of the reduced dataset except the Mercury target.
feature = df_main.drop(columns=['Mercury'])

In [None]:
# (rows, columns) of the reduced feature matrix.
feature.shape

In [None]:
# Label: take the target from df_main, the same frame the features come from,
# so features and label are guaranteed to share rows and index.
label = df_main['Mercury']

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as before, now on the reduced feature set.
train_x, test_x, train_y, test_y = train_test_split(
    feature,
    label,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the four pieces produced by the second split.
train_x.shape, train_y.shape, test_x.shape, test_y.shape

In [None]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line displays the fitted estimator.
lr = LinearRegression().fit(train_x, train_y)
lr

In [None]:
# Score of the fitted linear model on the held-out test split
# (for scikit-learn regressors, score() is the R² coefficient).
lr.score(test_x, test_y)

In [None]:
# Predictions of the fitted linear model for the test features.
pred_y = lr.predict(test_x)
pred_y

In [None]:
import math  
import sklearn.metrics
# Mean squared error between the true test targets and the predictions ...
mse = sklearn.metrics.mean_squared_error(test_y, pred_y)  
# ... and its square root, which is in the same units as the target.
rmse = math.sqrt(mse)
rmse

# Visualization, Cross Validation & Model Building:

The most important columns are used for plotting.

In [None]:
# Column labels of the reduced dataset.
df_main.columns

In [None]:
# Pairwise plots across all columns of the reduced dataset.
sns.pairplot(df_main)

In [None]:
# Scatter of Elevation (y) against Mercury (x), plus a hard-coded reference
# line y = 10 + 1050 * x drawn in red.
plt.scatter(df_main['Mercury'], df_main['Elevation'])
plt.plot(df_main['Mercury'], 10 + 1050*df_main['Mercury'], 'r')
plt.title('Mercury level vs Elevation')
# Axis labels follow the plotted data: Mercury is on x, Elevation on y
# (they were previously swapped).
plt.xlabel('Mercury')
plt.ylabel('Elevation')
plt.show()

In [None]:
# Scatter of Max Depth (y) against Mercury (x), plus a hard-coded reference
# line y = 10 + 50 * x drawn in red.
plt.scatter(df_main['Mercury'], df_main['Max Depth'])
plt.plot(df_main['Mercury'], 10 + 50*df_main['Mercury'], 'r')
plt.title('Mercury level vs Max Depth')
# Axis labels follow the plotted data: Mercury is on x, Max Depth on y
# (they were previously swapped).
plt.xlabel('Mercury')
plt.ylabel('Max Depth')
plt.show()

In [None]:
# Scatter of FR (y) against Mercury (x), plus a hard-coded reference
# line y = 1.8 + 1 * x drawn in red.
plt.scatter(df_main['Mercury'], df_main['FR'])
plt.plot(df_main['Mercury'], 1.8 + 1*df_main['Mercury'], 'r')
plt.title('Mercury level vs FR')
# Axis labels follow the plotted data: Mercury is on x, FR on y
# (they were previously swapped).
plt.xlabel('Mercury')
plt.ylabel('FR')
plt.show()

In [None]:
# Scatter of RS (y) against Mercury (x), plus a hard-coded reference
# line y = 0.5 + 0.001 * x drawn in red.
plt.scatter(df_main['Mercury'], df_main['RS'])
plt.plot(df_main['Mercury'], .5 + .001*df_main['Mercury'], 'r')
plt.title('Mercury level vs RS')
# Axis labels follow the plotted data: Mercury is on x, RS on y
# (they were previously swapped).
plt.xlabel('Mercury')
plt.ylabel('RS')
plt.show()

In [None]:
# Scatter of LONGITUDE_SECONDS (y) against Mercury (x), plus a hard-coded
# reference line y = 8 + 45 * x drawn in red.
plt.scatter(df_main['Mercury'], df_main['LONGITUDE_SECONDS'])
plt.plot(df_main['Mercury'], 8 + 45*df_main['Mercury'], 'r')
plt.title('Mercury level vs LONGITUDE_SECONDS')
# Axis labels follow the plotted data: Mercury is on x, LONGITUDE_SECONDS on y
# (they were previously swapped).
plt.xlabel('Mercury')
plt.ylabel('LONGITUDE_SECONDS')
plt.show()

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean, sqrt, absolute
import math  
import sklearn.metrics

# 10-fold cross-validated R² of a plain linear regression on the training split.
lm = LinearRegression()
scores = cross_val_score(lm, train_x, train_y, scoring='r2', cv=10)
# Show the signed scores: a negative R² means the fold did worse than predicting
# the mean, and taking the absolute value would disguise it as a good fit.
scores

In [None]:
# Average the signed per-fold R² values; averaging their absolute values would
# count badly-fitting (negative) folds as if they were good ones.
mean(scores)

In [None]:
from sklearn.metrics import r2_score
# R² of the earlier test-set predictions (pred_y) against the true test targets.
r2_score(test_y, pred_y)

In [None]:
lm = LinearRegression()
# Both runs share the estimator, the training data and the 10-fold setup;
# only the metric differs (R² first, then negated RMSE).
score_1, score_2 = (
    cross_val_score(lm, train_x, train_y, scoring=metric, cv=10)
    for metric in ('r2', 'neg_root_mean_squared_error')
)

In [None]:
# Per-fold R² values from the 10-fold cross-validation.
score_1

In [None]:
# Mean R² across the folds (signed).
np.mean(score_1)

The R² score can be negative because the dataset is small and noisy.

In [None]:
# The scorer reports negated RMSE; take the magnitude to read it as a plain RMSE per fold.
np.abs(score_2)

In [None]:
# Magnitude of the mean negated-RMSE score across the folds.
np.abs(np.mean(score_2))