In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file beneath the Kaggle input directory,
# then print the paths one per line in os.walk order.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the two CSV tables, relative to the notebook's working directory.
bottle_csv_path = "../input/calcofi/bottle.csv"
cast_csv_path = "../input/calcofi/cast.csv"

# Read each table into its own DataFrame.
bottle_df = pd.read_csv(bottle_csv_path)
cast_df = pd.read_csv(cast_csv_path)

In [None]:
# Print the column names, dtypes and non-null counts of the bottle table.
bottle_df.info()

In [None]:
# Display the last five rows of the bottle table.
bottle_df.tail(5)

In [None]:
# Print the column names, dtypes and non-null counts of the cast table.
cast_df.info()

In [None]:
# Display the last five rows of the cast table.
cast_df.tail(5)

My task is to predict the temperature of the water. Everything needed for that is available in bottle_df, so we will ignore cast_df for the rest of the notebook.


In [None]:
# Display pandas' summary statistics for the bottle table.
bottle_df.describe()

To predict the temperature, we will use only two features: depth and salinity.

In [None]:
# Keep only the two predictors and the target. The copy makes the working
# frame independent of bottle_df, so later changes never touch the raw table.
selected_columns = ['Salnty', 'Depthm', 'T_degC']
bottle_df_cust = bottle_df[selected_columns].copy()

In [None]:
# Display the first rows of the reduced working frame.
bottle_df_cust.head()

In [None]:
# Count the missing values in each of the selected columns.
bottle_df_cust.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Rows without a measured temperature cannot serve as labels. Filling the
# target with its mean would fabricate labels that are then used for both
# training and scoring, so those rows are dropped instead of imputed.
labelled_rows = bottle_df_cust.dropna(subset=['T_degC'])

# Only the predictors are imputed (SimpleImputer's default strategy is the
# column mean).
si = SimpleImputer()
imputed_features = si.fit_transform(labelled_rows[['Salnty','Depthm']])

# Rebuild the working frame with the original column order and a fresh
# 0..n-1 index, as the previous DataFrame construction produced.
bottle_df_cust = pd.DataFrame(imputed_features,columns = ['Salnty','Depthm'])
bottle_df_cust['T_degC'] = labelled_rows['T_degC'].to_numpy()

In [None]:
# Re-count missing values per column to confirm none remain in the working frame.
bottle_df_cust.isnull().sum()

In [None]:
import seaborn as sns

# One scatter panel per predictor, each plotted against the temperature target.
predictor_names = ['Salnty', 'Depthm']
sns.pairplot(
    data=bottle_df_cust,
    x_vars=predictor_names,
    y_vars=['T_degC'],
)
plt.show()

The Depth vs Temperature graph shows that the temperature decreases as the depth increases. In the Salinity vs Temperature graph, by contrast, there is no clear trend in the data.

In [None]:
# Single scatter panel relating the two predictors: salinity on x, depth on y.
sns.pairplot(
    data=bottle_df_cust,
    x_vars=['Salnty'],
    y_vars=['Depthm'],
)
plt.show()

We can see that, for the majority of the data points, the salinity of the water is close to 35 irrespective of the depth at which the sample was taken.

In [None]:
# Display summary statistics of the working frame.
bottle_df_cust.describe()

In [None]:
# Side-by-side box plots of the two predictors on one wide figure:
# salinity in the left panel, depth in the right panel.
fig, (salinity_ax, depth_ax) = plt.subplots(nrows=1, ncols=2, figsize=(24, 6))
sns.boxplot(data=bottle_df_cust, y='Salnty', ax=salinity_ax)
sns.boxplot(data=bottle_df_cust, y='Depthm', ax=depth_ax)
plt.show()

Building the Regression Models

In [None]:
# Separate the working frame into the predictor matrix X and the target
# vector Y. Both are copies, so they are independent of bottle_df_cust.
feature_names = ['Salnty', 'Depthm']
X = bottle_df_cust.loc[:, feature_names].copy()
Y = bottle_df_cust.loc[:, 'T_degC'].copy()

print(X.shape,Y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=39)
X_train, X_test, y_train, y_test = split_parts

# Report the shapes of the training pair, then of the test pair.
for features_part, target_part in ((X_train, y_train), (X_test, y_test)):
    print(features_part.shape, target_part.shape)

In [None]:
from sklearn import linear_model
# Ordinary least-squares model fitted on the two raw training features.
lm = linear_model.LinearRegression()
# Left as the cell's last expression so the fitted estimator is displayed.
lm.fit(X_train,y_train)

In [None]:
# Report the parameters currently stored on the fitted model `lm`.
fitted_parameters = (
    ("Coefficients of the model :", lm.coef_),
    ("Intercept of the model :", lm.intercept_),
)
for parameter_label, parameter_value in fitted_parameters:
    print(parameter_label, parameter_value)

In [None]:
# Predict the target for the held-out rows using the fitted model `lm`.
y_pred = lm.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# Hold-out error metrics for the predictions currently stored in y_pred.
residuals = y_pred - y_test
print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
# The mean of the squared residuals is the mean squared error, not a
# residual *sum* of squares, so the label names it accordingly.
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))
# r2_score takes (y_true, y_pred). R^2 is not symmetric in its arguments,
# so the ground truth must come first.
print("R2-score: %.2f" % r2_score(y_test, y_pred))

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion: fit the transformer on the training
# features, then expand those same features.
poly_reg = PolynomialFeatures(degree=2)
poly_reg.fit(X_train)
X_train_poly = poly_reg.transform(X_train)

In [None]:
# Refit the same `lm` object on the polynomial training features.
# NOTE: this overwrites the parameters learned from the raw features, so from
# here on `lm` expects polynomial-expanded inputs.
lm.fit(X_train_poly,y_train)

In [None]:
# Report the parameters currently stored on `lm` (at this point, the fit on
# the polynomial features).
fitted_parameters = (
    ("Coefficients of the model :", lm.coef_),
    ("Intercept of the model :", lm.intercept_),
)
for parameter_label, parameter_value in fitted_parameters:
    print(parameter_label, parameter_value)

In [None]:
# Expand the test features with the transformer that was already fitted on the
# training split. Held-out data should only be transformed, never used to
# (re)fit a preprocessing step.
X_test_poly = poly_reg.transform(X_test)
# `lm` was last fitted on the polynomial training features, so it is given the
# expanded test matrix.
y_pred = lm.predict(X_test_poly)

In [None]:
from sklearn.metrics import r2_score

# Hold-out error metrics for the predictions currently stored in y_pred.
residuals = y_pred - y_test
print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
# The mean of the squared residuals is the mean squared error, not a
# residual *sum* of squares, so the label names it accordingly.
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))
# r2_score takes (y_true, y_pred). R^2 is not symmetric in its arguments,
# so the ground truth must come first.
print("R2-score: %.2f" % r2_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic; without a fixed seed the scores change on
# every run. Use the same seed (39) as the train/test split so the whole
# notebook is reproducible.
rf = RandomForestRegressor(n_estimators = 50, random_state = 39)
# Left as the cell's last expression so the fitted estimator is displayed.
rf.fit(X_train,y_train)

In [None]:
# Predict the target for the held-out rows using the fitted forest `rf`.
y_pred = rf.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# Hold-out error metrics for the predictions currently stored in y_pred.
residuals = y_pred - y_test
print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
# The mean of the squared residuals is the mean squared error, not a
# residual *sum* of squares, so the label names it accordingly.
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))
# r2_score takes (y_true, y_pred). R^2 is not symmetric in its arguments,
# so the ground truth must come first.
print("R2-score: %.2f" % r2_score(y_test, y_pred))

In [None]:
# Refit the same forest on the polynomial training features and predict the
# expanded test features in one chain; fit() returns the estimator itself.
y_pred = rf.fit(X_train_poly, y_train).predict(X_test_poly)

In [None]:
from sklearn.metrics import r2_score

# Hold-out error metrics for the predictions currently stored in y_pred.
residuals = y_pred - y_test
print("Mean absolute error: %.2f" % np.mean(np.absolute(residuals)))
# The mean of the squared residuals is the mean squared error, not a
# residual *sum* of squares, so the label names it accordingly.
print("Mean squared error (MSE): %.2f" % np.mean(residuals ** 2))
# r2_score takes (y_true, y_pred). R^2 is not symmetric in its arguments,
# so the ground truth must come first.
print("R2-score: %.2f" % r2_score(y_test, y_pred))