# Predicting the temperature of water based on its salinity

In [None]:
import pandas as pd #for handling data and dataframe
import seaborn as sns #for plotting different curves and graphs
import numpy as np #for algebraic manipulation of the datasets
import matplotlib.pyplot as plt

## Loading the data set


In [None]:
# Load the CalCOFI bottle data. low_memory=False makes pandas read the file
# in a single pass rather than in internal chunks, which avoids mixed-type
# column inference. The original author reports 864863 rows in total;
# df.count() gives the per-column non-null counts.
bottle_csv = "../input/calcofi/bottle.csv"
df = pd.read_csv(bottle_csv, low_memory=False)

# Preview the first five rows.
df.head()


### selecting temperature(T_degC) and salinity(Salnty) columns to evaluate further

In [None]:
# Keep only the two columns used below: salinity (predictor) and
# temperature (target), in that order.
selected_columns = ['Salnty', 'T_degC']
df = df[selected_columns]
df


## Plotting a scatterplot to visualize the data

In [None]:
# Scatter temperature against salinity straight from the DataFrame columns.
ax = sns.scatterplot(data=df, x='Salnty', y='T_degC')
ax.set(title='temperature versus salinity graph')

## Dealing with the missing data

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fill missing values column by column with SimpleImputer.
# NOTE(review): the strategy below is "most_frequent" (each NaN is replaced by
# the most common value of its column), not the mean as this comment used to
# say. Confirm which was intended; "mean" is the usual choice for continuous
# measurements such as salinity and temperature.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = "most_frequent")
imputer.fit(df)
# transform() returns a NumPy array by default, so from here on `df` is a 2-D
# array (column 0 = Salnty, column 1 = T_degC) and no longer a DataFrame.
df = imputer.transform(df)
df

## Creating the independent (X) and dependent (Y) variables


In [None]:
# Feature matrix: the first column, sliced (not indexed) so it stays 2-D
# with shape (n_samples, 1).
X = df[:, 0:1]
# Target vector: the second column as a 1-D array.
Y = df[:, 1]
print(X)
print(Y)

## Splitting the dataset into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing (test_size=0.25); the fixed
# random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Show the four pieces of the split in the order they were produced.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part)

## Using the training part of the data to train the simple regression model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares line on the training split; fit() returns the
# estimator itself, so construction and training can be chained.
reg = LinearRegression().fit(X_train, y_train)
reg


# Predicting the test results 

In [None]:
# Predicted temperatures for the held-out salinity values.
y_pred = reg.predict(X=X_test)
y_pred

# Visualizing the trained data 

In [None]:
# Plot the training data together with the fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 made them keyword-only, so the
# earlier positional call raises TypeError there (and warned on 0.11).
salinity_train = X_train.reshape(-1)  # flatten (n, 1) -> (n,) for plotting

# Observed training points: temperature vs salinity.
ax = sns.scatterplot(x=salinity_train, y=y_train)
ax.set(xlabel="Salinity", ylabel='Temperature', title='TEMPERATURE VS SALINITY  ')

# Model predictions for the same salinity values, drawn on the same axes;
# since they come from the linear model, these points lie on the regression line.
sns.scatterplot(x=salinity_train, y=reg.predict(X_train), ax=ax)


## Efficiency of the model

In [None]:
# Report the fitted line's parameters: slope(s) first, then the intercept.
fitted_params = (
    ('Coefficients:', reg.coef_),
    ('Intercept:', reg.intercept_),
)
for label, value in fitted_params:
    print(label, value)

In [None]:
# score() returns the coefficient of determination (R-squared) on the test
# split; 1 means perfect prediction.
r_squared = reg.score(X_test, y_test)
print('Variance score: {}'.format(r_squared))