In [None]:
# This notebook explores the NY Watersheds dataset, which contains water quality values sampled at many locations.
# The goal of the analysis is to show the health of surface water bodies in NY.
# The data includes 4-hour turbidity values (six per day), the daily average turbidity,
# and the fecal coliform level of each sample, along with the date and location.

# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load.

# First, import the necessary Python libraries and open the data.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool

import os
# Load the watershed water-quality CSV from the relative input directory.
wqdata = pd.read_csv('../input/watershed-water-quality-data.csv')
# Show the column labels of the loaded frame.
wqdata.columns

In [None]:
# Descriptive statistics (count, mean, spread, range) of the numeric columns only;
# describe() leaves out non-numeric columns such as the date and location.
wqdata.describe()

In [None]:
# Peek at the first rows with .head() (use .tail() for the last rows).
wqdata.head()

In [None]:
# .info() lists each column's name, dtype and non-null count,
# so columns with missing data can be spotted from their counts.
wqdata.info()

In [None]:
# Correlation matrix of the numeric attributes, to see whether there is a direct
# relationship between the turbidity and fecal coliform (FC) levels.
# numeric_only=True skips text columns (date, site, ...); without it pandas >= 2.0
# raises on non-numeric data instead of silently dropping those columns.

wqdata.corr(numeric_only=True)

# The 4-hour turbidity samples show high correlation with the previous and next samples.
# All 4-hour turbidity samples are also highly correlated with the daily turbidity.

In [None]:
# Build a colour-coded correlation matrix with seaborn's heatmap function.
# numeric_only=True keeps text columns out of the correlation; pandas >= 2.0 raises
# on them otherwise.

f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(wqdata.corr(numeric_only=True), annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()

# Here we have 4-hour-turbidity (interval) samples and the daily turbidity average.
# The 4PM sample appears to have the least effect on the daily turbidity average
# and the other interval samples, since it has the lowest correlation with the others.

In [None]:
# Column labels, for reference when naming the plot axes in the next cell.
wqdata.columns

In [None]:
# To see that better, look at scatter plots of each interval sample
# against the daily turbidity average.
# Every plot uses x = daily turbidity average and y = one interval sample.

daily_avg = 'Average 24hrTurbidity(NTU)'

# interval column -> title of its figure (replaces six copy-pasted plotting stanzas)
interval_titles = {
    'Turbidity(NTU) at 12AM': '12 AM vs Daily Turbidity Scatter Plot',
    'Turbidity(NTU) at 4AM': '4 AM vs Daily Turbidity Scatter Plot',
    'Turbidity(NTU) at 8AM': '8 AM vs Daily Turbidity Scatter Plot',
    'Turbidity(NTU) at 12PM': '12 PM vs Daily Turbidity Scatter Plot',
    'Turbidity(NTU) at 4PM': '4 PM vs Daily Turbidity Scatter Plot',
    'Turbidity(NTU) at 8PM': '8 PM vs Daily Turbidity Scatter Plot',
}

for interval, title in interval_titles.items():
    # Without an `ax` argument each call draws on a fresh figure; keep the returned
    # Axes so labels and title are set on exactly that figure.
    ax = wqdata.plot(kind='scatter', marker='+', grid=True, x=daily_avg, y=interval, alpha=0.5, color='orange')
    ax.set_xlabel(daily_avg)
    ax.set_ylabel(interval)
    ax.set_title(title)

**Let's rework this section so that all data series are drawn on a single plot**

# Draw all six 4-hour samples against the daily average on ONE set of axes.
# Fixes over the draft: the frame is `wqdata` (not `data`), the call is closed,
# each series gets its own colour (the draft indexed `colors` with the whole list),
# and '-' / '~' are replaced because they are not valid matplotlib markers.
markers = ['.', 'x', '+', '_', '*', '^']
colors = ['orange', 'blue', 'green', 'purple', 'red', 'yellow']
intervals = ['Turbidity(NTU) at 12AM', 'Turbidity(NTU) at 4AM', 'Turbidity(NTU) at 8AM',
             'Turbidity(NTU) at 12PM', 'Turbidity(NTU) at 4PM', 'Turbidity(NTU) at 8PM']

fig, ax = plt.subplots(figsize=(12, 8))
for marker, color, interval in zip(markers, colors, intervals):
    # Passing the shared `ax` makes every series land on the same figure.
    wqdata.plot(kind='scatter', marker=marker, grid=True, x='Average 24hrTurbidity(NTU)', y=interval,
                alpha=0.5, color=color, label=interval, ax=ax)

ax.set_xlabel('Average 24hrTurbidity(NTU)')
ax.set_ylabel('4-hour-Turbidity(NTU)')
ax.set_title('4-hour vs Daily Turbidity Scatter Plot')
ax.legend()
plt.show()

In [None]:
# This section covers some plots made with the Matplotlib library,
# such as line, scatter and histogram plots.

# Line plot: daily average turbidity and the 12PM sample, drawn against the row index.
# The x=/y= keywords were dropped: Series.plot appears to ignore them, so they never
# put 'Date' on the x axis.
fig, ax = plt.subplots()
wqdata['Average 24hrTurbidity(NTU)'].plot(ax=ax, kind='line', color='r', label='Turbidity Daily Avg.', linewidth=1, alpha=0.7, grid=True, linestyle='-')
wqdata['Turbidity(NTU) at 12PM'].plot(ax=ax, color='b', label='Turbidity at 12PM', linewidth=1, alpha=0.7, grid=True, linestyle='-')

ax.legend(loc='upper right')      # legend = puts the series labels into the plot
ax.set_xlabel('Sample (row index)')
ax.set_ylabel('Turbidity (NTU)')
ax.set_title('Line Plot')         # title = title of the plot
plt.show()                        # was `plt.show` without parentheses, which never ran

In [None]:
# Column labels again, for reference.
wqdata.columns

In [None]:
# First rows of the frame, for reference.
wqdata.head()

In [None]:
# Scatter plot of the 12PM sample (y) against the daily average turbidity (x).
wqdata.plot(kind='scatter', x='Average 24hrTurbidity(NTU)', y='Turbidity(NTU) at 12PM', color='red', alpha=0.2, grid=True)
plt.show()

In [None]:
# Histogram of the daily average turbidity, using 50 bins.
wqdata['Average 24hrTurbidity(NTU)'].plot(kind='hist', color='blue', label='Turbidity Daily Avg.', bins=50, figsize=(15,15), alpha=0.7, grid=True)
plt.xlabel('Average 24hrTurbidity(NTU)')
plt.title('Histogram of Daily Average Turbidity')

plt.show()  # was `plt.show` without parentheses, which never called the function

In [None]:
# From here on we practise basic pandas operations on our data.
# Displaying the frame itself shows a truncated view of all rows.
wqdata

In [None]:
# Row slice: the first six rows (positions 0-5).
wqdata[:6]

In [None]:
# Row slice: positions 2 up to (not including) 6.
wqdata[2:6]

In [None]:
import pandas as pd  # already imported in the first cell; repeated here
# Re-read the CSV into wqdata.
wqdata = pd.read_csv('../input/watershed-water-quality-data.csv')

# Pull two columns out as separate data series:
# the dates and the corresponding daily average turbidity values.
dates=wqdata['Date']
turbidity=wqdata['Average 24hrTurbidity(NTU)']

print(dates)
print(turbidity)

In [None]:
# Boolean mask: True where the date compares greater than '2018-12-12T00:00:00'.
# NOTE(review): this is a string comparison; it matches time order only if the dates
# are ISO-formatted strings — confirm.
print(dates>'2018-12-12T00:00:00')

In [None]:
# Boolean mask: True where the daily average turbidity is below 0.85.
print(turbidity<0.85)

In [None]:
# Define a boolean filter and use it to select exactly the rows we want to see.
date_filter = wqdata['Date']>'2018-12-27T00:00:00' # Last four days of the dataset (per the author; not checked here)
wqdata[date_filter]

In [None]:
# Keep only the rows whose daily average turbidity is greater than 0.55.
val_filter = wqdata['Average 24hrTurbidity(NTU)']>0.55
wqdata[val_filter]
# The result holds the turbidity values greater than 0.55 (the earlier note said 0.95,
# which did not match the filter above).
# The row count is shown at the bottom of the output (the author noted 1440 rows).

In [None]:
# Combine both filters with numpy's logical_and (element-wise AND of the two masks).
wqdata[np.logical_and(wqdata['Date']>'2018-12-27T00:00:00', wqdata['Average 24hrTurbidity(NTU)']>0.55)]
# The result is the data collected after the specified date
# that also has a daily average higher than 0.55.

In [None]:
# Gives the same output: `&` is the element-wise AND of the two boolean masks.
# Each comparison needs its own parentheses because `&` binds tighter than `>`.
wqdata[(wqdata['Date']>'2018-12-27T00:00:00')&(wqdata['Average 24hrTurbidity(NTU)']>0.55)]

In [None]:
# Label-based selection: rows labelled 3 through 10 (both ends included) of the 'Date' column.
wqdata.loc[3:10, 'Date']

In [None]:
# Label-based selection of rows 3 through 10 for several columns at once.
# (The closing bracket of .loc[...] was missing, which made the cell a SyntaxError.)
wqdata.loc[3:10, ['Date','Turbidity(NTU) at 8AM', 'Turbidity(NTU) at 8PM']]

In [None]:
# Data Cleaning: start by looking at the first three rows.
wqdata.head(3)

In [None]:
# (number of rows, number of columns)
wqdata.shape

In [None]:
# Column dtypes and non-null counts, to locate columns with missing values.
wqdata.info()

In [None]:
# To learn how often each distinct value occurs in a column (attribute),
# use the .value_counts() method.
# Its output is a frequency table, which can also be visualized with a histogram.
print(wqdata['Average 24hrTurbidity(NTU)'].value_counts(dropna=False)) # dropna=False also counts NaN / null values

In [None]:
# Summary statistics of the numeric columns.
wqdata.describe()

In [None]:
# Box plot of the frame's numeric columns, one box per column.
wqdata.boxplot()

In [None]:
# Box plot restricted to the daily average turbidity column.
wqdata.boxplot(column='Average 24hrTurbidity(NTU)')

In [None]:
# Data Melting 

# Melting a dataset unpivots it: it creates a new, longer dataset in which each row holds
# the identifier value (ID column) together with the value of one of the chosen value columns.
# The identifier value is repeated in a new entry for each value column we choose.
# For instance, if we melt the wqdata dataframe around the 'Date' column and choose
# 'Turbidity(NTU) at 12AM', 'Turbidity(NTU) at 12PM', and 'Average 24hrTurbidity(NTU)'
# as value columns, the new dataframe has 3x as many rows, because every date now gets
# one row per chosen turbidity column.

In [None]:
# Column labels, for reference when choosing id and value columns to melt.
wqdata.columns

In [None]:
# Unpivot: keep 'Date' as the identifier and stack the three chosen turbidity columns
# into 'variable' (source column name) / 'value' (its reading) pairs.
melted = pd.melt(frame=wqdata, id_vars='Date', value_vars=['Turbidity(NTU) at 12AM', 'Turbidity(NTU) at 12PM', 'Average 24hrTurbidity(NTU)'])
melted

In [None]:
# Re-pivot the melted frame back to one column per variable.
# Per the original note, 'Date' holds duplicates, so DataFrame.pivot raises a
# duplicate-entries error. pivot_table copes with duplicates by aggregating them:
# here, the mean of the readings that share a date and variable.
# NOTE(review): if the individual readings are needed instead of a mean, melt with a
# unique key (e.g. id_vars=['Site', 'Date'], if that pair is unique) and pivot on it.
melted.pivot_table(index='Date', columns='variable', values='value', aggfunc='mean')

In [None]:
# Data Concatenation 

# This can be done in two ways: 
# 1) Vertical, which glues the rows of two datasets with the same columns:  pd.concat([dataset1, dataset2], axis=0, ignore_index=True)
# 2) Horizontal, which glues the columns of two datasets with the same rows:  pd.concat([dataset1, dataset2], axis=1)

dataset1 = wqdata.head()
dataset2 = wqdata.tail()

vconcat = pd.concat([dataset1, dataset2], axis=0, ignore_index=True) # ignore_index=True renumbers the rows of the result from 0
vconcat

In [None]:
# Horizontal concatenation: glue three columns of the first rows side by side.
dataset1 = wqdata.head()
dataset2 = wqdata.tail()  # not used in this cell

hconcat = pd.concat([dataset1.Date, dataset1['Turbidity(NTU) at 12AM'], dataset1['Turbidity(NTU) at 12PM']], axis=1)
hconcat

In [None]:
# Our data could contain columns with the wrong data type.
# Examining it with the .dtypes attribute shows (per the author's observation)
# that the coliform data was read in as strings.

wqdata.dtypes

In [None]:
wqdata.head()
# Data types can be converted, e.g. object <-> categorical and float <-> integer.
# Wrongly typed columns (such as integers stored as strings) can be converted with .astype(...) as well.

In [None]:
# Convert 'Site' to a categorical column, and store an int32 copy of the daily average
# in a new column 'NewAvg' (the float -> int cast drops the fractional part).
# NOTE(review): astype('int32') raises if the source column contains NaN — confirm the
# daily average has no missing values.
wqdata['Site'] = wqdata['Site'].astype('category')
wqdata['NewAvg'] = wqdata['Average 24hrTurbidity(NTU)'].astype('int32')
#wqdata.dtypes
wqdata['NewAvg']

In [None]:
# Missing Data Problem 

# Options: 1. leave as is, 2. dropna(), 3. fillna(), 4. fill with statistics (e.g. the mean)

# The non-null counts reported by .info() show which columns have missing values.
wqdata.info()

In [None]:
wqdata['Turbidity(NTU) at 12AM'].value_counts(dropna=False) 
# Count how often each distinct value occurs in the 'Turbidity(NTU) at 12AM' column.
# NaN / null values are counted too, because dropna=False.

In [None]:
# Create a new dataset without the rows whose 12AM reading is missing.
# dropna(subset=...) returns a new frame with those rows removed; calling
# dropna(inplace=True) on a single selected column does not reliably remove rows
# from the frame it was selected from.
wqdataComplete = wqdata.dropna(subset=['Turbidity(NTU) at 12AM']).copy()
wqdataComplete['Turbidity(NTU) at 12AM'].value_counts(dropna=False)

In [None]:
# Demonstration: the raw frame still has missing 12AM readings, so this assertion is
# expected to fail. Catch it and report, so a "Run All" is not aborted here.
try:
    assert wqdata['Turbidity(NTU) at 12AM'].notnull().all()
except AssertionError:
    print("AssertionError: wqdata['Turbidity(NTU) at 12AM'] contains NaN values")

In [None]:
# Passes silently (prints nothing) only when wqdataComplete has no NaN left in the
# 12AM column; otherwise it raises AssertionError.
assert wqdataComplete['Turbidity(NTU) at 12AM'].notnull().all()

In [None]:
# Replace any remaining NaN in the 12AM column with the marker string 'empty'.
# The filled column is assigned back explicitly: fillna(inplace=True) on a selected
# column is chained assignment and may leave the frame unchanged.
# astype(object) first, because the marker is a string going into a numeric column.
wqdataComplete['Turbidity(NTU) at 12AM'] = wqdataComplete['Turbidity(NTU) at 12AM'].astype(object).fillna('empty')
wqdataComplete['Turbidity(NTU) at 12AM'].value_counts()

In [None]:
# Copy the frame and replace the NaN values of the 12AM column with 'empty'.
# The filled column is assigned back explicitly: fillna(inplace=True) on a selected
# column is chained assignment and may leave the frame unchanged.
wqdataCompleteNan = wqdata.copy()
wqdataCompleteNan['Turbidity(NTU) at 12AM'] = wqdataCompleteNan['Turbidity(NTU) at 12AM'].astype(object).fillna('empty')
wqdataCompleteNan['Turbidity(NTU) at 12AM'].value_counts()

# value_counts() now also reports how many rows carry the new 'empty' marker.

In [None]:
# Create two lists, "Months" and "Seasons", and a lookup from month number to month name.
Months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
Seasons = ["Winter", "Spring", "Summer", "Autumn"]

# Month numbers are 0-based here: 0 -> January ... 11 -> December.
MonthNum = list(range(12))
zipMonths = {number: name for number, name in zip(MonthNum, Months)}
zipMonths

In [None]:
# The four seasons, starting with Winter (repeats the definition from the previous cell).
Seasons = ["Winter", "Spring", "Summer", "Autumn"]
Seasons

In [None]:
Seasons = ["Winter", "Spring", "Summer", "Autumn"]
# Repeat every season three times in order: one entry per month, twelve in total.
NewSeasons = [season for season in Seasons for _ in range(3)]
# Rotate left by one so the list lines up with January..December:
# January and February are Winter, and the third Winter entry moves to December.
NewSeasons = NewSeasons[1:] + NewSeasons[:1]
NewSeasons

In [None]:
# Pair each month name with the season at the same position: month name -> season.
MonthsSeason = dict(zip(Months, NewSeasons))
MonthsSeason

In [None]:
# Add the sampling month and sampling season of every row, derived from its date.
# The original `wqdata.Date[:3]` sliced the first three ROWS of the column (not the
# first characters of each date), which left every other row NaN, and the season was
# hard-coded to "Winter" for all rows.
sample_months = pd.to_datetime(wqdata['Date']).dt.month_name()
# MonthsSeason (built in an earlier cell) maps an English month name to its season.
wqdata['Sampling Season'] = sample_months.map(MonthsSeason)
wqdata['Sampling Month'] = sample_months
wqdata.head()


In [None]:
# Subplots: draw every plottable column in its own panel of one large figure.

wqdata.plot(subplots=True, figsize=(30,30))
plt.show()

In [None]:
# Column labels before renaming.
wqdata.columns

In [None]:
# Rename the columns to short, lowercase names without spaces or punctuation, so they
# can be used with attribute access (e.g. wqdata.turb4am) in the cells below.
# inplace=True modifies wqdata itself.

wqdata.rename(columns={'Site':'site', 'Date':'date', 'Turbidity(NTU) at 12AM':'turb12am', 'Turbidity(NTU) at 4AM':'turb4am',
                       'Turbidity(NTU) at 8AM':'turb8am', 'Turbidity(NTU) at 12PM':'turb12pm','Turbidity(NTU) at 4PM':'turb4pm', 
                       'Turbidity(NTU) at 8PM':'turb8pm', 'Average 24hrTurbidity(NTU)':'turbavg', 'Coliform, Fecal(fc/100mL)':'coliform',
                       'Sampling Season':'samplingseason', 'Sampling Month':'samplingmonth'}, inplace=True)
wqdata.info()

In [None]:
# Convert the date column to the 'datetime' data type and make it the index,
# which enables the date-string slicing and resampling used in the following cells.
# Re-running this cell fails: 'date' is no longer a column once it is the index.

wqdata.date = pd.to_datetime(wqdata.date)
wqdata = wqdata.set_index("date")
wqdata.head()

In [None]:
# Select the rows dated 2015-05-11 through 2015-05-30 by slicing the datetime index with date strings.
print(wqdata.loc['2015-05-11':'2015-05-30'])

In [None]:
# Column labels after renaming ('date' is now the index, so it is not listed).
wqdata.columns

In [None]:
# Monthly mean of every numeric column (only the first 15 months are shown).
# numeric_only=True leaves out the text/categorical columns (site, coliform, ...),
# which cannot be averaged and make .mean() raise on pandas >= 2.0.
wqdata.resample('M').mean(numeric_only=True).head(15)

In [None]:
# Annual maximum of each column in the dataframe.
# NOTE(review): the original comment said "mean" while the code calls .max() — confirm which was intended.
wqdata.resample('A').max()

**Indexing Data Frames**

#1. Indexing using square brackets // *wqdata['turb12am'][2:12]*
#2. Using column attribute and row label
#3. Using loc accessor
#4. Selecting only some columns


In [None]:
# 1. Square-bracket indexing: take the column, then the rows at positions 2-11.
# reset_index() is chained onto the slice so the dates show up as a regular column;
# previously the slice was immediately overwritten by wqdata.reset_index().
df1 = wqdata['turb12am'][2:12].reset_index()
df1

In [None]:
# 2. Column attribute plus row slice: same selection as above, via attribute access.
# reset_index() is chained onto the slice; previously the slice was immediately
# overwritten by wqdata.reset_index().
df2 = wqdata.turb12am[2:12].reset_index()
df2

In [None]:
# 3. loc accessor: rows matching the date '2018-12-31', restricted to the 'turb12am' column
# (the column is given as a list, so the column label is kept in the result).
df3 = wqdata.loc['2018-12-31', ['turb12am']]
df3

In [None]:
# 4. Selecting only some columns: a list of labels inside [] returns a DataFrame with those columns.
df4 = wqdata[['turb4am', 'turb4pm']]
df4


In [None]:
# Rows from 2018-12-27 onward, and the columns from 'turb4am' through 'turb4pm'
# (label slices with .loc include both end points).
df6=wqdata.loc['2018-12-27':,'turb4am':'turb4pm']
df6

In [None]:
# Same columns, but the row slice uses step -1: it starts at 2018-12-27 and walks
# backwards, so the rows come out in reverse order.
df7=wqdata.loc['2018-12-27'::-1,'turb4am':'turb4pm'] #reverse
df7

In [None]:
# Filtering Data Frame 

# Boolean mask: True for rows whose 4AM turbidity exceeds 1.20; indexing with it keeps those rows.
flt1 = wqdata.turb4am > 1.20
df11 = wqdata[flt1]
df11

In [None]:
# Two conditions combined with `&`: keep rows where both the 4AM and the 8AM thresholds are exceeded.
flt1 = wqdata.turb4am > 1.20
flt2 = wqdata.turb8am > 1.30
df12 = wqdata[flt1 & flt2]
df12

In [None]:
# Apply the threshold mask to the turb4am column alone; the result is a Series
# holding only the 4AM readings above 1.20.
flt1 = wqdata.turb4am > 1.20
df13 = wqdata.turb4am[flt1]
df13

In [None]:
# Transformation Function 

# We define a function that makes a unit conversion here 
# Create a new column and assigned each cell the value calculated with the defined function 

def turbidity_unittransf(x):
    """Return x squared plus 0.33 (the notebook's example 'unit conversion')."""
    squared = x ** 2
    return squared + 0.33
# Apply the function to every 4AM reading and store the results in a new column.
wqdata['mew_turb4am'] = wqdata.turb4am.apply(turbidity_unittransf)
wqdata.head()

In [None]:
# A lambda is a shorter way to do the same thing without defining a named function:
# here the same formula (z**2 + 0.33) is applied to every 8AM reading.

wqdata['mew_turb8am'] = wqdata.turb8am.apply(lambda z : z**2+0.33)
wqdata.head()

In [None]:
# Name of the current index.
print(wqdata.index.name)

In [None]:
# Rename the index by assigning to its .name attribute.
wqdata.index.name = 'index_date'
wqdata.head()

In [None]:
# Show the frame with column 'turb12am' renamed to 'turb_12am'.
# Assigning a string to `wqdata.turb12am.rename` only overwrote the `rename` method on a
# Series object and renamed nothing; DataFrame.rename(columns=...) is the way to do it.
# The renamed copy is only displayed, not stored, because later cells still refer to
# the column as 'turb12am'.
wqdata.rename(columns={'turb12am': 'turb_12am'}).head()

In [None]:
# range(start, stop, step): every third number from 3 up to, but not including, 20.
x = range(3, 20, 3)

# Print the generated numbers one per line.
for n in x:
    print(n)

In [None]:
# Add a 1-based running row number and use it as the new index.
# The range is sized from the frame itself instead of the hard-coded 1459, so the cell
# keeps working when the number of rows changes.
# Note: set_index replaces the current index, so the date index is discarded here.
wqdata['no'] = range(1, len(wqdata) + 1)
wqdata = wqdata.set_index('no')
wqdata.tail()

In [None]:
# Group By 
# (This describe() is not the last expression of the cell, so its result is not displayed.)
wqdata.describe()

# Per the author, the turbavg column ranges between 0.45 and 1.43.
# Create three intervals: low (<= 0.8), mid (0.8 - 1.2) and high (>= 1.2).

def classify(x):
    """Label a daily-average turbidity value as 'low', 'mid' or 'high'.

    x <= 0.8 gives 'low', 0.8 < x < 1.2 gives 'mid', and x >= 1.2 gives 'high'.
    A missing value (NaN/None) gives None. Previously NaN fell through every
    comparison and was labelled 'high', which put missing readings in that group.
    """
    if pd.isna(x):
        return None
    if x <= 0.8:
        return 'low'
    if x < 1.2:
        return 'mid'
    return 'high'

# Classify every daily average and store the label in a new 'turbiditylevel' column.
wqdata['turbiditylevel'] = wqdata.turbavg.apply(classify)
wqdata.head(10)

In [None]:
# With the turbidity level column in place, the data can be grouped by it to
# aggregate and produce further statistics per level.
# numeric_only=True restricts the mean to the numeric columns; the text/categorical
# columns (site, coliform, ...) make .mean() raise on pandas >= 2.0.

wqdata.groupby('turbiditylevel').mean(numeric_only=True) # Mean per turbidity level; min, max or std work the same way

In [None]:
# Restrict the grouped mean to a single column (the 12PM reading).
wqdata.groupby('turbiditylevel').turb12pm.mean()

In [None]:
# ...or to several columns, by passing a list of column labels.
wqdata.groupby('turbiditylevel')[["turb12am","turb12pm"]].mean()