In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot

# from pandas import read_csv
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Iris data set from CSV with pd.read_csv(path).
path = '../input/iriscsv/Iris.csv'
headernames = ['sr.no','SepLen','SepWid','PetLen','PetWid','TypeofFlower']
# Custom column names. header=0 tells pandas that the first row of the file is its
# own header, so that row is replaced by `headernames`. Without it the header row
# would be read as a data record, adding a bogus first row and forcing every
# column to object dtype.
data = pd.read_csv(path, names=headernames, header=0)
# Same file, keeping the column names stored in the file itself:
# header=0 means "row 0 of the file holds the column names".
data1 = pd.read_csv(path, header=0)

In [None]:
# Display the DataFrame that was loaded from the CSV file.
data1

In [None]:
# Summarise the dataset: column names, non-null counts, dtypes and memory usage.
data1.info()

In [None]:
# Shape of the data as a (rows, columns) tuple.
data1.shape

In [None]:
# Preview the first rows of the dataset.

data1.head()   # head(n) returns the first n rows; n defaults to 5

# Similarly, tail(n) returns the last n rows.

In [None]:
# Data type of each column in the dataset.
print(data1.dtypes)

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns.
pd.set_option('display.width', 100)
# Use the fully qualified option name. The bare 'precision' key is not a valid
# unique option in current pandas (it was deprecated and then removed, and it also
# collides with 'styler.format.precision'), so set_option('precision', ...) raises.
pd.set_option('display.precision', 2)
data1.describe()

In [None]:
# Number of rows in each class: group the rows by 'Species' and take each group's size.
count_class = data1.groupby('Species').size()
print(count_class)

In [None]:
# Pairwise Pearson correlation between the attributes.
# Only numeric columns can be correlated, so they are selected explicitly: since
# pandas 2.0, DataFrame.corr no longer drops non-numeric columns silently and would
# raise on the string-valued 'Species' column.
correlations = data1.select_dtypes(include='number').corr(method='pearson')
print(correlations)

The data set shows mostly strong positive correlation coefficients, which indicates a positive relationship among the input attributes. Sepal length is highly positively correlated with petal length. Petal length and petal width are the most highly correlated pair of attributes.

In [None]:
# Skewness of each numeric attribute (0 = symmetric, > 0 = right tail, < 0 = left tail).
# numeric_only=True is required because the frame also holds the string-valued
# 'Species' column, on which pandas >= 2.0 raises instead of skipping it.
print(data1.skew(numeric_only=True))

From a symmetry perspective, sepal length and sepal width are positively skewed, whereas petal length and petal width are negatively skewed.

In [None]:
# Histogram of each numeric attribute, one subplot per column.
data1.hist()
pyplot.show() # render the figure

In [None]:
# Density plot (kernel density estimate) of each numeric attribute.
# kind='density' draws the estimated distribution of the values; the previous
# kind='line' only plotted the raw values against the row index, which is not a
# density plot. Note: pandas needs SciPy installed to compute the estimate.
data1.plot(kind='density', subplots=True, layout=(3,2), sharex=False)
# kind     -- type of plot to draw
# subplots -- True draws each column in its own subplot
# layout   -- (rows, columns) arrangement of the subplots
# sharex / sharey -- whether the subplots share an axis
pyplot.show()

In [None]:
# Box-and-whisker plots: show each attribute's median, quartiles and outliers,
# which helps to see the skewness of the attribute data.
data1.plot(kind='box', subplots=True, layout=(3,2), sharex=False)
# kind     -- type of plot to draw
# subplots -- True draws each column in its own subplot
# layout   -- (rows, columns) arrangement of the subplots
# sharex / sharey -- whether the subplots share an axis
pyplot.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data1.describe()

In [None]:
# Visualize a single attribute: density plot of the sepal length.
data1['SepalLengthCm'].plot(kind='density')

# Alternative view of one attribute as a box plot:
#data1['PetalLengthCm'].plot(kind='box')

In [None]:
# Plot the correlation matrix as a colour-coded grid.

fig = pyplot.figure()  # new, empty figure
ax = fig.add_subplot(111)  # single axes filling the figure
# Fix the colour scale to [-1, 1], the full range of a correlation coefficient.
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# Derive the tick labels from the matrix itself instead of hard-coding four names:
# `correlations` has one row/column per numeric column of the data (including the
# serial-number column), so a fixed 4-entry list shifts the labels onto the wrong
# attributes and leaves the last row/column unlabeled.
short_names = {'SepalLengthCm': 'SL', 'SepalWidthCm': 'SW',
               'PetalLengthCm': 'PL', 'PetalWidthCm': 'PW'}
names = [short_names.get(column, str(column)) for column in correlations.columns]
ticks = np.arange(len(names))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
pyplot.show()

In [None]:
import seaborn as sns

# Heatmap of the correlation matrix; annot=True writes each coefficient inside its cell.
sns.heatmap(correlations, annot=True)
pyplot.show()

In [None]:
# Scatter-plot matrix: one scatter plot for every pair of numeric columns.

sns.pairplot(data1)

# **Data Preprocessing**

**1. Scaling**

In [None]:
# Rescale every feature into the [0, 1] range with sklearn.preprocessing.MinMaxScaler.

from sklearn import preprocessing
data1 = pd.read_csv(path, header=0)

# Indexing with a list of names (double square brackets) returns a DataFrame
# holding only those columns.
data2 = data1[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]

# Work on the underlying NumPy array; show the first 10 rows before scaling.
array = data2.values
print(array[:10])

# Build the scaler, learn each column's min/max, then apply the rescaling.
data_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
data_scaler.fit(array)
data_rescaled = data_scaler.transform(array)

# Print the scaled values with two decimals.
np.set_printoptions(precision=2)
print(data_rescaled[:10])

**2. Normalization**

**1. L1 Normalization**
* L1 normalization rescales each row of the dataset so that the sum of the absolute values in that row is always 1. It is also called Least Absolute Deviations.

In [None]:
# L1 normalization with sklearn.preprocessing.Normalizer(norm='l1').

from sklearn.preprocessing import Normalizer

data1 = pd.read_csv(path, header=0)

# Indexing with a list of names (double square brackets) keeps only those columns.
data2 = data1[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]

array = data2.values
original_banner = '-----------Original Data-----------'
print(original_banner)
print(array[:10])
print(original_banner)

# Create the L1 normalizer, then fit it and transform the rows in one call.
data_noramlize = Normalizer(norm='l1')
data_normalized = data_noramlize.fit_transform(array)

# Print the normalized values with two decimals.
np.set_printoptions(precision=2)
normalized_banner = '-----------Normalized Data-----------'
print(normalized_banner)
print(data_normalized[:10])
print(normalized_banner)

**2. L2 Normalization**
* L2 normalization rescales each row of the dataset so that the sum of the squared values in that row is always 1. It is also called least squares.

In [None]:
# using Normalizer from sklearn->preprocessing->Noramlizer()->L1,L2,....

from sklearn.preprocessing import Normalizer

data1 = pd.read_csv(path, header=0)
#print(data1)

#to select specific column from dataframe use [[ ]] double square brackets.
data2 = data1[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]
# print(data2)

array = data2.values
print('-----------Original Data-----------')
print(array[:10])
print('-----------Original Data-----------')
#creating and Fitting the normalizer function with L2 Normalization
data_noramlize = Normalizer(norm='l2').fit(array)

#applying Normalizer to the data using transform
data_normalized = data_noramlize.transform(array)

#setting precision of the scaled values
np.set_printoptions(precision=2)
print('-----------Normalized Data-----------')
print(data_normalized[:10])
print('-----------Normalized Data-----------')

**3. Binarization**
* As the name suggests, this is the technique with the help of which we can make our data binary. We can use a binary threshold for making our data binary. The values above that threshold value will be converted to 1 and below that threshold will be converted to 0. For example, if we choose threshold value = 0.5, then the dataset value above it will become 1 and below this will become 0. That is why we can call it binarizing the data or thresholding the data. This technique is useful when we have probabilities in our dataset and want to convert them into crisp values.

* We can binarize the data with the help of Binarizer class of scikit-learn Python library.

In [None]:
# Binarization with sklearn.preprocessing.Binarizer.

from sklearn.preprocessing import Binarizer

data1 = pd.read_csv(path, header=0)

# Indexing with a list of names (double square brackets) keeps only those columns.
data2 = data1[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]

array = data2.values
original_banner = '-----------Original Data-----------'
print(original_banner)
print(array[:10])
print(original_banner)

# Values above the threshold (3) become 1, all others become 0.
data_binarize = Binarizer(threshold=3)
data_binarized = data_binarize.fit_transform(array)

# Print the binarized values with two decimals.
np.set_printoptions(precision=2)
binarized_banner = '-----------Binarized Data-----------'
print(binarized_banner)
print(data_binarized[:10])
print(binarized_banner)

**4. Standardization**
* Another useful data preprocessing technique, used to transform data attributes that have a Gaussian distribution. It shifts and rescales each attribute so that it follows a standard Gaussian distribution with a mean of 0 and an SD (Standard Deviation) of 1. This technique is useful for ML algorithms such as linear regression and logistic regression, which assume a Gaussian distribution in the input dataset and produce better results with rescaled data. We can standardize the data (mean = 0 and SD = 1) with the help of the StandardScaler class of the scikit-learn Python library.

In [None]:
# Standardization (mean 0, SD 1 per column) with sklearn.preprocessing.StandardScaler.

from sklearn.preprocessing import StandardScaler

data1 = pd.read_csv(path, header=0)

# Indexing with a list of names (double square brackets) keeps only those columns.
data2 = data1[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]

array = data2.values
original_banner = '-----------Original Data-----------'
print(original_banner)
print(array[:10])
print(original_banner)

# Create the scaler and learn each column's mean and standard deviation ...
data_StandardScaler = StandardScaler()
data_StandardScaler.fit(array)

# ... then apply the standardization to the same data.
data_rescaled = data_StandardScaler.transform(array)

# Print the standardized values with two decimals.
np.set_printoptions(precision=2)
rescaled_banner = '-----------Rescaled Data-----------'
print(rescaled_banner)
print(data_rescaled[:10])
print(rescaled_banner)

# Wrap the array in a DataFrame (columns are numbered 0..3) so it can be plotted.
df = pd.DataFrame(data=data_rescaled)

# Line plot of column 1 (the second selected feature) after standard scaling.
df[[1]].plot(kind='line')

**5. Data Labeling**
* We discussed the importance of good data for ML algorithms, as well as some techniques to pre-process the data before sending it to ML algorithms. One more aspect in this regard is data labeling: it is also very important that the data sent to ML algorithms is properly labeled.

* **Label Encoding**
Most sklearn functions expect the data to have numeric labels rather than word labels. Hence, we need to convert word labels into numeric labels. This process is called label encoding.

In [None]:
# Label encoding with sklearn.preprocessing.LabelEncoder.
from sklearn import preprocessing

# The distinct word-form values of the attribute; the encoder learns its classes from these.
input_labels = ['male','female']

# Create the encoder and fit it on the known labels (fit returns the encoder itself).
encoder = preprocessing.LabelEncoder().fit(input_labels)

# Encode a sequence of attribute values into their integer codes.
test_labels = ['female','female','male','female','male']
encoded_values = encoder.transform(test_labels)
print("\nLabels =", test_labels)
print("Encoded values =", list(encoded_values))

# Decode the integer codes back into the original word labels.
decoded_list = encoder.inverse_transform(encoded_values)
print("\nDecoded labels =", list(decoded_list))