In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing Libraries

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# plotting and modelling libraries used below, so API changes go unnoticed.
warnings.filterwarnings('ignore')

In [None]:
# Read the Iris CSV from the Kaggle input directory into a DataFrame

IRIS_CSV_PATH = "/kaggle/input/iris-dataset/Iris.csv"
iris_df = pd.read_csv(IRIS_CSV_PATH)

In [None]:
# Preview the first rows of the loaded data
iris_df.head()

**Features in the Dataset :** 
* ID 
* Sepal Length 
* Sepal Width
* Petal Length
* Petal Width
* Species (Target Feature)

In [None]:
# Print a per-column summary of the DataFrame (dtype and non-null count of each column)

iris_df.info()

In [None]:
# Drop the 'Id' column as it is unnecessary for the analysis.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: the in-place drop raised KeyError on a second execution
# because 'Id' was already gone.

iris_df = iris_df.drop(columns=['Id'], errors='ignore')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
iris_df.describe()

**Here, we can see that there is no large gap between the mean and the maximum of any feature, so extreme outliers are unlikely.**

In [None]:
# Count the missing (NULL) values in each column of the dataset

iris_df.isna().sum()

**No NULL values are present in the dataset.**

In [None]:
# Configure the plotting theme used by the figures below

sns.set(
    context='notebook',
    style='darkgrid',
    palette='colorblind',
    font='sans-serif',
    font_scale=1,
    rc=None,
)
# Apply the Matplotlib overrides (figure size and fonts) in a single update
matplotlib.rcParams.update({
    'figure.figsize': [8, 8],
    'font.size': 15,
    'font.family': 'sans-serif',
})
sns.set_style('darkgrid')

In [None]:
# Bar chart of the number of records for each value of the 'Species' column

sns.countplot(x='Species', data=iris_df)

We have an equal number of records for each species.

In [None]:
# Sepal length vs. sepal width for all records (species not yet distinguished)

iris_df.plot.scatter(x='SepalLengthCm', y='SepalWidthCm')

To get a clearer picture of the variation in Sepal Length and Width based on the Species, we can use a Facet Grid like below.

In [None]:
# Sepal length vs. sepal width, coloured by species.
# `height` replaces the `size` keyword, which seaborn renamed in v0.9 and later
# removed; passing `size=` raises TypeError on current seaborn releases.
(
    sns.FacetGrid(iris_df, hue='Species', height=7)
    .map(plt.scatter, 'SepalLengthCm', 'SepalWidthCm')
    .add_legend()
)

Now we can get a better idea about the division and uniformity based on Species.

In [None]:
# Box plot of petal length, one box per species
sns.boxplot(x='Species', y='PetalLengthCm', data=iris_df)

In the above Boxplot, we can see the distribution of Petal length for different Species.

**Generally, we can observe that Iris setosa has the shortest petals and Iris virginica has the longest.**

To extend the above plot, we can add the individual points on top of the box plot representation.

In [None]:
# Draw the box plot first, then overlay the individual observations (with jitter)
# using the same x/y/data arguments for both layers.
petal_plot_kwargs = dict(x='Species', y='PetalLengthCm', data=iris_df)
new_plot = sns.boxplot(**petal_plot_kwargs)
new_plot = sns.stripplot(jitter=True, **petal_plot_kwargs)

To make the distribution easier to read visually, we can use a violin plot, as shown below.

In [None]:
# Violin plot of petal length, one violin per species
sns.violinplot(x='Species', y='PetalLengthCm', data=iris_df)

The wider a section of the violin is, the more data points fall in that range.

In [None]:
# Grid of pairwise plots across the columns of iris_df, coloured by 'Species'

sns.pairplot(iris_df, hue='Species')

In [None]:
# Plotting a correlation matrix.
# 'Species' still holds strings at this point, so restrict the computation to the
# numeric columns: pandas >= 2.0 raises on non-numeric data instead of silently
# dropping it.
corr = iris_df.select_dtypes(include='number').corr()

# Generating a mask for the upper triangle (diagonal included). True hides a cell.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Creating a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Creating the heatmap
sns.heatmap(corr, mask=mask, cmap=cmap, annot=True, vmax=1, vmin=-1, center=0, square=True, cbar_kws={"shrink": .5})


In the above correlation heatmap, we can see that Sepal Length is strongly positively correlated with both Petal Length and Petal Width.
Similarly, Petal Length is strongly positively correlated with Petal Width.

In [None]:
# Number of rows per distinct value of the 'Species' column
iris_df['Species'].value_counts()

In [None]:
# Preparing data for the model: map each species label to an integer code
species_codes = {'Iris-setosa': 1, 'Iris-versicolor': 2, 'Iris-virginica': 3}
species_encoding = {'Species': species_codes}
iris_df.replace(species_encoding, inplace=True)
iris_df

In [None]:
# Importing libraries for performing Logistic Regression

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Split the frame into the target vector (y) and the feature matrix (x)

y = iris_df['Species']
x = iris_df.drop(columns='Species')
print("x's dimension: ", x.shape)
print("y's dimension: ", y.shape)

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state keeps the split repeatable

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=1
)
split_shapes = {
    "X train dimensions: ": x_train.shape,
    "X test dimensions: ": x_test.shape,
    "Y train dimensions: ": y_train.shape,
    "Y test dimensions: ": y_test.shape,
}
for label, shape in split_shapes.items():
    print(label, shape)

In [None]:
# Logistic Regression
# Fit on the training split only and score on the held-out test split. Fitting and
# scoring on the full dataset reports training accuracy, which is optimistically
# biased, and leaves the train/test split created earlier unused.

logistic_reg = LogisticRegression()
logistic_reg.fit(x_train, y_train)
y_pred = logistic_reg.predict(x_test)
print("Accuracy of the model built is: ",(metrics.accuracy_score(y_test,y_pred)*100),"%")
