# IRIS dataset -- A simple data analysis

## Data:
 The dataset for this analysis is taken from Kaggle website. You can also download from there.
 https://www.kaggle.com/uciml/iris/kernels

In [124]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
warnings.filterwarnings("ignore")

## Read the dataset

In [125]:
df_iris = pd.read_csv(r'C:\Users\Srividhya\Desktop\IRIS\Iris.csv')

In [126]:
df_iris.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [127]:
df_iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [128]:
df_iris.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [129]:
df_iris.shape

(150, 6)

## Check whether there are any missing values

In [130]:
df_iris.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

From the output, it is clearly seen that there are no missing values.

## Check whether there are categorical variables

In [131]:
df_iris.select_dtypes('object').head()

Unnamed: 0,Species
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa


The output clearly shows that only the Species column (predictor variable) is categorical.

### Choose the predictor variable and assign to y

In [132]:
y = df_iris.Species

In [133]:
y.head()

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Species, dtype: object

#### Converting the categorical variable in y to non-categorical values (numeric values)

In [134]:
y = pd.get_dummies(y)

In [135]:
y.head()

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


Now, the model can be trained and tested without any obstacle since there are no missing values and no categorical variables.

### Choosing the other columns except predictor variable as features and dataframe is created with name X

In [136]:
X = df_iris.drop(columns = ['Species'],axis = 0)

In [137]:
X.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,1,5.1,3.5,1.4,0.2
1,2,4.9,3.0,1.4,0.2
2,3,4.7,3.2,1.3,0.2
3,4,4.6,3.1,1.5,0.2
4,5,5.0,3.6,1.4,0.2


### Splitting the csv data into train and test dataset

In [138]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

In [139]:
X_train.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
61,62,5.9,3.0,4.2,1.5
92,93,5.8,2.6,4.0,1.2
112,113,6.8,3.0,5.5,2.1
2,3,4.7,3.2,1.3,0.2
141,142,6.9,3.1,5.1,2.3


In [140]:
X_val.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
114,115,5.8,2.8,5.1,2.4
62,63,6.0,2.2,4.0,1.0
33,34,5.5,4.2,1.4,0.2
107,108,7.3,2.9,6.3,1.8
7,8,5.0,3.4,1.5,0.2


## Developing a model with Random Forest model

In [141]:
model_RF = RandomForestRegressor(random_state=1)

### Fitting the model with training data

In [142]:
model_RF.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

### Making the model to predict the species of test data i.e val data

In [143]:
predictions = model_RF.predict(X_val)

In [144]:
predictions

array([[0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [0. , 0.9, 0.1],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ]])

### The predictions are tested for accuracy with mean absolute error

In [145]:
score = mean_absolute_error(y_val, predictions)

In [146]:
print(" MAE score : ", score)

 MAE score :  0.015789473684210527


#### Since the MAE score is much lesser, the errors are very low and the accurate predictions of the species of the IRIS flowers are made.