In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Feature Selection
#Univariate Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#Feature Importance
from sklearn.ensemble import ExtraTreesClassifier

# Import all the packages needed for the various classification algorithms
from sklearn.linear_model import LogisticRegression  # for Logistic Regression algorithm
from sklearn.model_selection import train_test_split #to split the dataset for training and testing
from sklearn.neighbors import KNeighborsClassifier  # for K nearest neighbours
from sklearn import svm  #for Support Vector Machine (SVM) Algorithm
from sklearn import metrics #for checking the model accuracy
from sklearn.tree import DecisionTreeClassifier #for using Decision Tree Algoithm
from sklearn.metrics import confusion_matrix #Summarises Count values of Predictions
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

In [None]:
# Load the Iris dataset from the Kaggle input directory and preview the first rows.
iris_csv_path = "../input/iris/Iris.csv"
df = pd.read_csv(iris_csv_path)
df.head()

In [None]:
# Check for missing values: info() lists the non-null count for every column.
# The reported dtypes also show which columns are numerical and which are
# categorical (object).
df.info()

In [None]:
# Summary statistics for the numeric columns: the mean of each feature and how
# the values are distributed across percentiles.
# For a large dataset, box plots are a good way to see where the data sits
# within the percentiles and to check for outliers.
# Iris is a small, clean dataset, so there is little more to do here in terms of
# exploratory data analysis.
df.describe()

In [None]:
# The 'Id' column is irrelevant to the analysis, so drop it.
# errors='ignore' keeps this cell idempotent: re-running it after 'Id' has
# already been removed no longer raises a KeyError.
df = df.drop(columns='Id', errors='ignore')
df

In [None]:
# Pairwise correlation between the features.
# Select the numeric columns explicitly: the 'Species' label column is not
# numeric, and pandas >= 2.0 raises on non-numeric columns in DataFrame.corr()
# instead of silently dropping them as older versions did.
correlation = df.select_dtypes(include='number').corr()
print(correlation)

In [None]:
# A heat map makes the structure of the correlation matrix easy to see at a
# glance, especially when related features cluster together.
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(correlation, annot=True, cmap="YlGnBu", ax=ax)
plt.show()

In [None]:
# A pairplot draws the pairwise relationships between all features in a dataset,
# here coloured by species.
# `height` replaces the deprecated `size` keyword (renamed in seaborn 0.9).
sns.pairplot(df, height=2.5, hue="Species")
plt.show()

# Feature Selection

## Univariate Selection

Statistical tests can be used to select those features that have the strongest relationship with the output variable.
The scikit-learn library provides the SelectKBest class that can be used with a suite of different statistical tests to select a specific number of features.

In [None]:
# Separate the first four columns (the features) from the last column (the target).
X = df.iloc[:, :4]   # independent columns
Y = df.iloc[:, -1]   # target column, i.e. Species
print(
    "Feature Variable X:", "\n", X,
    "\n" * 2,
    "Target Variable Y:", "\n", Y,
)

In [None]:
# Score every feature against the target with the chi-squared test.
bestfeatures = SelectKBest(score_func=chi2, k=3)
fit = bestfeatures.fit(X, Y)

# Pair each feature name with its chi-squared score in a single table.
featureScores = pd.DataFrame({
    'Specs': X.columns,
    'Score': fit.scores_,
})
print(featureScores)

In [None]:
# Plot the chi-squared score of each feature.
# Use the 'Specs' column for the bar labels; otherwise the bars are labelled with
# the row index (0, 1, 2, 3) and the reader cannot tell which feature is which.
ax = featureScores.plot(kind='barh', x='Specs', y='Score', legend=False)
ax.set_xlabel('Chi-squared score')
ax.set_ylabel('Feature')
plt.show()

## Feature Importance

You can get the importance of each feature in your dataset from the feature importance property of the model.
Feature importance gives you a score for each feature of your data: the higher the score, the more important or relevant the feature is to your output variable.
Feature importance is a built-in attribute (`feature_importances_`) of tree-based classifiers.

In [None]:
# Fit an extremely-randomized-trees ensemble to estimate feature importances.
# The ensemble is built with random splits, so fix the seed; otherwise the
# importances change on every run of the notebook.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)

In [None]:
# Show the raw importance scores, then label them with the feature names.
importance_values = model.feature_importances_
print(importance_values)
feat_importances = pd.Series(importance_values, index=X.columns)

In [None]:
# Horizontal bar chart of the (up to) five most important features.
top_importances = feat_importances.nlargest(5)
top_importances.plot.barh()
plt.show()

# Classification

Classification can be performed on structured or unstructured data. Classification is a technique where we categorize data into a given number of classes. The main goal of a classification problem is to identify the category/class that new data will fall under. In this case, we are classifying the data into the classes given by Species.





A confusion matrix summarizes the results of testing the algorithm for further inspection. It shows the counts of predictions for each combination of true and predicted class, for each model.

## Split the data into training and test sets

In [None]:
# Hold out 30% of the rows for testing.
# random_state makes the split (and every accuracy reported below) reproducible;
# stratify keeps the species proportions the same in both parts, which matters
# for a dataset this small.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['Species'],
)
print(train.shape)
print(test.shape)

In [None]:
# Split both partitions into the feature columns and the species label.
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
target_column = 'Species'

train_X, train_Y = train[feature_columns], train[target_column]
test_X, test_Y = test[feature_columns], test[target_column]

**Support Vector Machine (SVM)**

Support vector machine is a representation of the training data as points in space separated into categories by a clear gap that is as wide as possible. New examples are then mapped into that same space and predicted to belong to a category based on which side of the gap they fall.

In [None]:
# Train a support vector classifier and score it on the held-out test set.
SVM = svm.SVC()
SVM.fit(train_X, train_Y)
prediction = SVM.predict(test_X)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
print('The accuracy of the SVM is:', metrics.accuracy_score(test_Y, prediction))

**Confusion matrix**

In [None]:
# Confusion matrix of the SVM on the test set (rows: true class, columns: predicted class).
Y_pred = SVM.predict(test_X)
Y_true = test_Y
# Pin the class order so the matrix always has one row/column per species and
# the tick labels are guaranteed to match it.
class_labels = SVM.classes_
cm = confusion_matrix(Y_true, Y_pred, labels=class_labels)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="red", fmt="d", ax=ax,
            xticklabels=class_labels, yticklabels=class_labels)
ax.set_xlabel("Y_pred")
ax.set_ylabel("Y_true")
ax.set_title("SVM")
plt.show()

**Logistic Regression**

In the Logistic Regression algorithm, the probabilities describing the possible outcomes of a single trial are modelled using a logistic function.

In [None]:
# Train a logistic regression classifier and score it on the held-out test set.
# The default iteration cap can stop the solver before it has converged on
# unscaled features, and the resulting ConvergenceWarning would be hidden by the
# global warnings filter set in the imports cell, so raise the cap explicitly.
LR = LogisticRegression(max_iter=1000)
LR.fit(train_X, train_Y)
prediction = LR.predict(test_X)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
print('The accuracy of the Logistic Regression is', metrics.accuracy_score(test_Y, prediction))

**Confusion matrix**

In [None]:
# Confusion matrix of the logistic regression on the test set
# (rows: true class, columns: predicted class).
Y_pred = LR.predict(test_X)
Y_true = test_Y
# Pin the class order so the matrix always has one row/column per species and
# the tick labels are guaranteed to match it.
class_labels = LR.classes_
cm = confusion_matrix(Y_true, Y_pred, labels=class_labels)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="red", fmt="d", ax=ax,
            xticklabels=class_labels, yticklabels=class_labels)
ax.set_xlabel("Y_pred")
ax.set_ylabel("Y_true")
ax.set_title("Logistic Regression")
plt.show()

**Decision Tree Classifier**

Given data consisting of attributes together with their classes, a decision tree produces a sequence of rules that can be used to classify the data.

In [None]:
# Train a decision tree and score it on the held-out test set.
# Tree construction breaks ties between equally good splits at random, so fix
# the seed to get the same tree (and accuracy) on every run.
DTC = DecisionTreeClassifier(random_state=42)
DTC.fit(train_X, train_Y)
prediction = DTC.predict(test_X)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
print('The accuracy of the Decision Tree is', metrics.accuracy_score(test_Y, prediction))

**Confusion matrix**

In [None]:
# Confusion matrix of the decision tree on the test set
# (rows: true class, columns: predicted class).
Y_pred = DTC.predict(test_X)
Y_true = test_Y
# Pin the class order so the matrix always has one row/column per species and
# the tick labels are guaranteed to match it.
class_labels = DTC.classes_
cm = confusion_matrix(Y_true, Y_pred, labels=class_labels)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="red", fmt="d", ax=ax,
            xticklabels=class_labels, yticklabels=class_labels)
ax.set_xlabel("Y_pred")
ax.set_ylabel("Y_true")
ax.set_title("Decision Tree")
plt.show()

**K-Nearest Neighbours**

Neighbours based classification is a type of lazy learning as it does not attempt to construct a general internal model, but simply stores instances of the training data. Classification is computed from a simple majority vote of the k nearest neighbours of each point.

In [None]:
# Train a k-nearest-neighbours classifier and score it on the held-out test set.
KNN = KNeighborsClassifier(n_neighbors=3)  # each new point is classified by a vote of its 3 nearest neighbours
KNN.fit(train_X, train_Y)
prediction = KNN.predict(test_X)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
print('The accuracy of the KNN is', metrics.accuracy_score(test_Y, prediction))

**Confusion matrix**

In [None]:
# Confusion matrix of the KNN classifier on the test set
# (rows: true class, columns: predicted class).
Y_pred = KNN.predict(test_X)
Y_true = test_Y
# Pin the class order so the matrix always has one row/column per species and
# the tick labels are guaranteed to match it.
class_labels = KNN.classes_
cm = confusion_matrix(Y_true, Y_pred, labels=class_labels)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="red", fmt="d", ax=ax,
            xticklabels=class_labels, yticklabels=class_labels)
ax.set_xlabel("Y_pred")
ax.set_ylabel("Y_true")
ax.set_title("K-Nearest Neighbours")
plt.show()

# Wrapping Up!

For a beginner in Data Science, the Iris dataset is a good place to start understanding classification problems.

The Feature Selection methods shown here, i.e. Univariate Selection and Feature Importance, are meant to give you an idea of how to select the features that best fit your model.

I hope the simple data visualization techniques and machine learning algorithms used above helped you understand the lifecycle of Data Science and what you can create with pandas, seaborn, matplotlib, and scikit-learn in Python!

I encourage you to run through these examples yourself.

If you liked it, please upvote. I will post more kernels for Data Science beginners.
