In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings  
warnings.filterwarnings("ignore")   # ignore warnings

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found in the "../input" directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

**CLASSIFICATION ALGORITHMS**

**This kernel explains 6 different machine-learning classification algorithms and one evaluation method, using the scikit-learn library.  
At the end of the kernel, the ROC curve, which is the evaluation method, is explained.**

Classification algorithms are generally used to predict categorical (class) labels.

**EXPLORATORY DATA ANALYSIS (EDA)**

In this part, we try to understand the features of data.

In [None]:
# Load the data from csv file
# The whole CSV is read into a single DataFrame named `data`, which every later cell uses.
data = pd.read_csv('../input/StudentsPerformance.csv')

In [None]:
# Preview the first rows of the table.
data.head()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics of the table's columns.
data.describe()

In [None]:
# Column labels, in file order (later cells select columns by position).
data.columns

In [None]:
# Correlation
# Pairwise correlation of the numeric columns only. The non-numeric columns
# (e.g. the label column holding values such as 'male') are excluded
# explicitly: pandas >= 2.0 raises on them instead of silently dropping them,
# and select_dtypes works the same way on older pandas versions too.
data.select_dtypes(include='number').corr()

**Classification Algorithms**

**1. Logistic Regression**

* Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. 
* In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). 

In [None]:
# Independent variables
# Feature matrix: the last three columns of `data`, selected by position.
# NOTE(review): presumably these are the three score columns - confirm against data.columns.
x = data.iloc[:, -3:]
x

In [None]:
# Dependent variable
# Target: the first column of `data`, selected by position and taken as a
# 1-D Series. A 1-D target (rather than a one-column DataFrame) is what
# scikit-learn estimators expect; a column-vector target is only accepted
# through an implicit conversion that emits a DataConversionWarning.
y = data.iloc[:, 0]
y

In [None]:
# Split features and target into train and test partitions.
# 33% of the rows are held out for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, random_state=0, test_size=0.33)

In [None]:
# Training features (unscaled).
x_train

In [None]:
# Test features (unscaled).
x_test

In [None]:
# Training labels.
y_train

In [None]:
# Test labels.
y_test

In [None]:
# Standardize the features.
from sklearn.preprocessing import StandardScaler
# The scaler learns its statistics from the training features only; the same
# fitted transformation is then applied to both partitions.
sc = StandardScaler().fit(x_train)
X_train = sc.transform(x_train)
X_test = sc.transform(x_test)

In [None]:
# Build and train the logistic regression classifier on the scaled training data.
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=0).fit(X_train, y_train)
log_reg  # show the fitted estimator as the cell output

In [None]:
# Prediction
# Predicted class label for every row of the scaled test features.
y_pred = log_reg.predict(X_test)
y_pred

**Confusion Matrix**

A confusion matrix is a table that is often used to describe the performance of a classification model on a set of test data for which the true values are known. 

tp = true positive, fp = false positive, fn = false negative, tn = true negative

* tp = Prediction is positive and actual is positive.

* fp = Prediction is positive and actual is negative. (Also known as a "Type I error.")

* fn = Prediction is negative and actual is positive. (Also known as a "Type II error.")

* tn = Prediction is negative and actual is negative.


The values on the diagonal of the matrix are the counts of correctly classified samples.

* accuracy : the percentage of correct classification for model
* sensitivity = True positive rate = tp / (tp + fn)
* specificity = tn / (tn + fp)
* precision = tp / (tp+fp)
* recall = tp / (tp+fn)
* fall-out = False positive rate = fp / (tn + fp)
* error rate = 1 - accuracy

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the logistic regression predictions on the test set.
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('*********************')
print('*********************')
# Accuracy = correctly classified samples (the diagonal of the confusion
# matrix) divided by all test samples. It is derived from `cm` instead of
# being typed in by hand, so it stays correct if the data, split or model changes.
print(cm.trace() / cm.sum()) # Accuracy

**2. K-NN (K-Nearest Neighbors)**

* K-NN is one of the strongest classification algorithms. 
* It is also basic, fast and widely used. 
* In KNN, K is the number of nearest neighbors. 

In [None]:
# Build and train the K-NN classifier on the scaled training data.
# n_neighbors=5 is also the algorithm's default value of K.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(metric='minkowski', n_neighbors=5).fit(X_train, y_train)
knn  # show the fitted estimator as the cell output

In [None]:
# Prediction
y_pred2 = knn.predict(X_test)

# Confusion matrix
cm2 = confusion_matrix(y_test, y_pred2)
print(cm2)
print('*********************')
print('*********************')
# Accuracy = diagonal (correct predictions) / all test samples, computed from
# `cm2` rather than hard-coded so it always matches the matrix printed above.
print(cm2.trace() / cm2.sum()) # Accuracy

**3. Support Vector Machine (SVM)**

In this algorithm, the aim is to find a linear, polynomial, Gaussian or exponential function that separates the data points with the maximum margin.

Scaling, in other words standardization is very important for this method.

In [None]:
# Build and train a support vector classifier with a linear kernel
# on the scaled training data.
from sklearn.svm import SVC
svc = SVC(kernel='linear').fit(X_train, y_train)
svc  # show the fitted estimator as the cell output

In [None]:
# Prediction
# Predicted class label for every row of the scaled test features (linear-kernel SVC).
y_pred3 = svc.predict(X_test)
y_pred3

In [None]:
# Confusion Matrix
cm3 = confusion_matrix(y_test, y_pred3)
print(cm3)
print('*********************')
print('*********************')
# Accuracy = diagonal (correct predictions) / all test samples, computed from
# `cm3` rather than hard-coded so it always matches the matrix printed above.
print(cm3.trace() / cm3.sum()) # Accuracy

**Kernel Trick**

Kernel functions such as poly, rbf are preferred especially to classify non-linear datasets.

In [None]:
# Here, Kernel has been chosen as "rbf".
# Train an SVC with the rbf kernel on the scaled training data, predict the
# test set and report its confusion matrix and accuracy.
svc2 = SVC(kernel='rbf')
svc2.fit(X_train, y_train)

y_pred_3 = svc2.predict(X_test)

cm_3 = confusion_matrix(y_test, y_pred_3)
print(cm_3)
print('*********************')
print('*********************')
# Accuracy = diagonal (correct predictions) / all test samples, computed from
# `cm_3` rather than hard-coded so it always matches the matrix printed above.
print(cm_3.trace() / cm_3.sum()) # Accuracy


**4. Naive Bayes**

* This method can work with unbalanced data sets.  
* It works on Bayes theorem of probability to predict the class of unknown data set. 
* Naive Bayes model is easy to build and particularly useful for very large data sets.
* There are three types of Naive Bayes model under scikit learn library: Gaussian, Multinomial, Bernoulli.

In [None]:
# Build and train the Gaussian Naive Bayes classifier on the scaled training data.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB().fit(X_train, y_train)
gnb  # show the fitted estimator as the cell output

In [None]:
# Prediction
# Predicted class label for every row of the scaled test features (Gaussian NB).
y_pred4 = gnb.predict(X_test)
y_pred4

In [None]:
# Confusion matrix
cm4 = confusion_matrix(y_test, y_pred4)
print(cm4)
print('*********************')
print('*********************')
# Accuracy = diagonal (correct predictions) / all test samples, computed from
# `cm4` rather than hard-coded so it always matches the matrix printed above.
print(cm4.trace() / cm4.sum()) # Accuracy

In [None]:
# Build and train the Multinomial Naive Bayes classifier.
# Multinomial naive bayes requires non-negative input, so the model is trained
# on the unscaled x_train rather than on the standardized X_train.
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB().fit(x_train, y_train)
mnb  # show the fitted estimator as the cell output

In [None]:
# Prediction
# Uses the unscaled x_test, matching the unscaled x_train this model was fitted on.
y_pred_4 = mnb.predict(x_test)
y_pred_4

In [None]:
# Confusion matrix
cm_4 = confusion_matrix(y_test, y_pred_4)
print(cm_4)
print('*********************')
print('*********************')
# Accuracy = diagonal (correct predictions) / all test samples, computed from
# `cm_4` rather than hard-coded so it always matches the matrix printed above.
print(cm_4.trace() / cm_4.sum()) # Accuracy

**5. Decision Tree**

* Decision tree builds classification model in the form of a tree structure. 
* It breaks down a dataset into smaller and smaller subsets while at the same time an associated decision tree is incrementally developed. 
* The final result is a tree with decision nodes and leaf nodes. A decision node has two or more branches. Leaf node represents a classification. 
* Decision trees can handle both categorical and numerical data. 
* The core algorithm for building decision trees is called ID3.
* ID3 uses Entropy and Information Gain to construct a decision tree. 
* The independent variable that provides the maximum information gain is chosen as the root node, and the other nodes are chosen using the same method.

In [None]:
# Creation of model
# Splits are chosen by information gain (criterion='entropy').
# random_state is fixed (same seed as the train/test split and the logistic
# regression) because tree construction involves randomness; without it the
# fitted tree, and therefore the reported results, can change from run to run.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion='entropy', random_state=0)
dtc.fit(X_train, y_train)

In [None]:
# Prediction
# Predicted class label for every row of the scaled test features (decision tree).
y_pred5 = dtc.predict(X_test)
y_pred5

In [None]:
# Confusion matrix
cm5 = confusion_matrix(y_test, y_pred5)
print(cm5)
print('*********************')
print('*********************')
# Accuracy = diagonal (correct predictions) / all test samples, computed from
# `cm5` rather than hard-coded so it always matches the matrix printed above.
print(cm5.trace() / cm5.sum()) # Accuracy

**6. Random Forest**

* Random Forests grows many classification trees.
* To classify a new object from an input vector, put the input vector down each of the trees in the forest. 
* Each tree gives a classification, and we say the tree "votes" for that class. 
* The forest chooses the classification having the most votes (over all the trees in the forest). This is known as majority vote.

In [None]:
# Creation of model
# Criterion' s default value is "gini"; "entropy" is selected here instead.
# random_state is fixed (same seed as the train/test split and the logistic
# regression) because a random forest is built from randomized trees; without
# it the predictions, the confusion matrix and the ROC curve change on every run.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rfc.fit(X_train, y_train)

In [None]:
# Prediction
# Predicted class label for every row of the scaled test features (random forest).
y_pred6 = rfc.predict(X_test)
y_pred6

In [None]:
# Confusion matrix
cm6 = confusion_matrix(y_test, y_pred6)
print(cm6)
print('*********************')
print('*********************')
# Accuracy = diagonal (correct predictions) / all test samples, computed from
# `cm6` rather than hard-coded so it always matches the matrix printed above.
print(cm6.trace() / cm6.sum()) # Accuracy

**ROC CURVE (Receiver Operating Characteristic)**

* The ROC curve is a fundamental tool for diagnostic test evaluation.
* In a ROC curve, the true positive rate (sensitivity) is plotted as a function of the false positive rate (1 - specificity) for different cut-off points of a parameter.
* Each point on the ROC curve represents a sensitivity/specificity pair corresponding to a particular decision threshold. 
* The area under the ROC curve (AUC) is a measure of how well a parameter can distinguish between two diagnostic groups (diseased/normal).
* The closer the ROC curve is to the upper left corner, the higher the overall accuracy of the test.

* fpr = False Positive Rate
* tpr = True Positive Rate

In [None]:
from sklearn import metrics

In [None]:
# ROC Curve with Random Forest Classification
# Predicted class probabilities for the test rows; the next cell uses one
# column of this 2-D array as the score for the ROC curve.
# NOTE(review): per scikit-learn, columns follow the order of rfc.classes_ - confirm.
y_proba_6 = rfc.predict_proba(X_test)
y_proba_6

In [None]:
# Compute the ROC curve points for the random forest, treating 'male' as the
# positive class.
pos_label = 'male'
# Look up which predict_proba column belongs to the positive class instead of
# assuming it is column 1: the columns follow rfc.classes_, so a hard-coded
# index would silently score the wrong class if the label set or order changed.
pos_col = list(rfc.classes_).index(pos_label)
fpr, tpr, thold = metrics.roc_curve(y_test, y_proba_6[:, pos_col], pos_label=pos_label)
print(y_test)
print(y_proba_6[:, pos_col])
print('fpr')
print(fpr)
print('tpr')
print(tpr)

In [None]:
# Draw the ROC curve on the current axes.
ax = plt.gca()
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1) as a reference line
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC')
plt.show()

**CONCLUSION**


My other kernels are here: 

https://www.kaggle.com/armagansarikey/machine-learning-1-data-preprocessing

https://www.kaggle.com/armagansarikey/machine-learning-2-prediction-algorithms

If you have any questions or suggestions, I will be happy to hear them.