## Comparing Classification Methods (LR-KNN-SVM-NB-DT-RF)

* [Logistic Regression Classification](#1.)
* [K-Nearest Neighbour (KNN) Classification](#2.)
* [SVM Classification](#3.)
* [Naive Bayes Classification](#4.)
* [Decision Tree Classification](#5.)
* [Random Forest Classification](#6.)

![](https://iili.io/JGe6Ss.png)


## Comparing Clustering Methods K-Means-Hierarchical

* [K-Means Clustering](#7.)
* [Hierarchical Clustering](#8.)

![](https://iili.io/JGebi7.png)


## Comparing Regression Methods (MLR-PR-SVR-DT-RF)

* [Linear Regression](#9.)
* [Polynomial Regression](#10.)
* [Support Vector Regression (SVR), Scaling](#11.)
* [Decision Tree](#12.)
* [Random Forest](#13.)


![](https://iili.io/JGkfWB.png)


In [None]:
#1. libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_iris

In [None]:
#2. data preprocessing
#2.1. data loading
# Read the Iris CSV from the relative input directory into a DataFrame.
df = pd.read_csv('../input/iris/Iris.csv')

# Show 15 randomly sampled rows as a quick sanity check of the loaded data.
df.sample(15)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Features: columns 1-4 of the frame (column 0 is skipped); target: the Species column.
x = df.iloc[:, 1:5].values
y = df.Species.values

#2.2 train/test split and data standardization

# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no information from the test data leaks into the scaling.
X_train = sc.fit_transform(x_train)
X_test = sc.transform(x_test)
print("X_train: \n", X_train, "\n")
print("X_test: \n", X_test, "\n")

# Comparing Classification Methods (LR-KNN-SVM-NB-DT-RF)


<a id="1."></a> 
# Logistic Regression Classification

In [None]:
# 1. Logistic Regression
import seaborn as sns
from sklearn.linear_model import LogisticRegression

# Train on the standardized training split, then predict the held-out split.
logr = LogisticRegression(random_state=0)
logr.fit(X_train, y_train)

y_pred = logr.predict(X_test)

# Rows of the confusion matrix are the true classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)

# %% cm visualization
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="white", fmt=".0f", ax=ax, square=True, cmap="GnBu_r")
plt.xlabel("y_pred")
plt.ylabel("y_true")
# Title typo fixed: "classifer" -> "classifier".
plt.title("Logistic Regression (as classifier)\n(confusion matrix)")
plt.show()


<a id="2."></a> 
# K-Nearest Neighbour (KNN) Classification

In [None]:
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier

# 2. K-Nearest Neighbour: single-neighbour classifier using the Minkowski metric,
# trained on the standardized training split.
knn = KNeighborsClassifier(metric='minkowski', n_neighbors=1)
knn.fit(X_train, y_train)

# Predict the held-out split and tabulate true vs. predicted classes.
y_pred = knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Render the confusion matrix as an annotated heatmap.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt=".0f",
    linewidths=0.5,
    linecolor="white",
    square=True,
    cmap="GnBu_r",
)
ax.set_xlabel("y_pred")
ax.set_ylabel("y_true")
ax.set_title("k-nearest-neighbor Classification\n(confusion matrix)")
plt.show()


<a id="3."></a> 
# SVM Classification

In [None]:
# 3. Support Vector Classifier (SVC) (SVM classifier)
import seaborn as sns
from sklearn.svm import SVC

# Polynomial-kernel SVM fitted on the standardized training split
# (fit() returns the estimator itself, so the two steps can be chained).
svc = SVC(kernel='poly').fit(X_train, y_train)

# Confusion matrix of the predictions on the held-out split.
y_pred = svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the confusion matrix.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    cm,
    annot=True, fmt=".0f",
    linewidths=0.5, linecolor="white",
    square=True, cmap="GnBu_r",
    ax=ax,
)
ax.set(
    xlabel="y_pred",
    ylabel="y_true",
    title="Support Vector Classification\n(confusion matrix)",
)
plt.show()


<a id="4."></a> 
# Naive Bayes Classification

In [None]:
# 4. Naive Bayes Classification
import seaborn as sns
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes trained on the standardized training split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Score the held-out split: rows are true classes, columns predicted classes.
y_pred = gnb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Plot the matrix as an annotated heatmap.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, ax=ax, annot=True, fmt=".0f", square=True,
            cmap="GnBu_r", linecolor="white", linewidths=0.5)
ax.set_title("Gaussian Naive Bayes\n(confusion matrix)")
ax.set_ylabel("y_true")
ax.set_xlabel("y_pred")
plt.show()


<a id="5."></a> 
# Decision Tree Classification

In [None]:
# 5. Decision Tree Classification
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier

# random_state pins the tie-breaking between equally good splits so the tree,
# and therefore the confusion matrix, is reproducible from run to run.
dtc = DecisionTreeClassifier(criterion='entropy', random_state=0)

dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

# Rows of the confusion matrix are the true classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)

f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="white", fmt=".0f", ax=ax, square=True, cmap="GnBu_r")
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.title("Decision Tree Classifier\n(confusion matrix)")
plt.show()


<a id="6."></a> 
# Random Forest Classification

In [None]:
# 6. Random Forest Classification
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the bootstrap samples and feature sub-sampling so the
# forest (and every result derived from `rfc`) is reproducible.
rfc = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)
# Rows of the confusion matrix are the true classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)

f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="white", fmt=".0f", ax=ax, square=True, cmap="GnBu_r")
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.title("Random Forest Classification\n(confusion matrix)")
plt.show()

In [None]:
# 7. ROC, TPR, FPR
from sklearn import metrics

# One-vs-rest ROC for a single class. The score handed to roc_curve must be the
# predicted probability of that same class: predict_proba orders its columns
# like rfc.classes_, so the column is looked up instead of hard-coding 0
# (column 0 belongs to the first class, not to the positive label).
pos_label = 'Iris-virginica'
pos_index = list(rfc.classes_).index(pos_label)

y_proba = rfc.predict_proba(X_test)
pos_scores = y_proba[:, pos_index]
print(y_test, len(y_test))
print(pos_scores, len(pos_scores))

fpr, tpr, thold = metrics.roc_curve(y_test, pos_scores, pos_label=pos_label)
print("FPR:\n", fpr)
print("\nTPR:\n", tpr)

# Comparing Clustering Methods: K-Means vs. Hierarchical

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the customer dataset -- "musteriler" is Turkish for "customers".
# Per the original note the columns are gender, age, net_worth, salary; the
# cells below also drop an `id` column.
df = pd.read_csv('../input/comparingalgosedited/musteriler.csv')
# Preview the first rows.
df.head()

In [None]:
# Clustering input: every column except the identifier and gender.
X = df.drop(columns=['id','gender'])

# The two columns that are visualised throughout the clustering section.
x1 = X['net_worth']
y1 = X['salary']

# Raw scatter of salary against net worth, before any clustering.
plt.figure(figsize=(10, 6))
plt.scatter(x1, y1)
plt.title("Salary vs. Net Worth")
plt.xlabel("Net Worth")
plt.ylabel("Salary")
plt.show()


<a id="7."></a> 
# K-Means Clustering

In [None]:
from sklearn.cluster import KMeans
import matplotlib.colors

# One distinct colour per cluster index; its size also determines how many
# values of k are tried.
colors = {0:'red', 1:'green', 2:'blue', 3:'yellow', 4:'orange', 5:'purple', 6:'teal', 7:'gray', 8:'pink', 9:'brown'}
num_plots = len(colors)

# Inertia of each fitted model, in order of increasing k.
results = []

# Create exactly the grid that is drawn into, two panels per row (a larger
# grid would leave empty axes behind the panels).
n_cols = 2
n_rows = -(-num_plots // n_cols)  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 25), squeeze=False)
fig.subplots_adjust(hspace=0.4, wspace=0.4)

# train num_plots (10) models with k-means++, with number of clusters varying from 1 to 10
for k, panel in zip(range(1, num_plots + 1), ax.ravel()):
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=123)
    kmeans.fit(X)

    # Colour every point by the cluster it was assigned to.
    panel.scatter(x1, y1, c=[colors[label] for label in kmeans.labels_])
    panel.set_xlabel("net worth")
    panel.set_ylabel("salary")
    panel.set_title('k =' + str(k))
    results.append(kmeans.inertia_)

plt.show() # show all the individual subplots

# Elbow plot: inertia as a function of k (x-range follows num_plots).
plt.plot(range(1, num_plots + 1), results, marker="o")
plt.title("Inertia")
plt.xlabel("k")
plt.ylabel("inertia")
plt.show() # show the inertia plot

In [None]:
# Drop `age` from X; the remaining columns are what the next k-means run is fitted on.
centers_df = X.drop(columns=['age'])
# Preview the first rows.
centers_df.head()

In [None]:
# change this to see centers of centroids for any particular k of this k-means clustering example
# (the `colors` mapping used for plotting has 10 entries, so keep k <= 10)
number_of_clusters=6

# random_state matches the sweep above so the clusters are reproducible.
kmeans = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=123)

# fit_predict fits the model and returns each row's cluster index in one step,
# so a separate fit() beforehand would only train the model twice.
clusters = kmeans.fit_predict(centers_df)
df["label"] = clusters

print("Clusters:\n", clusters)

In [None]:
# Coordinates of the fitted centroids, one row per cluster.
print("kmeans cluster centers:\n", kmeans.cluster_centers_)

In [None]:
# Data points coloured by assigned cluster, with the centroids drawn on top.
point_colors = [colors[i] for i in clusters]
centroids = kmeans.cluster_centers_

plt.figure(figsize=(15, 6))
plt.scatter(df.net_worth, df.salary, c=point_colors)
# First two centroid coordinates as x and y of the square markers.
plt.scatter(centroids[:, 0], centroids[:, 1], color="orange", marker='s', s=300)
plt.title("Centroids overlaid for k = " + str(number_of_clusters) + "\n[orange squares]")
plt.show()

<a id="8."></a> 
# Hierarchical Clustering

## Dendrogram



In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Open the (wide) figure first, then build the Ward-linkage merge tree for X
# and draw it with the leaf labels rotated so they stay readable.
plt.figure(figsize=(20, 10))
merge = linkage(X, method="ward")
dendrogram(merge, leaf_rotation=90)

plt.ylabel("euclidean distance")
plt.xlabel("data points")
plt.show()

## Basic Agglomerative Algo (hierarchical clustering)

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward linkage only works with Euclidean distances, which is also the default,
# so the `affinity` argument (deprecated in scikit-learn 1.2 and later removed)
# is dropped rather than renamed; this form works on old and new versions.
ac = AgglomerativeClustering(n_clusters=4, linkage='ward')
Y_prediction = ac.fit_predict(X)

# Points coloured by assigned cluster; note salary is on the x-axis here.
plt.figure(figsize=(12,8))
plt.scatter(X.salary, X.net_worth, s=100, c=[colors[i] for i in Y_prediction])
plt.xlabel("salary")
plt.ylabel("net worth")
plt.title('Hierarchical Clustering')
plt.show()

# Comparing Regression Methods (MLR-PR-SVR-DT-RF)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import r2_score
import statsmodels.api as sm

# Load the 'maaslar2' dataset -- "maaslar" is Turkish for "salaries".
# Per the original note the columns are title, education_level, salary.
df = pd.read_csv('../input/comparingalgosedited/maaslar2.csv')

# Axis captions shared by all regression plots.
regr_xlabel, regr_ylabel = "Education Level", "Salary"

# x: everything except title and salary (education_level, per the note above)
# y: everything except title and education_level (salary, per the note above)
x = df.drop(columns=['title', 'salary'])
y = df.drop(columns=['title', 'education_level'])

# Plain 2-D arrays of the same data for the estimators.
X, Y = x.values, y.values

df.head()

In [None]:
# correlation coefficients
# Restrict to the numeric columns first: the frame also carries the `title`
# column (presumably text), and pandas >= 2.0 raises on non-numeric columns
# instead of silently dropping them. select_dtypes works on old and new pandas.
df.select_dtypes(include='number').corr()

<a id="9."></a> 
# Linear Regression


In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X, Y)

# Statistical summary of the same linear model: regress the OBSERVED target on
# X. Using the model's own predictions as the response would only describe how
# well a line fits a line. statsmodels adds no intercept by default, so one is
# added explicitly to match LinearRegression.
model = sm.OLS(Y, sm.add_constant(X)) # ordinary least squares (OLS)

model.fit().summary()

In [None]:
# R2 (coefficient of determination) of the linear model on the data it was
# fitted to; 1.0 means a perfect fit.
linear_predictions = lin_reg.predict(X)
print("Linear R2 value:")
print(r2_score(Y, linear_predictions))

In [None]:
# Observed points in red, fitted regression line in blue.
linear_fit = lin_reg.predict(X)
plt.scatter(X, Y, color='red')
plt.plot(x, linear_fit, color='blue')
plt.title("Linear Regression")
plt.xlabel(regr_xlabel)
plt.ylabel(regr_ylabel)
plt.show()


<a id="10."></a> 
# Polynomial Regression


In [None]:
from sklearn.preprocessing import PolynomialFeatures

# degree -- the order of the polynomial function (e.g. degree=2 would be f(x) = a + bx + cx^2)
poly_degree = 4

poly_reg = PolynomialFeatures(degree=poly_degree)
# Expanded design matrix (constant column plus the powers of the input).
# It is computed once and reused below instead of re-fitting the transformer.
x_poly = poly_reg.fit_transform(X)

lin_reg2 = LinearRegression()
lin_reg2.fit(x_poly, y)

# OLS of the observed target on the polynomial design matrix. x_poly already
# contains the constant column, so no extra intercept is added.
model2 = sm.OLS(Y, x_poly)

# R2: how well does the function fit the data? (1.0 is a perfect fit)
print("Polynomial R2 value:")
print(r2_score(Y, lin_reg2.predict(x_poly)))

In [None]:
# Observed points in red, fitted polynomial curve in blue.
poly_fit = lin_reg2.predict(poly_reg.fit_transform(X))
plt.scatter(X, Y, color='red')
plt.plot(X, poly_fit, color='blue')
plt.title("Polynomial Regression\n(degree=" + str(poly_degree) + ")")
plt.xlabel(regr_xlabel)
plt.ylabel(regr_ylabel)
plt.show()

<a id="11."></a> 
# Support Vector Regression (SVR) | Scaling


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Features and target are standardized with separate scalers before the SVR fit.
sc1 = StandardScaler()
x_scaled = sc1.fit_transform(X)
sc2 = StandardScaler()
y_scaled = sc2.fit_transform(Y)

svr_reg = SVR(kernel='rbf') # Google 'ML SVR kernel methods' for more info
# SVR expects a 1-D target; passing the (n, 1) column triggers a
# DataConversionWarning. y_scaled itself stays 2-D for the other cells.
svr_reg.fit(x_scaled, y_scaled.ravel()) # train the model

# NOTE(review): this regresses the SVR *predictions* (not the observed target)
# on x_scaled without an intercept, so the summary describes the fitted curve
# rather than the data -- confirm this is what is intended.
model3 = sm.OLS(svr_reg.predict(x_scaled), x_scaled)
model3.fit().summary()

In [None]:
# R2 of the SVR model, measured in the standardized target space; 1.0 means a
# perfect fit.
svr_predictions = svr_reg.predict(x_scaled)
print("SVR R2 value:")
print(r2_score(y_scaled, svr_predictions) )

In [None]:
# Standardized observations in red, SVR fit in blue (both axes are in scaled units).
plt.scatter(x_scaled, y_scaled, color='red')
plt.plot(x_scaled, svr_reg.predict(x_scaled), color='blue')
plt.xlabel(regr_xlabel)
plt.ylabel(regr_ylabel)
# Closing parenthesis was missing from the title.
plt.title("SVR Regression\n(kernel=rbf)")
plt.show()

<a id="12."></a> 
# Decision Tree

* [Random Forest](#13.)


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seeded regression tree fitted on the raw (unscaled) data; fit() returns the
# estimator itself, so construction and training are chained.
r_dt = DecisionTreeRegressor(random_state=0).fit(X, Y)

print("Decision Tree OLS (ordinary least squares)")
# NOTE(review): the OLS below uses the tree's predictions as the response and X
# (without intercept) as the regressor -- confirm this is what is intended.
tree_predictions = r_dt.predict(X)
model4 = sm.OLS(tree_predictions, X)

model4.fit().summary()

In [None]:
# R2 of the decision tree on the data it was fitted to; 1.0 means a perfect fit.
tree_r2 = r2_score(Y, r_dt.predict(X))
print("Decision Tree R2 value:", tree_r2)

In [None]:
# Shifted copies of the inputs, used to show what the tree predicts slightly
# to either side of the training points.
Z = X + 0.5
K = X - 0.4

plt.scatter(X, Y, color='red')
# One curve per input set: the training inputs, then the two shifted copies.
for shifted_inputs, line_color in ((X, 'blue'), (Z, 'green'), (K, 'yellow')):
    plt.plot(x, r_dt.predict(shifted_inputs), color=line_color)
plt.title("Decision Tree (OLS)") # OLS=ordinary least squares
plt.xlabel(regr_xlabel)
plt.ylabel(regr_ylabel)
plt.show()


<a id="13."></a> 
# Random Forest


In [None]:
from sklearn.ensemble import RandomForestRegressor
num_decision_trees=10

rf_reg = RandomForestRegressor(n_estimators=num_decision_trees, random_state=0)
# The forest expects a 1-D target; passing the (n, 1) column triggers a
# DataConversionWarning, so the column is flattened for the fit only.
rf_reg.fit(X, Y.ravel())

print("Random Forest OLS\n(using", num_decision_trees, "decision trees)")
# NOTE(review): the OLS below uses the forest's predictions as the response and
# X (without intercept) as the regressor -- confirm this is what is intended.
model5 = sm.OLS(rf_reg.predict(X), X)

model5.fit().summary()

In [None]:
# Observed points in red; forest predictions at the training inputs (blue) and
# at the shifted inputs Z from the decision-tree plot cell (green).
plt.scatter(X, Y, color='red')
for forest_inputs, line_color in ((X, 'blue'), (Z, 'green')):
    plt.plot(x, rf_reg.predict(forest_inputs), color=line_color)
plt.title("Random Forest OLS\n(using " + str(num_decision_trees) + " decision trees)")
plt.xlabel(regr_xlabel)
plt.ylabel(regr_ylabel)
plt.show()

In [None]:
# Side-by-side R2 scores of all five regressors, each evaluated on the data it
# was fitted to (the SVR score is computed in the standardized space).

print('R2 Values by Algo (\'goodness of fit\', 0-1)\n-------------------------------------------')

# (label, observed target, model prediction) for every algorithm, in print order.
r2_rows = (
    ("Linear R2 value:\t", Y, lin_reg.predict((X))),
    ("Polynomial R2 value:\t", Y, lin_reg2.predict(poly_reg.fit_transform(X))),
    ("SVR R2 value:\t\t", y_scaled, svr_reg.predict(x_scaled)),
    ("Decision Tree R2 value:\t", Y, r_dt.predict(X)),
    ("Random Forest R2 value:\t", Y, rf_reg.predict(X)),
)
for r2_label, observed, predicted in r2_rows:
    print(r2_label, r2_score(observed, predicted))