# Machine learning Algorithms using Scikit-Learn

## 1. Regression Data Preparation

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.datasets import *

In [43]:
# Load the boston housing prices dataset.
boston = (load_boston())

In [44]:
# 'boston' is a bunch object (similar to a dictionary). The important attributes are 'data' and 'target'.
# By convention, X always denotes the features, and y always denotes the label / target.
X, y = boston['data'], boston['target']

In [45]:
# It is always a good practice to see the shapes of the features and the target(s).
print (X.shape, y.shape)

(506, 13) (506,)


In [46]:
# Split the data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.20)

print ("Train Set : ", X_train.shape, y_train.shape)
print ("Test Set : ", X_test.shape, y_test.shape)

Train Set :  (404, 13) (404,)
Test Set :  (102, 13) (102,)


In [47]:
# It is always a good practice to scale the data. (Mean - Var Normalization)

# Create a standard scaler object.
scaler = StandardScaler()

# Fit the data and then transform the data. 
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

## 2. Regression based algorithms

### 2.1 Linear Regression

In [30]:
# Import the linear regression model.
from sklearn.linear_model import LinearRegression

In [32]:
# Create an object.
linReg = LinearRegression()

# Print all the parameters of the model.
print (linReg)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)


In [50]:
# Fit the training data to the linear regressor.
linReg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [52]:
# Now, that the model is trained, you can test it on the test data.
modelPreds = linReg.predict(X_test)

In [57]:
# This prints the coefficient of each feature in the data.
linReg.coef_

array([-0.98973009,  1.22058631,  0.1388396 ,  0.77299199, -1.98922958,
        2.4984222 ,  0.21123651, -3.04510629,  3.0791412 , -2.39697268,
       -1.81355479,  0.81500982, -4.18583126])

In [55]:
"""
    To see how good the model is, we use a variety of metrics. R2 score is a commonly used metric for regression 
    use-cases. You'll come across it in module 2, and it'll be explained in class. It is between 0 - 1. The closer
    the score is to 1, the better model is.
"""
print (r2_score(y_test, modelPreds))

0.7129214672932727


### 2.2 Decision Tree Regressor

In [59]:
# Import the model.
from sklearn.tree import DecisionTreeRegressor

In [60]:
# Create an object.
decTreeReg = DecisionTreeRegressor()

# Print all the parameters of the model.
print (decTreeReg)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')


In [61]:
# Fit the training data to the model.
decTreeReg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [62]:
# Now, that the model is trained, you can test it on the test data.
modelPreds = decTreeReg.predict(X_test)

### 2.3 Ridge Regression

In [64]:
# Import the model.
from sklearn.linear_model import Ridge

In [65]:
# Create an object.
ridgeReg = Ridge()

# Print all the parameters of the model.
print (ridgeReg)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)


In [66]:
# Fit the training data to the model.
ridgeReg.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [67]:
# Now, that the model is trained, you can test it on the test data.
modelPreds = ridgeReg.predict(X_test)

In [68]:
# Understand the performance of the model.
print (r2_score(y_test, modelPreds))

0.7140469374903826


### 2.4 Lasso Regressor

In [70]:
# Import the model.
from sklearn.linear_model import Lasso

In [71]:
# Create an object.
lassoReg = Lasso()

# Print all the parameters of the model.
print (lassoReg)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)


In [72]:
# Fit the training data to the model.
lassoReg.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [73]:
# Now, that the model is trained, you can test it on the test data.
modelPreds = lassoReg.predict(X_test)

### 2.4 Support Vector Regressor

In [77]:
# Import the model.
from sklearn.svm import SVR

In [78]:
# Create an object.
svr = SVR()

# Print all the parameters of the model.
print (svr)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)


In [79]:
# Fit the training data to the model.
svr.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [80]:
# Now, that the model is trained, you can test it on the test data.
modelPreds = svr.predict(X_test)

In [81]:
# Understand the performance of the model.
print (r2_score(y_test, modelPreds))

0.6843618941586522


## 3. Classification Data Preparation

In [150]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [151]:
data = load_iris()

In [152]:
X, y = data['data'], data['target']

In [153]:
# Train test split - 75:25 rule.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, shuffle = True)

In [154]:
# Scale the train and test data. (Mean - Var normalization)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [155]:
print ("Train Set : ", X_train.shape, y_train.shape)
print ("Test Set : ", X_test.shape, y_test.shape)

Train Set :  (112, 4) (112,)
Test Set :  (38, 4) (38,)


## 4. Classification Based Algorithms

### 4.1 Logistic Regression

In [156]:
# Import the linear regression model.
from sklearn.linear_model import LogisticRegression

In [157]:
# Create an object.
logReg = LogisticRegression()

# Print all the parameters of the model.
print (logReg)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [158]:
# Fit the training data to the linear regressor.
logReg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [159]:
# Now, that the model is trained, you can test it on the test data.
modelPreds = logReg.predict(X_test)

In [160]:
# This prints the coefficient of each feature in the data.
# When the number of classes (C) is greater than 1, then the shape is [no. of classes, no. of features]
logReg.coef_

array([[-0.80694127,  1.30322683, -1.61610659, -1.43562319],
       [ 0.13105422, -1.22884593,  0.7674263 , -0.70455729],
       [ 0.15175132,  0.01109905,  1.54181019,  2.200317  ]])

In [161]:
# Mean accuracy score on the test data. In reality, a 100% accuracy is never achieved!!
logReg.score(X_test, y_test) 

0.9473684210526315

### 4.2 Naive Bayes

In [168]:
# Import the linear regression model.
from sklearn.naive_bayes import GaussianNB

In [169]:
# Create an object.
naiveBayes = GaussianNB()

# Print all the parameters of the model.
print (naiveBayes)

GaussianNB(priors=None, var_smoothing=1e-09)


In [170]:
# Fit the training data to the linear regressor.
naiveBayes.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [171]:
# Now, that the model is trained, you can test it on the test data.
modelPreds = naiveBayes.predict(X_test)

In [172]:
# Mean accuracy score on the test data. In reality, a 100% accuracy is never achieved!!
np.sum((modelPreds == y_test)) / (y_test.shape[0])

0.9473684210526315

### 2.2 Decision Tree Classifier

In [173]:
# Import the model.
from sklearn.tree import DecisionTreeClassifier

In [179]:
# Create an object.
# Intentionally setting the depth of the tree to be 2. (no reason! just did it.)
decTreeCla = DecisionTreeClassifier(max_depth = 2)

# Print all the parameters of the model.
print (decTreeCla)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


In [180]:
# Fit the training data to the model.
decTreeCla.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [181]:
# Now, that the model is trained, you can test it on the test data.
modelPreds = decTreeCla.predict(X_test)

In [182]:
# Mean accuracy score on the test data. In reality, a 100% accuracy is never achieved!!
np.sum((modelPreds == y_test)) / (y_test.shape[0])

0.9210526315789473

### 2.4 Support Vector Classifier

In [183]:
# Import the model.
from sklearn.svm import SVC

In [184]:
# Create an object.
svc = SVC()

# Print all the parameters of the model.
print (svc)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [185]:
# Fit the training data to the model.
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [186]:
# Now, that the model is trained, you can test it on the test data.
modelPreds = svc.predict(X_test)

In [187]:
# Mean accuracy score on the test data. In reality, a 100% accuracy is never achieved!!
np.sum((modelPreds == y_test)) / (y_test.shape[0])

0.9736842105263158