# Basic Usage of Scikit-Learn

https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Scikit_Learn_Cheat_Sheet_Python.pdf

**`Scikit-learn`** is an open source Python library that implements a range of machine learning, preprocessing, cross-validation and visualization algorithms using a unified interface.

## A Basic Example

In [1]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
iris = datasets.load_iris()
# Read each field by attribute: indexing iris.values() relies on the
# container's key order and is not possible on Python 3 dict views.
print(iris.target_names)  # names of the response classes (target)
print(iris.data[0:10, :])  # first ten rows of the data (features)
print(iris.target)  # encoded response variable
# print(iris.DESCR)  # Data description
# print(iris.feature_names)  # Feature description

['setosa' 'versicolor' 'virginica']
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1.4  0.3]
 [ 5.   3.4  1.5  0.2]
 [ 4.4  2.9  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [3]:
# Keep only the first two feature columns of the iris data.
X, y = iris.data[:, :2], iris.target
# A single formatted string prints the same text on Python 2 and Python 3.
print("{} {}".format(X.shape, y.shape))
# random_state can be considered the random seed. It can also be left unset.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30)
print("{} {} {} {}".format(
    X_train.shape, X_test.shape, y_train.shape, y_test.shape))

(150, 2) (150,)
(112, 2) (38, 2) (112,) (38,)


In [4]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the test split.
scalar = preprocessing.StandardScaler()
scalar.fit(X_train)
X_train, X_test = scalar.transform(X_train), scalar.transform(X_test)

In [5]:
# Train a 5-nearest-neighbours classifier on the training split.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

# Instead of returning the classification result, we can also return the
# probability of each output class.
y_pred_prob = knn.predict_proba(X_test)

0.736842105263


## Loading The Data

The data needs to be numeric and stored as Numpy arrays or Scipy sparse matrices. Other types that are convertible to numeric arrays, such as Pandas DataFrame, are also acceptable.

In [6]:
import numpy as np

# Random feature matrix: 10 rows by 5 columns.
X = np.random.random((10, 5))
print(X)

# or
# X_1 = np.random.randn(10, 5)
# print(X_1)

y = np.array(['M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'F'])

# Zero out, in place, every entry that is below 0.7.
X[X < 0.7] = 0
print(X)

[[ 0.17134802  0.22462157  0.44050492  0.68326507  0.8390625 ]
 [ 0.09041675  0.78553353  0.85010276  0.68786752  0.31374997]
 [ 0.97757077  0.00476906  0.06231877  0.69363695  0.69922874]
 [ 0.17047891  0.26793686  0.28450069  0.57112648  0.57598971]
 [ 0.21950791  0.06817814  0.35457108  0.52152105  0.60964858]
 [ 0.4693126   0.2515859   0.8000239   0.86039339  0.73899714]
 [ 0.01635587  0.2425772   0.76009782  0.7865069   0.11531259]
 [ 0.30902788  0.42191159  0.12330329  0.66787937  0.89097667]
 [ 0.910608    0.83239223  0.97092464  0.70809331  0.84331188]
 [ 0.7157078   0.57836015  0.37578307  0.60287134  0.88371438]]
[[ 0.          0.          0.          0.          0.8390625 ]
 [ 0.          0.78553353  0.85010276  0.          0.        ]
 [ 0.97757077  0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
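 [ 0.          0.          0.8000239   0.86039339  0.73899714]
 [ 0.          0.          0.76009782  0.7865069   0.        ]
 [ 0.          0.          0.          0.          0.89097667]
 [ 0.910608    0.83239223  0.97092464  0.70809331  0.84331188]
 [ 0.7157078   0.          0.          0.          0.88371438]]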

## Splitting Training & Test Data

In [7]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[ 0.7157078   0.          0.          0.          0.88371438]
 [ 0.          0.78553353  0.85010276  0.          0.        ]
 [ 0.          0.          0.76009782  0.7865069   0.        ]
 [ 0.          0.          0.          0.          0.89097667]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.8390625 ]
 [ 0.          0.          0.8000239   0.86039339  0.73899714]]
[[ 0.97757077  0.          0.          0.          0.        ]
 [ 0.910608    0.83239223  0.97092464  0.70809331  0.84331188]
 [ 0.          0.          0.          0.          0.        ]]
['F' 'M' 'M' 'M' 'F' 'M' 'F']
['F' 'F' 'M']


## Preprocessing the Data

### Standardization

In [8]:
from sklearn.preprocessing import StandardScaler

# Fit on the training split only; the fitted scaler is reused for the test split.
scalar = StandardScaler().fit(X_train)
standardized_X = scalar.transform(X_train)
standardized_X_test = scalar.transform(X_test)

# Show the data before and after the transformation.
print(X_train)
print(standardized_X)

[[ 0.7157078   0.          0.          0.          0.88371438]
 [ 0.          0.78553353  0.85010276  0.          0.        ]
 [ 0.          0.          0.76009782  0.7865069   0.        ]
 [ 0.          0.          0.          0.          0.89097667]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.8390625 ]
 [ 0.          0.          0.8000239   0.86039339  0.73899714]]
[[ 2.44948974 -0.40824829 -0.86443799 -0.63156632  0.96988195]
 [-0.40824829  2.44948974  1.2698146  -0.63156632 -1.14771814]
 [-0.40824829 -0.40824829  1.04384983  1.47974477 -1.14771814]
 [-0.40824829 -0.40824829 -0.86443799 -0.63156632  0.98728421]
 [-0.40824829 -0.40824829 -0.86443799 -0.63156632 -1.14771814]
 [-0.40824829 -0.40824829 -0.86443799 -0.63156632  0.8628849 ]
 [-0.40824829 -0.40824829  1.14408752  1.67808681  0.62310334]]


### Normalization

In [9]:
from sklearn.preprocessing import Normalizer

# Fit on the training split, then transform both splits with the same object.
scalar = Normalizer().fit(X_train)
normalized_X = scalar.transform(X_train)
normalized_X_test = scalar.transform(X_test)

# Show the data before and after the transformation.
print(X_train)
print(normalized_X)

[[ 0.7157078   0.          0.          0.          0.88371438]
 [ 0.          0.78553353  0.85010276  0.          0.        ]
 [ 0.          0.          0.76009782  0.7865069   0.        ]
 [ 0.          0.          0.          0.          0.89097667]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.8390625 ]
 [ 0.          0.          0.8000239   0.86039339  0.73899714]]
[[ 0.62936819  0.          0.          0.          0.77710725]
 [ 0.          0.67866406  0.73444884  0.          0.        ]
 [ 0.          0.          0.69493126  0.71907617  0.        ]
 [ 0.          0.          0.          0.          1.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          1.        ]
 [ 0.          0.          0.57640288  0.61989802  0.53243419]]


### Binarization

In [10]:
from sklearn.preprocessing import Binarizer

# Binarize the training features using a threshold of 0.0.
binarizer = Binarizer(threshold=0.0).fit(X_train)
binary_X = binarizer.transform(X_train)
print(X_train)
print(binary_X)

[[ 0.7157078   0.          0.          0.          0.88371438]
 [ 0.          0.78553353  0.85010276  0.          0.        ]
 [ 0.          0.          0.76009782  0.7865069   0.        ]
 [ 0.          0.          0.          0.          0.89097667]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.8390625 ]
 [ 0.          0.          0.8000239   0.86039339  0.73899714]]
[[ 1.  0.  0.  0.  1.]
 [ 0.  1.  1.  0.  0.]
 [ 0.  0.  1.  1.  0.]
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  1.  1.  1.]]


### Binarize Target Variables (one-hot encoding)

In [11]:
from sklearn.preprocessing import label_binarize

# Small illustrative label list; `classes` fixes the column order of the result.
temp_test_y = ["a", "a", "b", "c", "a", "b"]
temp_test_y_binarized = label_binarize(temp_test_y, classes=['a', 'b', 'c'])
print(temp_test_y)
print(temp_test_y_binarized)

['a', 'a', 'b', 'c', 'a', 'b']
[[1 0 0]
 [1 0 0]
 [0 1 0]
 [0 0 1]
 [1 0 0]
 [0 1 0]]


### Encoding Categorical Features

In [12]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
print(y)
# Rebind y to the integer-encoded version of the string labels.
y = enc.fit_transform(y)
print(y)

['M' 'M' 'F' 'F' 'M' 'F' 'M' 'M' 'F' 'F']
[1 1 0 0 1 0 1 1 0 0]


### Imputing Missing Values

In [13]:
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22 in favour of
# sklearn.impute.SimpleImputer (which always works column-wise, i.e. the old
# axis=0). Prefer the new class and fall back to the old one when it is absent.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values=0, strategy="mean", axis=0)
else:
    imp = SimpleImputer(missing_values=0, strategy="mean")

# Zeros are treated as missing and replaced by the mean of their column.
print(X_train)
print(imp.fit_transform(X_train))

[[ 0.7157078   0.          0.          0.          0.88371438]
 [ 0.          0.78553353  0.85010276  0.          0.        ]
 [ 0.          0.          0.76009782  0.7865069   0.        ]
 [ 0.          0.          0.          0.          0.89097667]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.8390625 ]
 [ 0.          0.          0.8000239   0.86039339  0.73899714]]
[[ 0.7157078   0.78553353  0.80340816  0.82345014  0.88371438]
 [ 0.7157078   0.78553353  0.85010276  0.82345014  0.83818767]
 [ 0.7157078   0.78553353  0.76009782  0.7865069   0.83818767]
 [ 0.7157078   0.78553353  0.80340816  0.82345014  0.89097667]
 [ 0.7157078   0.78553353  0.80340816  0.82345014  0.83818767]
 [ 0.7157078   0.78553353  0.80340816  0.82345014  0.8390625 ]
 [ 0.7157078   0.78553353  0.8000239   0.86039339  0.73899714]]


### Generating Polynomial Features

In [14]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(5)  # the positional argument is the polynomial degree

# Show the data before and after the feature expansion.
print(X)
print(poly.fit_transform(X))

[[ 0.          0.          0.          0.          0.8390625 ]
 [ 0.          0.78553353  0.85010276  0.          0.        ]
 [ 0.97757077  0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.8000239   0.86039339  0.73899714]
 [ 0.          0.          0.76009782  0.7865069   0.        ]
 [ 0.          0.          0.          0.          0.89097667]
 [ 0.910608    0.83239223  0.97092464  0.70809331  0.84331188]
 [ 0.7157078   0.          0.          0.          0.88371438]]
[[ 1.          0.          0.         ...,  0.          0.          0.41588337]
 [ 1.          0.          0.78553353 ...,  0.          0.          0.        ]
 [ 1.          0.97757077  0.         ...,  0.          0.          0.        ]
 ..., 
 [ 1.          0.          0.         ...,  0.          0.          0.5614766 ]
 [ 1.          0.910608    0.83239223 ...,

## Creating Models

### Supervised Learning Estimators

#### Linear Regression

In [15]:
from sklearn.linear_model import LinearRegression

# The `normalize` keyword was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises TypeError. Fall back to the default
# constructor on those versions so the cell still runs.
try:
    lr = LinearRegression(normalize=True)
except TypeError:
    lr = LinearRegression()

#### Support Vector Machines (SVM)

In [16]:
from sklearn.svm import SVC

# Support vector classifier configured with a linear kernel (not fitted yet).
svc = SVC(kernel="linear")

#### Naive Bayes

In [17]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes estimator with default settings (not fitted yet).
gnb = GaussianNB()

#### KNN

In [18]:
from sklearn import neighbors

# k-nearest-neighbours classifier using 3 neighbours (not fitted yet).
knn = neighbors.KNeighborsClassifier(n_neighbors=3)

### Unsupervised Learning Estimators

#### Principal Component Analysis (PCA)

In [19]:
from sklearn.decomposition import PCA

# PCA estimator created with n_components=0.95 (not fitted yet).
pca = PCA(n_components=0.95)

#### K Means

In [20]:
from sklearn.cluster import KMeans

# K-means estimator with 3 clusters and a fixed seed (not fitted yet).
k_means = KMeans(n_clusters=3, random_state=0)

## Model Fitting

### Supervised Learning

In [21]:
# Fit each model to its data and print what fit() returns.
# NOTE(review): lr is fitted on the full X / y, whereas knn and svc are
# fitted on the training split only -- confirm this is intended.
print(lr.fit(X, y))
print(knn.fit(X_train, y_train))
print(svc.fit(X_train, y_train))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


### Unsupervised Learning

In [22]:
# K-means: fit the model to the training data.
k_means.fit(X=X_train)
# PCA: fit to the training data, then keep the transformed data
# (pca_model holds the result of fit_transform).
pca_model = pca.fit_transform(X=X_train)

## Prediction

### Supervised Estimators

In [23]:
# svc predicts on two freshly generated random rows; lr and knn predict
# on the held-out test split.
print(svc.predict(np.random.random((2, 5))))
print(lr.predict(X_test))
print(knn.predict_proba(X_test))

['M' 'M']
[-0.0784857  -0.02429168  0.74195058]
[[ 0.66666667  0.33333333]
 [ 0.66666667  0.33333333]
 [ 0.33333333  0.66666667]]


### Unsupervised Estimators

In [24]:
# Predicted cluster index for each test row.
print(k_means.predict(X_test))

[1 0 1]


## Evaluate Your Model's Performance

### Classification Metrics

#### Accuracy Score

In [25]:
# Score the fitted knn classifier on the held-out test split.
knn.score(X=X_test, y=y_test)

1.0

In [26]:
from sklearn.metrics import accuracy_score

print(y_test)
y_pred = knn.predict(X_test)
# Compare the knn predictions with the true test labels.
accuracy_score(y_test, y_pred)

['F' 'F' 'M']


1.0

#### Classification Report

Report ***precision***, ***recall***, ***f1-score***, and ***support***.

In [27]:
from sklearn.metrics import classification_report

# Text report for the current y_test / y_pred.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          F       1.00      1.00      1.00         2
          M       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00         3



#### Confusion Matrix

In [28]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the current y_test / y_pred.
print(confusion_matrix(y_test, y_pred))

[[2 0]
 [0 1]]


### Regression Metrics

#### Mean Absolute Error

In [29]:
from sklearn.metrics import mean_absolute_error

y_pred = lr.predict(X_test)
y_true = [3, -0.5, 2]  # hand-written target values used for the metric demos
# A single formatted string prints the same text on Python 2 and Python 3.
print("{} {}".format(y_true, y_pred))
mean_absolute_error(y_true, y_pred)

[3, -0.5, 2] [-0.0784857  -0.02429168  0.74195058]


1.6040811487050055

#### Mean Squared Error

In [30]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the current y_true and y_pred.
mean_squared_error(y_true=y_true, y_pred=y_pred)

3.7620203220190249

#### R^2 Score

In [31]:
from sklearn.metrics import r2_score

# R^2 score between the current y_true and y_pred.
r2_score(y_true=y_true, y_pred=y_pred)

-0.73631707170108851

### Clustering Metrics

#### Adjusted Rand Index

In [32]:
from sklearn.metrics import adjusted_rand_score

# NOTE(review): y_true / y_pred are still the regression values from the
# Mean Absolute Error cell, not cluster labels -- confirm this is intended.
adjusted_rand_score(labels_true=y_true, labels_pred=y_pred)

1.0

#### Homogeneity

In [33]:
from sklearn.metrics import homogeneity_score

# NOTE(review): y_true / y_pred are still the regression values from the
# Mean Absolute Error cell, not cluster labels -- confirm this is intended.
homogeneity_score(labels_true=y_true, labels_pred=y_pred)

1.0

#### V-measure

In [34]:
from sklearn.metrics import v_measure_score

# NOTE(review): y_true / y_pred are still the regression values from the
# Mean Absolute Error cell, not cluster labels -- confirm this is intended.
v_measure_score(labels_true=y_true, labels_pred=y_pred)

1.0

### Cross-Validation

In [35]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of knn on the training split; prints one score per fold.
print(cross_val_score(knn, X_train, y_train, cv=3))

[ 0.  0.  0.]


## Tune Your Model

### Grid Search

In [36]:
from sklearn.model_selection import GridSearchCV

# Candidate values: n_neighbors from np.arange(1, 3), crossed with two metrics.
params = {
    "n_neighbors": np.arange(1, 3),
    "metric": ["euclidean", "cityblock"],
}
grid = GridSearchCV(estimator=knn, param_grid=params)
grid.fit(X_train, y_train)

# Report the best score, then the winning n_neighbors and estimator.
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)
print(grid.best_estimator_)

0.285714285714
1
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


### Randomized Parameter Optimization

In [37]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the search: four neighbour counts, two weightings.
params = {
    "n_neighbors": range(1, 5),
    "weights": ["uniform", "distance"],
}
rsearch = RandomizedSearchCV(
    estimator=knn,
    param_distributions=params,
    cv=4,
    n_iter=8,
    random_state=5,
)
rsearch.fit(X_train, y_train)

# Report the best score and the estimator that achieved it.
print(rsearch.best_score_)
print(rsearch.best_estimator_)

0.571428571429
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='distance')


