# Introduction to Machine Learning with Python
## A Guide for Data Scientists
## by Andreas C. Muller & Sarah Guido

In [None]:
import numpy as np

x = np.array([[1, 2, 3], [4, 5, 6]])
print("x: \n{}".format(x))

In [47]:
from scipy import sparse

# creating a 2D NumPy array with a diagonal of ones and zeros everywhere else
eye = np.eye(4)
print("2D NumPy Array: \n{}".format(eye))

2D NumPy Array: 
[[ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]]


In [48]:
# converting the NumPy array to a SciPy matrix in CSR format (only the non-zero entries are stored)
sparse_matrix = sparse.csr_matrix(eye)
print("SciPy sparse CSR matrix: \n{}".format(sparse_matrix))

SciPy sparse CSR matrix: 
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0


In [2]:
# matplotlib

%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

# generating a sequence of numbers from -10 to 10 with 100 steps in between
x = np.linspace(-10, 10, 100)

# creating a second array using sine
y = np.sine(x)

# plot function makes a line chart of one array against another
plt.plot(x, y, marker = "x")


AttributeError: module 'numpy' has no attribute 'sine'

In [54]:
import pandas as pd 

# creating a simple dataset

data = {
    'Name': ['John', 'Ana', 'Peter', 'Linda'],
    'Location': ['New York', 'Paris', 'Berlin', 'London'],
    'Age': [24, 15, 50, 35]
}

data_pandas = pd.DataFrame(data)

# IPython.display allows prettu printing of dataframes (as Table)
display(data_pandas)

Unnamed: 0,Age,Location,Name
0,24,New York,John
1,15,Paris,Ana
2,50,Berlin,Peter
3,35,London,Linda


In [53]:
# Query
# select all rows that have age column greater than 30
display(data_pandas[data_pandas.Age > 30])

Unnamed: 0,Age,Location,Name
2,50,Berlin,Peter
3,35,London,Linda


In [55]:
# checking Versions

import sys
print("Python Version: {}".format(sys.version))

import pandas as pd
print("Pandas Version: {}".format(pd.__version__))

import numpy as np
print("NumPy Version: {}".format(pd.__version__))

import scipy as sp
print("SciPy Version: {}".format(sp.__version__))

import matplotlib
print("Matplotlib Version: {}".format(matplotlib.__version__))

import IPython
print("IPython Version: {}".format(IPython.__version__))

import sklearn
print("SciKit-Learn Version: {}".format(sklearn.__version__))

Python Version: 3.6.3 |Anaconda, Inc.| (default, Oct 15 2017, 03:27:45) [MSC v.1900 64 bit (AMD64)]
Pandas Version: 0.20.3
NumPy Version: 0.20.3
SciPy Version: 0.19.1
Matplotlib Version: 2.1.0
IPython Version: 6.1.0
SciKit-Learn Version: 0.19.1


In [56]:
# A first Application : Classifying Iris Specis

# meet the data
from sklearn.datasets import load_iris
iris_dataset = load_iris()

print("Keys of the iris dataset: \n{}".format(iris_dataset.keys()))

Keys of the iris dataset: 
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])


In [57]:
# short description of the dataset (DESCR)

print(iris_dataset['DESCR'][:193] + "\n...")

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive att
...


In [58]:
# species of the flower that we want to predict
print("Target names: {}".format(iris_dataset['target_names']))

# description of each feature
print("Feature names: {}".format(iris_dataset['feature_names']))

Target names: ['setosa' 'versicolor' 'virginica']
Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [42]:
# Type of variables

print("Type of data: {}".format(type(iris_dataset['data'])))
print("Type of Target: {}".format(type(iris_dataset['target_names'])))

Type of data: <class 'numpy.ndarray'>
Type of Target: <class 'numpy.ndarray'>


In [60]:
# Shape of data

print("Shape of data: {}".format(iris_dataset['data'].shape))

Shape of data: (150, 4)


In [46]:
# Viewing data
print("First five columns of data: \n{}".format(iris_dataset['data'][:5]))

First five columns of data: 
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]


In [62]:
# Shape of target
print("Shape of target: {}".format(iris_dataset['target'].shape))

# Specis are encoded as integers from 0 to 2
print("Target: \n{}".format(iris_dataset['target']))

Shape of target: (150,)
Target: 
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [77]:
# Train and Test data by train_test_split function (75% - 25%)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'], random_state = 0)

# X_train, X_test, y_train, y_test all are NumPy arrays
print(X_train[:5])
print("\n")
print(y_train[:5])

[[ 5.9  3.   4.2  1.5]
 [ 5.8  2.6  4.   1.2]
 [ 6.8  3.   5.5  2.1]
 [ 4.7  3.2  1.3  0.2]
 [ 6.9  3.1  5.1  2.3]]


[1 1 2 0 2]


In [75]:
# Shape of train dataset

print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))

X_train shape: (112, 4)
y_train shape: (112,)


In [76]:
# Shape of test dataset

print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

X_test shape: (38, 4)
y_test shape: (38,)


In [82]:
import mglearn

# creating a dataframe from data in X_train
# labeling the columns using the strings in iris_dataset.feature_names

iris_dataframe = pd.DataFrame(X_train, columns = iris_dataset.feature_names)


# creating a scatter matrix from the dataframe, color by y_train
grr = pd.scatter_matrix(iris_dataframe, c = y_train, figsize = (15, 15), marker = "o", hist_kwds = {'bins': 20},
                       s = 60, alpha = 8, cmap = mglearn.cm3)

ModuleNotFoundError: No module named 'mglearn'

In [80]:
# Building the very firs MODEL : K-Nearest Neighbors

from sklearn.neighbors import KNeighborsClassifier 

knn = KNeighborsClassifier(n_neighbors = 1)

# to build the model on training set, lets call the fit method of the kn object (object creation actually)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [85]:
# Making Predictions

X_new = np.array([[5, 2.9, 1, .2]])
print("X_new shape: {}".format(X_new.shape))

X_new shape: (1, 4)


In [86]:
# calling method of the knn object

prediction = knn.predict(X_new)
print("Prediction: {}".format(prediction))
print("Predicted Target name: {}".format(iris_dataset['target_names'][prediction]))

Prediction: [0]
Predicted Target name: ['setosa']


In [89]:
# another
X_newer = np.array([[4.2, 3.3, 1.7, .5]])

prediction_newer = knn.predict(X_newer)
print("Prediction: {}".format(prediction_newer))
print("Predicted target name: {}".format(iris_dataset['target_names'][prediction_newer]))

Prediction: [0]
Predicted target name: ['setosa']


In [90]:
# Evaluating model

y_pred = knn.predict(X_test)
print("Test set predictions: \n{}".format(y_pred))

Test set predictions: 
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]


In [95]:
# Calculating score
print("Test set score: {:.2f}".format(np.mean(y_pred == y_test)))

# accuracy
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

Test set score: 0.97
Test set score: 0.97


In [98]:
# Summary - All in once snippet

X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'], random_state = 0)

knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(X_train, y_train)

print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

Test set score: 0.97
