In [None]:
import seaborn as sns

# Fetch seaborn's "iris" example dataset; it comes back as a pandas DataFrame.
iris = sns.load_dataset(name='iris')

# Report the (rows, columns) shape, then display the last few rows.
print(iris.shape)
iris.tail()

In [None]:
# Preview the first few rows of the iris DataFrame.
iris.head()

In [None]:
# Apply seaborn's default plot theme. `set_theme()` is the preferred
# interface; `set()` is only kept as a legacy alias for it.
sns.set_theme()
# Grid of pairwise plots of the iris columns, coloured by the "species"
# column; `height` sets the size of each facet.
sns.pairplot(iris, hue="species", height=2)

In [None]:
# Build the features matrix.
# scikit-learn accepts either a NumPy ndarray or a pandas DataFrame for it;
# this notebook keeps a DataFrame of shape [n_samples, n_features].
# By convention the features matrix is named X.
X_iris = iris.drop(columns='species')
print(X_iris.shape)
print(type(X_iris))

In [None]:
# Build the label (target) array.
# scikit-learn accepts either a NumPy ndarray or a pandas Series for it;
# this notebook keeps a Series of shape [n_samples,].
# By convention the label array is named y.
y_iris = iris.loc[:, 'species']
print(y_iris.shape)
print(type(y_iris))

![data layout](data_layout.png)


# Scikit-learn Estimator API
1. Every machine learning algorithm in Scikit-learn is implemented via the Estimator API, which provides a consistent interface.
2. Basic steps:
   - Choose a model class, such as LinearRegression
   - Instantiate the model class with desired hyperparameters
   - Prepare the data into a features matrix and a label
   - Fit the model to the data by calling the model's fit() method
   - Apply the model to new data
      - for supervised learning, predict the labels using the predict() method
      - for unsupervised learning, transform or infer properties of the data using the transform() or predict() method

In [None]:
# Simple linear regression: fit a straight line to (x, y) data.
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import default_rng

# Seeded random number generator so the synthetic data is reproducible.
rng = default_rng(seed=42)

# 50 random x-values scaled by 10, and y-values lying on the line
# y = 2x - 1 with standard-normal noise added.
x = 10 * rng.random(size=50)
y = 2 * x - 1 + rng.standard_normal(size=50)

plt.scatter(x, y)

In [None]:
from sklearn.linear_model import LinearRegression

# Instantiate the estimator; fit_intercept=True asks for an intercept term
# to be fitted in addition to the slope.
model = LinearRegression(fit_intercept=True)

In [None]:
# Features matrix: reshape the 1-D x into a single-column 2-D array.
X = x.reshape(-1, 1)
X.shape

In [None]:
# Shape of the target array, for comparison with the features matrix above.
y.shape

In [None]:
# Fit the linear model to the features matrix X and target y.
model.fit(X, y)

In [None]:
# Fitted slope coefficient(s) of the linear model.
model.coef_

In [None]:
# Fitted intercept of the linear model.
model.intercept_

In [None]:
# New inputs for prediction: 50 evenly spaced points from -1 to 11.
xfit = np.linspace(start=-1, stop=11, num=50)
xfit

In [None]:
# Reshape the 1-D xfit into a single-column features matrix for predict().
Xfit = xfit.reshape(-1, 1)
Xfit.shape

In [None]:
# Predict on the new inputs, then draw the fitted line over the raw data.
yfit = model.predict(Xfit)
plt.scatter(x, y);
plt.plot(xfit, yfit)

In [None]:
# Gaussian Naive Bayes
# Generate a synthetic two-cluster data set to classify.
from sklearn.datasets import make_blobs

X, y, centers = make_blobs(
    n_samples=100,
    n_features=2,
    centers=2,
    center_box=(-10, 10),
    shuffle=True,
    random_state=2,
    cluster_std=1.5,
    return_centers=True,  # third return value: the cluster centers
)

In [None]:
# Plot the two feature columns against each other, coloured by label y.
plt.scatter(X[:, 0], X[:, 1], s=50, c=y, cmap="PiYG")

In [None]:
from sklearn.naive_bayes import GaussianNB

# Instantiate a Gaussian Naive Bayes classifier and fit it to the blob data.
model = GaussianNB()
model.fit(X, y)

In [None]:
# Fresh seeded generator for the new, unlabeled points.
rng = default_rng(seed=0)
# 2000 random 2-D points: each column is scaled by [14, 18] and then
# shifted by [-6, -14].
Xnew = [14, 18] * rng.random(size=(2000, 2)) + [-6, -14]
# Label every new point with the fitted classifier.
ynew = model.predict(Xnew)

In [None]:
# Draw the training points first and remember the resulting axis limits ...
plt.scatter(X[:, 0], X[:, 1], s=50, c=y, cmap="PiYG")
lim = plt.axis()
# ... overlay the predicted points semi-transparently, then restore the
# saved limits so the view stays framed on the training data.
plt.scatter(Xnew[:, 0], Xnew[:, 1], s=20, c=ynew, cmap="PiYG", alpha=0.2)
plt.axis(lim)

In [None]:
# Per-class probabilities for each new point, shown to two decimals.
yprob = model.predict_proba(Xnew)
yprob.round(decimals=2)

In [None]:
# Multinomial Naive Bayes
# Text classification example.
from sklearn.datasets import fetch_20newsgroups

# Load the 20 newsgroups corpus and list the available category names.
data = fetch_20newsgroups()
data.target_names

In [None]:
# Restrict the corpus to four categories and load the train/test subsets.
categories = [
    'talk.religion.misc',
    'soc.religion.christian',
    'sci.space',
    'comp.graphics',
]
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)

# Number of training documents, then the last document and its target.
print(len(train.data))
print(train.data[-1])
print(train.target[-1])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Chain a TF-IDF vectorizer and a multinomial Naive Bayes classifier into a
# single estimator, so raw text can be passed straight to fit()/predict().
model = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB(),
)

In [None]:
# Fit the pipeline on the training documents and their targets.
model.fit(train.data, train.target)

In [None]:
# Predict a label for every document in the test subset.
labels = model.predict(test.data)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix between the true test targets and the predicted labels.
mat = confusion_matrix(y_true=test.target, y_pred=labels)

# Plot the transpose so that true labels run along the x-axis and predicted
# labels along the y-axis, matching the axis labels set below.
sns.heatmap(
    mat.T,
    square=True,
    annot=True,
    fmt='d',
    cbar=False,
    xticklabels=train.target_names,
    yticklabels=train.target_names,
)
plt.xlabel('true label')
plt.ylabel('predicted label');
