## Sklearn
* Scikit Learn
* https://scikit-learn.org/stable/
* built on Numpy, Scipy, Matplotlib 
* works with pandas
* workflow might be
    * pandas/numpy to load and manipulate data
    * sklearn to build and validate a model
    * matplotlib to visualize results
    * pandas/numpy to save results
* started by a Google intern in 2007 and opensource for anyone to use
* also has build in metric calculations, feature extraction and transformation tools

#### models follow a pretty similar syntax

#### very well documented code base, easy to follow and understand as well as extract information from a trained model

## Logistic Regression
* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [3]:
df = pd.read_csv("../data/iris.csv")

In [4]:
df.head(1)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa


#### we will make a binary classificaiton, where 1 is the Setosa class and 0 is all other classes

In [6]:
df["classification"] = np.where(df["variety"] == "Setosa",1, 0)

In [7]:
df.sample(5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,classification
128,6.4,2.8,5.6,2.1,Virginica,0
77,6.7,3.0,5.0,1.7,Versicolor,0
2,4.7,3.2,1.3,0.2,Setosa,1
58,6.6,2.9,4.6,1.3,Versicolor,0
136,6.3,3.4,5.6,2.4,Virginica,0


#### split out our features and our target

In [16]:
x = df.drop(["variety", "classification"], axis = 1)

In [17]:
y = df["classification"]

In [18]:
print(x)
print(y)

     sepal.length  sepal.width  petal.length  petal.width
0             5.1          3.5           1.4          0.2
1             4.9          3.0           1.4          0.2
2             4.7          3.2           1.3          0.2
3             4.6          3.1           1.5          0.2
4             5.0          3.6           1.4          0.2
..            ...          ...           ...          ...
145           6.7          3.0           5.2          2.3
146           6.3          2.5           5.0          1.9
147           6.5          3.0           5.2          2.0
148           6.2          3.4           5.4          2.3
149           5.9          3.0           5.1          1.8

[150 rows x 4 columns]
0      1
1      1
2      1
3      1
4      1
      ..
145    0
146    0
147    0
148    0
149    0
Name: classification, Length: 150, dtype: int32


#### initialize our model

In [15]:
reg = LogisticRegression()
rafsum = np.sum

print(reg)
print(rafsum)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
<function sum at 0x000002A41897D828>


#### fit or train our model using the fit() method and passing in our x and y

In [20]:
reg.fit(x,y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

#### make predictions using our model, using the predict() method and passing in some feature data
* note, feature data must be the same size/schema, we can't make a model on 5 features and pass in 10

In [21]:
yhat = reg.predict(x)

In [22]:
yhat

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [23]:
np.array(y)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

#### we can score our model using the accuracy score from sklearn

In [24]:
accuracy_score(y, yhat)

1.0

#### now we want to gather our coefficients, perhaps for interpretation.  we do so by accessing to coef_ attribuet from out model object

In [25]:
reg.coef_

array([[ 0.41021713,  1.46416217, -2.26003266, -1.02103509]])

#### ordering stays consistent, so we can zip the columns from out dataframe that was passed into the model and our coefficients
* note ethe coef_ is a nested array, so we have to get the values from the first element

In [20]:
for i in zip(x.columns, reg.coef_[0]):
    print(i)

('sepal.length', 0.41021712519841536)
('sepal.width', 1.4641621652467662)
('petal.length', -2.260032661311069)
('petal.width', -1.0210350909174157)


#### we also need to get our intercept, as this is a regression problem

In [26]:
reg.intercept_

array([0.26421853])

#### Logistic Equation
.247 + .41(sepal.length) + 1.46(sepal.width) + -2.26(petal.length) + -1.02(petal.width)

In [32]:
x.head(150)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


#### custom prediction function

In [28]:
def custom_predict(sepal_length, sepal_width, petal_length, petal_width):
    return .247 + .41*(sepal_length) + 1.46*(sepal_width) + -2.26*(petal_length) + -1.02*(petal_width)

In [33]:
pred = custom_predict(5.9, 3, 5.1, 1.8)
pred

-6.315999999999999

In [30]:
pred = custom_predict(4.9, 3.5, 1.4, .2)
pred

3.9979999999999998

In [31]:
y.head(2)

0    1
1    1
Name: classification, dtype: int32

#### let's us numpy to make this cleaener

In [44]:
reg.intercept_[0] + np.sum(np.multiply(reg.coef_[0], np.array(x.iloc[2:])))

-433.2630461266474

In [38]:
np.array(x.head(1))
here = os.getcwd()

array([[5.1, 3.5, 1.4, 0.2]])

In [45]:
def pred(features, coefs, intercept):
    return intercept + np.sum(np.multiply(features, coefs))

In [47]:
p = pred(np.array(x.iloc[149,:]), reg.coef_[0], reg.intercept_[0])
p

-6.287043672009523

#### predict probability
* Positive class probabilities are computed as
* 1 / (1 + np.exp(-self.decision_function(X))) where decision function is
* .247 + .371*(sepal_length) + 1.409*(sepal_width) + -2.152*(petal_length) + -.954*(petal_width)
* we are just applying the sigmoid function to our decision function

In [49]:
reg.predict_proba(x.head(2))

array([[0.01610102, 0.98389898],
       [0.03562213, 0.96437787]])

#### we can get the decision function using some matrix multiplication then summing across the axis and adding the intercept back in

In [48]:
# we can get the decision function using some matrix multiplication
# then summing across the axis
x.head()*reg.coef_

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,2.092107,5.124568,-3.164046,-0.204207
1,2.010064,4.392486,-3.164046,-0.204207
2,1.92802,4.685319,-2.938042,-0.204207
3,1.886999,4.538903,-3.390049,-0.204207
4,2.051086,5.270984,-3.164046,-0.204207


In [49]:
(x.head()*reg.coef_).sum(1)

0    3.848422
1    3.034298
2    3.471090
3    2.831645
4    3.953817
dtype: float64

In [54]:
import cv2

In [50]:
(x*reg.coef_).sum(1)+reg.intercept_

0      4.112641
1      3.298516
2      3.735308
3      3.095864
4      4.218035
         ...   
145   -6.695391
146   -6.731138
147   -6.471124
148   -6.766841
149   -6.287044
Length: 150, dtype: float64

In [51]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [52]:
# Positive class probabilities are computed as
sigmoid(-4.801632)

0.008149369173106819

In [53]:
1 - sigmoid(-4.801632)

0.9918506308268932

In [56]:
preds = pd.DataFrame(sigmoid((x*reg.coef_).sum(axis = 1) + reg.intercept_), columns = ["positive"])
preds["negative"] = 1 - preds["positive"]

In [57]:
preds.head(10)

Unnamed: 0,positive,negative
0,0.983899,0.016101
1,0.964378,0.035622
2,0.97669,0.02331
3,0.956722,0.043278
4,0.985486,0.014514
5,0.980907,0.019093
6,0.974889,0.025111
7,0.975854,0.024146
8,0.950121,0.049879
9,0.965143,0.034857


In [58]:
reg.predict_proba(x.head(10))

array([[0.01610102, 0.98389898],
       [0.03562213, 0.96437787],
       [0.02330951, 0.97669049],
       [0.04327818, 0.95672182],
       [0.0145138 , 0.9854862 ],
       [0.01909304, 0.98090696],
       [0.02511113, 0.97488887],
       [0.02414588, 0.97585412],
       [0.0498789 , 0.9501211 ],
       [0.03485665, 0.96514335]])

## Normalize
* remove magnitude of our features
* center data
* standard scaler
* z score norm
* min-max

In [116]:
df = pd.read_csv("../data/iris.csv")
df.head(1)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa


In [121]:
a = df.groupby('variety').first()
a.median()

sepal.length    6.3
sepal.width     3.3
petal.length    4.7
petal.width     1.4
dtype: float64

#### Standard Scaler
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [58]:
from sklearn.preprocessing import StandardScaler

In [62]:
x = df.drop("variety", 1)

In [63]:
x.head(1)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2


In [64]:
scaler = StandardScaler()
scaler.fit(x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [66]:
# note the transform returns a numpy array
x_scaler = scaler.transform(x)
x_scale_df = pd.DataFrame(x_scaler, columns = x.columns)
x_scale_df.describe()
path = '.'

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,-2.775558e-16,-9.695948e-16,-8.652338e-16,-4.662937e-16
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


In [67]:
x_scale_df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


#### Min Max Scaler
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [68]:
from sklearn.preprocessing import MinMaxScaler

In [70]:
minmax = MinMaxScaler()
minmax.fit(x)

x_minmax = minmax.transform(x)
x_minmax = pd.DataFrame(x_minmax, columns = x.columns)
x_minmax.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


* https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing

#### center data
* https://scikit-learn.org/stable/modules/preprocessing.html

In [71]:
from sklearn.preprocessing import scale

In [72]:
scaler = scale(x)
x_scale = pd.DataFrame(scaler, columns = x.columns)
x_scale.mean(axis=0)

sepal.length   -2.775558e-16
sepal.width    -9.695948e-16
petal.length   -8.652338e-16
petal.width    -4.662937e-16
dtype: float64

## countvectorizer()
* https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [97]:
from sklearn.feature_extraction.text import CountVectorizer

In [98]:
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [99]:
vectorizer = CountVectorizer()

In [100]:
x = vectorizer.fit_transform(corpus)
x

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [101]:
x

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [102]:
x.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [103]:
df = pd.DataFrame(x.toarray(), columns = vectorizer.get_feature_names())

In [104]:
df

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0,1,1,1,0,0,1,0,1
1,0,2,0,1,0,1,1,0,1
2,1,0,0,1,1,0,1,1,1
3,0,1,1,1,0,0,1,0,1


In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [107]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)
df = pd.DataFrame(x.toarray(), columns = vectorizer.get_feature_names())
df

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


## Featuree Extraction
* https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text

In [108]:
from sklearn.feature_selection import VarianceThreshold

In [109]:
x = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
df = pd.DataFrame(x)
df

Unnamed: 0,0,1,2,3
0,0,2,0,3
1,0,1,4,3
2,0,1,1,3


In [110]:
df.describe()

Unnamed: 0,0,1,2,3
count,3.0,3.0,3.0,3.0
mean,0.0,1.333333,1.666667,3.0
std,0.0,0.57735,2.081666,0.0
min,0.0,1.0,0.0,3.0
25%,0.0,1.0,0.5,3.0
50%,0.0,1.0,1.0,3.0
75%,0.0,1.5,2.5,3.0
max,0.0,2.0,4.0,3.0


In [111]:
selector = VarianceThreshold()
tst = selector.fit_transform(df)
tst

array([[2, 0],
       [1, 4],
       [1, 1]], dtype=int64)

In [112]:
selector.variances_

array([0.        , 0.22222222, 2.88888889, 0.        ])

In [113]:
df.columns[selector.get_support()]

Int64Index([1, 2], dtype='int64')