# Scikit-Learn - Getting started

In [48]:
# Fitting and predicting: estimator basics

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint


In [49]:
# Create an unfitted random forest classifier; the fixed seed keeps runs reproducible.
clf = RandomForestClassifier(random_state=0)

In [50]:
# Toy training set: two samples (rows) with three features (columns) each.
X = [
    [1, 2, 3],
    [11, 12, 13],
]

# One class label per sample, in the same order as the rows of X.
y = [0, 1]

# Train the classifier; fit() returns the estimator, which the notebook echoes.
clf.fit(X, y)

RandomForestClassifier(random_state=0)

In [51]:
# Sanity check: predict the classes of the same samples the model was trained on.
clf.predict(X)


array([0, 1])

In [52]:
# Classify two samples that were not part of the training data.
new_samples = [
    [4, 6, 7],
    [75, 56, 32],
]
clf.predict(new_samples)

array([0, 1])

## Transformers and preprocessors

In [53]:
# Two samples with two features each (note: this rebinds X from the cells above).
X = [
    [11, 15],
    [12, 53],
]

# Learn the per-column scaling statistics and apply them in a single step;
# the transformed array is the cell's displayed output.
StandardScaler().fit_transform(X)

array([[-1., -1.],
       [ 1.,  1.]])

## Pipelines: chaining preprocessors and estimators

In [54]:
# Chain feature scaling and a logistic-regression classifier into a single
# estimator, so both steps are fitted and applied together.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)


In [55]:
# Load the Iris features and labels as arrays (this rebinds X and y), then
# hold out a test set. The seed makes the split reproducible.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [56]:
# Fit the whole pipeline (scaler, then classifier) on the training split only;
# fit() returns the pipeline, which the notebook echoes.
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [57]:
# Use the fitted pipeline like any other estimator: predict on the held-out
# split and score against the true labels. accuracy_score's signature is
# (y_true, y_pred), so the ground truth goes first. Accuracy happens to be
# symmetric, but keeping the documented order avoids wrong results if this
# line is adapted to an asymmetric metric (e.g. precision or recall).
accuracy_score(y_test, pipe.predict(X_test))

0.9736842105263158

## Model Evaluation

In [58]:
# Plain least-squares model to be evaluated with cross-validation.
lr = LinearRegression()

# Synthetic regression problem with 1000 samples (this rebinds X and y);
# seeded so the generated data is the same on every run.
X, y = make_regression(n_samples=1000, random_state=0)

In [59]:
# Cross-validate the model with cross_validate's default settings; the
# returned dict holds per-fold timings and scores.
result = cross_validate(lr, X, y)

# Show the score obtained on each held-out fold.
result["test_score"]

array([1., 1., 1., 1., 1.])

## Automatic parameter searches

In [60]:
# Load the California housing dataset as a (features, target) pair.
# Note: this rebinds X and y, replacing the synthetic regression data above.
X, y = fetch_california_housing(return_X_y=True)

In [61]:
# Hold out a test set from the housing data; the seed makes the split
# reproducible. This rebinds the train/test variables from the Iris example.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)


In [62]:
# Search space sampled by the randomized search. scipy's randint(low, high)
# draws integers from the half-open range [low, high), so the upper bounds
# below are exclusive.
param_distributions = dict(
    n_estimators=randint(1, 5),  # number of trees: 1, 2, 3 or 4
    max_depth=randint(5, 10),    # maximum tree depth: 5 through 9
)


In [64]:
# Build the randomized search: it draws n_iter parameter settings from
# param_distributions and evaluates a seeded random forest regressor on each.
# Both the estimator and the search are seeded so results are reproducible.
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_distributions,
    n_iter=5,
    random_state=0,
)

In [65]:
# Run the search on the training split only, so the test split stays unseen
# for the final evaluation. fit() returns the search object, which the
# notebook echoes.
search.fit(X_train, y_train)

RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000011721A64D60>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000011773478A00>},
                   random_state=0)

In [66]:
# Parameter setting that achieved the best cross-validated score in the search.
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [67]:
# The fitted search object behaves like an estimator: score it on the
# held-out test split, which was not used during the search.
search.score(X_test, y_test)

0.735363411343253