<a href="https://colab.research.google.com/github/rmonterof/Scikit-learn/blob/main/1_0_Scikit_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scikit Learn

In [None]:
# Core imports: scikit-learn itself and pandas (used below to view datasets as tables).
import sklearn
import pandas as pd
# Display the installed scikit-learn version so readers can tell which release
# produced the recorded outputs in this notebook.
sklearn.__version__

'1.6.1'

## Datasets

### load

In [None]:
from sklearn import datasets
# Load the Iris dataset through the `load_*` family of dataset loaders.
iris = datasets.load_iris()
# Inspect what kind of object the loader returned (a `Bunch` container in the
# recorded output, not a DataFrame).
print(type(iris))

<class 'sklearn.utils._bunch.Bunch'>


In [None]:
# DESCR holds the dataset's human-readable description (attributes, summary statistics, provenance).
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

In [None]:
# Wrap the feature matrix in a DataFrame, using the dataset's own feature names as column labels.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Preview the first rows.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Add the integer-encoded labels as a 'class' column. This modifies `df` in place,
# so the frame now carries features and target together.
df['class'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
# Human-readable class names corresponding to the integer labels stored in `iris.target`.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [None]:
# (rows, columns) of the frame: the feature columns plus the added 'class' column.
df.shape

(150, 5)

### fetch

In [None]:
# Load the California Housing dataset through the `fetch_*` family of loaders.
# NOTE: per the scikit-learn docs, fetch_* loaders may download the data on first
# call, so this cell can require network access on a fresh machine.
housing = datasets.fetch_california_housing()
# Print the dataset's description text.
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [None]:
# IPython wildcard search (`pattern?`): lists the names in `datasets` that match `load_*`.
# This is IPython-only syntax, so it works in a notebook/IPython shell but not in a plain .py script.
datasets.load_*?

datasets.load_breast_cancer
datasets.load_diabetes
datasets.load_digits
datasets.load_files
datasets.load_iris
datasets.load_linnerud
datasets.load_sample_image
datasets.load_sample_images
datasets.load_svmlight_file
datasets.load_svmlight_files
datasets.load_wine

In [None]:
# IPython wildcard search (`pattern?`): lists the names in `datasets` that match `fetch_*`.
# This is IPython-only syntax, so it works in a notebook/IPython shell but not in a plain .py script.
datasets.fetch_*?

datasets.fetch_20newsgroups
datasets.fetch_20newsgroups_vectorized
datasets.fetch_california_housing
datasets.fetch_covtype
datasets.fetch_kddcup99
datasets.fetch_lfw_pairs
datasets.fetch_lfw_people
datasets.fetch_olivetti_faces
datasets.fetch_openml
datasets.fetch_rcv1
datasets.fetch_species_distributions

In [None]:
# Fetch OpenML dataset 31 (the "credit-g" German credit data) as a pandas
# DataFrame of features `X` and a Series of labels `y`.
# Browse other datasets at https://www.openml.org/search?type=data
X, y = datasets.fetch_openml(data_id=31, as_frame=True, return_X_y=True, parser='auto')

# Count columns per dtype family with select_dtypes. Comparing dtypes against
# 'float' only matches float64 and reported 0 numerical features for this
# dataset, because its 7 non-categorical columns are not float64 (presumably
# integer-typed). include='number' covers every integer and float dtype.
n_categorical_features = X.select_dtypes(include='category').shape[1]
n_numerical_features = X.select_dtypes(include='number').shape[1]
print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of categorical features: {n_categorical_features}")
print(f"Number of numerical features: {n_numerical_features}")

Number of samples: 1000
Number of features: 20
Number of categorical features: 13
Number of numerical features: 0


## Estimadores

Choosing the right estimator (scikit-learn's algorithm cheat-sheet): https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [None]:
from sklearn.utils import all_estimators

# Collect every estimator scikit-learn exposes and report how many there are.
estimators = all_estimators()
print(len(estimators))

207


In [None]:
# Each entry is a (name, class) tuple; print them one per line.
for estimator in estimators:
    print(estimator)

('ARDRegression', <class 'sklearn.linear_model._bayes.ARDRegression'>)
('AdaBoostClassifier', <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>)
('AdaBoostRegressor', <class 'sklearn.ensemble._weight_boosting.AdaBoostRegressor'>)
('AdditiveChi2Sampler', <class 'sklearn.kernel_approximation.AdditiveChi2Sampler'>)
('AffinityPropagation', <class 'sklearn.cluster._affinity_propagation.AffinityPropagation'>)
('AgglomerativeClustering', <class 'sklearn.cluster._agglomerative.AgglomerativeClustering'>)
('BaggingClassifier', <class 'sklearn.ensemble._bagging.BaggingClassifier'>)
('BaggingRegressor', <class 'sklearn.ensemble._bagging.BaggingRegressor'>)
('BayesianGaussianMixture', <class 'sklearn.mixture._bayesian_mixture.BayesianGaussianMixture'>)
('BayesianRidge', <class 'sklearn.linear_model._bayes.BayesianRidge'>)
('BernoulliNB', <class 'sklearn.naive_bayes.BernoulliNB'>)
('BernoulliRBM', <class 'sklearn.neural_network._rbm.BernoulliRBM'>)
('Binarizer', <class 'sklearn.preproce

In [None]:
# Restrict the listing to regressors via `type_filter`, then print each (name, class) tuple.
regressors = all_estimators(type_filter="regressor")
for regressor in regressors:
    print(regressor)

('ARDRegression', <class 'sklearn.linear_model._bayes.ARDRegression'>)
('AdaBoostRegressor', <class 'sklearn.ensemble._weight_boosting.AdaBoostRegressor'>)
('BaggingRegressor', <class 'sklearn.ensemble._bagging.BaggingRegressor'>)
('BayesianRidge', <class 'sklearn.linear_model._bayes.BayesianRidge'>)
('CCA', <class 'sklearn.cross_decomposition._pls.CCA'>)
('DecisionTreeRegressor', <class 'sklearn.tree._classes.DecisionTreeRegressor'>)
('DummyRegressor', <class 'sklearn.dummy.DummyRegressor'>)
('ElasticNet', <class 'sklearn.linear_model._coordinate_descent.ElasticNet'>)
('ElasticNetCV', <class 'sklearn.linear_model._coordinate_descent.ElasticNetCV'>)
('ExtraTreeRegressor', <class 'sklearn.tree._classes.ExtraTreeRegressor'>)
('ExtraTreesRegressor', <class 'sklearn.ensemble._forest.ExtraTreesRegressor'>)
('GammaRegressor', <class 'sklearn.linear_model._glm.glm.GammaRegressor'>)
('GaussianProcessRegressor', <class 'sklearn.gaussian_process._gpr.GaussianProcessRegressor'>)
('GradientBoostin