In [1]:
############################### Machine Learning ####################################
# Machine learning involves building mathematical models to help understand data.
# Once these models have been fit to previously seen data, they can be used to predict and understand aspects of newly observed data.
# Machine learning is commonly categorized into two main types: supervised learning and unsupervised learning.
# In a supervised learning model, the algorithm learns on a labeled dataset, which provides an answer key that the algorithm can use to evaluate its accuracy on training data.
# An unsupervised model, in contrast, is given unlabeled data that the algorithm tries to make sense of by extracting features and patterns on its own.

In [6]:
# Required modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# scikit-learn : the python library for machine learning tasks
import sklearn 

In [7]:
############################### Data Representation in Scikit-learn  #####################################
# Load the 'iris' example dataset via seaborn and preview its first five rows
iris = sns.load_dataset(name='iris')
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
################ Feature Matrix and Target Array #############################
# in machine learning we typically work with two sets of data: train and test
########## The Feature matrix (X) holds the independent variables which we use for training
# it is assumed to be two-dimensional, with shape [n_samples, n_features]
# The samples (i.e., rows) always refer to the individual objects described by the dataset
# The features (i.e., columns) always refer to the distinct observations that describe each sample in a quantitative manner. 
########## The Target Array (y) is the dependent variable which we want to predict from the data

In [9]:
# Back to iris dataset
# we may wish to construct a model that can predict the species of flower based on the other measurements
# in this case, the species column would be considered the target array, while the other columns, along with their observation values, make up the Feature matrix

In [12]:
# Build the features matrix (X): every column of iris except 'species'.
# 'species' is the target we want to predict (the dependent variable), so it is left out of X.
X_iris = iris.drop(columns='species')
X_iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [13]:
# Extract the target array (y): the 'species' column of iris
y_iris = iris.loc[:, 'species']
y_iris.head(n=5)

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: species, dtype: object

In [14]:
############################## Basic Steps with scikit-learn ###############################
# (1) Choose a class of model
# (2) Choose model hyperparameters by instantiating this class with desired values
# (3) Arrange data into a features matrix and target vector
# (4) Fit the model to your data by calling the fit() method
# (5) Apply the model to new data:
## For supervised learning, often we predict labels for unknown data using the predict() method.
## For unsupervised learning, we often transform or infer properties of the data using the transform() or predict() method