In [14]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# One-Hot Encoding

In [5]:
# Toy categorical feature: the four species repeated three times, stored as a
# single-column 2-D array because the encoders below expect (n_samples, n_features).
X = np.tile(['Bream', 'Perch', 'Pike', 'Roach'], 3).reshape(-1, 1)
X

array([['Bream'],
       ['Perch'],
       ['Pike'],
       ['Roach'],
       ['Bream'],
       ['Perch'],
       ['Pike'],
       ['Roach'],
       ['Bream'],
       ['Perch'],
       ['Pike'],
       ['Roach']], dtype='<U5')

In [21]:
# Wrap the species column in a DataFrame whose only column is named 'Fish'.
X1 = pd.DataFrame(X, columns=['Fish'])

In [23]:
# Preview the first three rows of the species DataFrame.
X1.head(3)

Unnamed: 0,Fish
0,Bream
1,Perch
2,Pike


## Forma 1

In [24]:
# One-hot encode the 'Fish' column into one indicator column per species.
# dtype=int pins the indicators to 0/1 integers: without it the column dtype
# depends on the installed pandas version (boolean True/False on pandas >= 2.0),
# which would no longer match the 0/1 table displayed below.
X2 = pd.get_dummies(X1, dtype=int)

In [27]:
# Preview the first rows of the dummy-encoded frame (one column per species).
X2.head()

Unnamed: 0,Fish_Bream,Fish_Perch,Fish_Pike,Fish_Roach
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0


In [44]:
# Target values, one per row of X; each line below covers one
# Bream / Perch / Pike / Roach cycle.
y = np.array([
    100, 200, 300, 800,
    200, 100, 800, 800,
    100, 800, 100, 200,
])

In [34]:
# Fit ordinary least squares on the four dummy columns.
# NOTE(review): every species keeps its own column (no category is dropped) and
# LinearRegression also fits an intercept unless told otherwise, so the columns
# are collinear with the intercept; read coef_ together with intercept_, not alone.
reg=LinearRegression().fit(X2,y)

In [35]:
# One coefficient per dummy column, in the column order of X2
# (Fish_Bream, Fish_Perch, Fish_Pike, Fish_Roach).
reg.coef_

array([-241.66666667,   -8.33333333,   25.        ,  225.        ])

## Forma 2

In [45]:
# Same encoding through scikit-learn: learn the categories and encode X in one call.
# handle_unknown='ignore' is passed so categories unseen during fit do not raise
# at transform time.
enc = OneHotEncoder(handle_unknown='ignore')
XHot = enc.fit_transform(X)
XHot


<12x4 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [47]:
# Refit on the sparse one-hot matrix; this rebinds reg, replacing the model fitted on X2.
reg=LinearRegression().fit(XHot, y)

In [49]:
# Coefficient of determination (R^2), evaluated on the same data used for fitting.
reg.score(XHot,y)

0.2932442464736452

In [52]:
# One row per species, shaped (4, 1) like the array the encoder was fitted on.
X_test = np.array([['Bream'], ['Perch'], ['Pike'], ['Roach']])
# Encode with the already-fitted encoder, then predict one value per species.
X_test_trans = enc.transform(X_test)
reg.predict(X_test_trans)

array([133.33333333, 366.66666667, 400.        , 600.        ])

In [55]:
# Flatten the single-column array to 1-D so it can be compared elementwise with a label.
X.reshape(-1) 

array(['Bream', 'Perch', 'Pike', 'Roach', 'Bream', 'Perch', 'Pike',
       'Roach', 'Bream', 'Perch', 'Pike', 'Roach'], dtype='<U5')

In [59]:
# Split the targets by species: for each label, keep the y values whose row in X
# carries that label (boolean mask over the flattened species column).
bollybream, bollyperch, bollypike, bollyroach = (
    y[X.reshape(-1) == label]
    for label in ('Bream', 'Perch', 'Pike', 'Roach')
)

In [60]:
# Mean target per species, printed on one line separated by spaces. These are the
# same numbers the one-hot regression predicted for each species above.
print(*(grp.mean() for grp in (bollybream, bollyperch, bollypike, bollyroach)))

133.33333333333334 366.6666666666667 400.0 600.0


# Treinar um modelo de regressão linear em n dimensões

In [2]:
import numpy as np

# Four training samples with three features each.
X = np.array([
    [1, 1, 3],
    [1, 2, 1],
    [2, 2, 4],
    [2, 3, 5],
])
# Noise-free target: y = 1 * x_1 + 2 * x_2 + 5 * x_3 + 3
y = X @ np.array([1, 2, 5]) + 3

In [3]:
# Show up to the first five rows (X only has four).
X[:5]

array([[1, 1, 3],
       [1, 2, 1],
       [2, 2, 4],
       [2, 3, 5]])

In [18]:
# Show up to the first five targets (y only has four).
y[:5]

array([21, 13, 29, 36])

In [5]:
# NOTE(review): LinearRegression is also imported in the notebook's first cell.
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares. y was built as an exact linear function of X,
# so the fit is expected to recover the generating weights and offset.
reg = LinearRegression().fit(X, y)

In [6]:
# Learned weights on the first line, learned intercept on the second.
print(reg.coef_, reg.intercept_, sep="\n")

[1. 2. 5.]
3.0000000000000036


In [7]:
# Predict for a single new sample; the input is 2-D (one row, three features),
# hence the double brackets.
reg.predict(np.array([[1, 2, 3]]))

array([23.])

In [8]:
# Three evaluation samples with the same three-feature layout as the training data.
X_test = np.array([[1, 2, 3], [8, 9, 10], [2, 2, 4]])
# Model predictions, one per row of X_test.
y_pred = reg.predict(X_test)
y_pred

array([23., 79., 29.])

In [9]:
# Ground truth for X_test from the same generating formula used for training:
# y = 1 * x_1 + 2 * x_2 + 5 * x_3 + 3
y_true = X_test @ np.array([1, 2, 5]) + 3
y_true

array([23, 79, 29])

In [19]:
import joblib
# Persist the fitted model to a file in the current working directory.
filename = "reg.joblib"
# joblib.dump returns the list of file names it wrote, which is what the cell displays.
joblib.dump(reg, filename)

['reg.joblib']

In [20]:
# Read the model back from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load files that come from a trusted source.
loaded_reg = joblib.load(filename)
loaded_reg

LinearRegression()

## Carregar e reutilizar o modelo Scikit-Learn

In [21]:
# Predict with the reloaded model on the same test samples (rebinds y_pred).
y_pred = loaded_reg.predict(X_test)

In [22]:
# Display the reloaded model's predictions for comparison with the earlier ones.
y_pred

array([23., 79., 29.])

# Como contornar esses limites (dos modelos lineares)? Transformações de variáveis

In [30]:
# The integers 0..5 that are reshaped into a 3x2 feature matrix in the next cell.
np.arange(6)

array([0, 1, 2, 3, 4, 5])

In [24]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Three samples with two features each: rows [0, 1], [2, 3], [4, 5].
X = np.arange(6).reshape(3, 2)
X

array([[0, 1],
       [2, 3],
       [4, 5]])

In [25]:
# One target value per row of X.
y = np.array([100, 200, 300])

In [26]:
# PolynomialFeatures builds a new feature matrix containing every polynomial
# combination of the input features with degree less than or equal to the
# requested degree. For a two-feature sample [a, b], degree 2 yields
# [1, a, b, a^2, ab, b^2].
poly = PolynomialFeatures(2)
poly.fit_transform(X)
# In other words, every output row has the layout:
#     [1, a, b, a^2, ab, b^2]

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [27]:
# Keep the degree-2 expanded feature matrix for model fitting.
# NOTE(review): the name is misspelled ("tranformed"); it is left unchanged here
# because the regression cell that follows refers to it by this exact name.
X_tranformed = poly.fit_transform(X)

In [28]:
# NOTE(review): LinearRegression is also imported in the notebook's first cell.
from sklearn.linear_model import LinearRegression
# Fit a linear model on the polynomial features. With only 3 samples and 6
# feature columns the system is underdetermined, so the coefficients are not
# uniquely determined by the data.
reg = LinearRegression().fit(X_tranformed, y)

In [29]:
# Learned weights for the six polynomial columns first, then the intercept.
print(reg.coef_, reg.intercept_, sep="\n")

[ 7.10542736e-15  1.25000000e+01  1.25000000e+01 -1.25000000e+01
 -3.01980663e-14  1.25000000e+01]
74.99999999999993


# Regressão não linear - Random Forest

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
# Generate a random regression problem with 4 features, 2 of them informative.
# random_state=0 fixes the seed so the generated data is the same on every run.
X, y = make_regression(n_features=4, n_informative=2,
                       random_state=0, shuffle=False)
# Preview the first three samples.
X[:3]

array([[ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ],
       [ 1.86755799, -0.97727788,  0.95008842, -0.15135721],
       [-0.10321885,  0.4105985 ,  0.14404357,  1.45427351]])

In [34]:
# Preview the first three targets.
y[:3]

array([49.82290745,  4.87728597, 11.91487464])

In [35]:
# Model training + prediction
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X, y)

RandomForestRegressor(max_depth=2, random_state=0)

In [36]:
# Predict for a single sample with four features (2-D input: one row).
print(regr.predict([[1, 2, 3, 4]]))

[52.79050925]


In [37]:
# Predict for two samples at once; one prediction is returned per input row.
print(regr.predict([[1, 2, 3, 4], [6, 2, 3, 5]]))

[52.79050925 61.7155606 ]
