<a href="https://colab.research.google.com/github/ralsouza/machine_learning_python/blob/master/notebooks/03_challenge_pca_pipeline_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Predictive classification model to predict the binary variable value (true or false) from numeric data.

In [142]:
# Import modules
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

# 2. Data pre-processing

## 2.1 Train Data

In [105]:
# Train data
n_train = 10
np.random.seed(0)
df_train = pd.DataFrame({"var1": np.random.random(n_train), \
                         "var2": np.random.random(n_train), \
                         "var3": np.random.random(n_train), \
                         "var4": np.random.randint(0,2,n_train).astype(bool),\
                         "target": np.random.randint(0,2,n_train).astype(bool)})

In [106]:
# Shape
df_train.shape

(10, 5)

In [107]:
# Head
df_train.head()

Unnamed: 0,var1,var2,var3,var4,target
0,0.548814,0.791725,0.978618,True,True
1,0.715189,0.528895,0.799159,False,True
2,0.602763,0.568045,0.461479,True,False
3,0.544883,0.925597,0.780529,False,False
4,0.423655,0.071036,0.118274,True,False


### 2.1.2 Separating train data in input and output components

In [108]:
# Predict variables
x_train = df_train.iloc[:,0:4]

In [109]:
# Target variable
y_train = df_train.iloc[:,4]

## 2.2 Test Data

In [110]:
# Test data
n_test = 3
np.random.seed(1)
df_test = pd.DataFrame({"var1": np.random.random(n_test), \
                        "var2": np.random.random(n_test), \
                        "var3": np.random.random(n_test), \
                        "var4": np.random.randint(0,2,n_test).astype(bool),\
                        "target": np.random.randint(0,2,n_test).astype(bool)})


In [111]:
# Shape
df_test.shape

(3, 5)

In [112]:
# Head
df_test.head()

Unnamed: 0,var1,var2,var3,var4,target
0,0.417022,0.302333,0.18626,False,True
1,0.720324,0.146756,0.345561,False,False
2,0.000114,0.092339,0.396767,False,False


### 2.2.2 Separating test data in input and output components

In [113]:
# Predict variables
x_test = df_test.iloc[:,0:4]

In [114]:
# Target variable
y_test = df_test.iloc[:,4]

# 3. Create a PCA model with dimensionality reduction with 3 components

In [115]:
# Setting PCA
pca = PCA(n_components = 3)

# 3.1 Apply the PCA to datasets

In [116]:
# Reduce train
pca_x_train = pca.fit_transform(x_train)

# Reduce test
pca_x_test = pca.fit_transform(x_test)

In [117]:
# Show pca x train values
pca_x_train

array([[ 0.41587301, -0.71062799, -0.11371247],
       [-0.33697959,  0.03003896, -0.17654096],
       [ 0.63605658, -0.29090443,  0.05028171],
       [-0.43846812, -0.20849886,  0.15980423],
       [ 0.94815762,  0.2527858 ,  0.03898095],
       [-0.12006504,  0.42440996, -0.31974987],
       [ 0.0903311 ,  0.76011682,  0.04769318],
       [-0.52214167, -0.27678453, -0.15929367],
       [-0.39311411, -0.02468934,  0.04231957],
       [-0.27964979,  0.04415361,  0.43021735]])

In [118]:
# Show pca x test values
pca_x_test

array([[-0.06592466,  0.16471905,  0.        ],
       [-0.32837335, -0.10489891,  0.        ],
       [ 0.39429801, -0.05982013,  0.        ]])

# 4. Create dataframes do arrays created by PCA

In [119]:
df_pca_x_train = pd.DataFrame(pca_x_train)
df_pca_x_test  = pd.DataFrame(pca_x_test)

In [120]:
# Check values
df_pca_x_train.head()

Unnamed: 0,0,1,2
0,0.415873,-0.710628,-0.113712
1,-0.33698,0.030039,-0.176541
2,0.636057,-0.290904,0.050282
3,-0.438468,-0.208499,0.159804
4,0.948158,0.252786,0.038981


In [121]:
# Check values
df_pca_x_test.head()

Unnamed: 0,0,1,2
0,-0.065925,0.164719,0.0
1,-0.328373,-0.104899,0.0
2,0.394298,-0.05982,0.0


# 4. Create a Logistic Regression model

## 4.1 Create model v1 - manually

In [135]:
# Make the model v1
model_v1 = LogisticRegression()

### 4.1.1 Check performance

In [140]:
# Cross validation
result_v1 = cross_val_score(model_v1,pca_x_train,y_train)

In [141]:
# Print result
print('Accuracy: %.3f' % (result_v1.mean() * 100 ))

Accuracy: 50.000


## 4.2 Create model v2 - manually

In [125]:
# Make the model v2
model_v2 = LogisticRegression()

### 4.2.1 Check Performance

In [128]:
model_v2.fit(pca_x_train,y_train)
model_v2.score(pca_x_train,y_train)

0.8

In [129]:
# Prediction
predict_values = model_v2.predict(pca_x_test)
predict_values

array([ True,  True, False])

## 4.3 Create model v3 - manually

In [130]:
clf = LogisticRegression().fit(pca_x_train, y_train)

In [158]:
clf.predict(pca_x_test)

array([ True,  True, False])

### 4.3.1 Check Performance

In [134]:
clf.score(pca_x_test, y_test)

0.6666666666666666

# 5. Using the pipeline resource from scikit-learn to chain 2 algorithms in the same model, concat the result of PCA and Logistic Regression.
[6.1. Pipelines and composite estimators](https://scikit-learn.org/stable/modules/compose.html#pipeline)


In [146]:
# See also: make_pipeline
pipe = Pipeline(steps=[('reduce_dim',PCA()),('log_reg',LogisticRegression())])

In [147]:
print(pipe)

Pipeline(memory=None,
         steps=[('reduce_dim',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('log_reg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)


In [150]:
pipe.fit(pca_x_train,y_train)

Pipeline(memory=None,
         steps=[('reduce_dim',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('log_reg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

# 6. Make predictions with the trained model

In [154]:
predictions = pipe.predict(pca_x_test)

# 7. Print outcomes and performance model

In [155]:
print('predictions:')
print(predictions)

predictions:
[ True  True False]


In [153]:
# Performance
pipe.score(pca_x_test,y_test)

0.6666666666666666