<h3>Logistic Regression</h3>

In [None]:
# Logistic Regression
# Logistic regression is a type of regression that predicts a probability of an outcome
# given one or more independent variables.

# This in turn can be used for classification (predicting categories)
# Logistic regression is trained on an output variable that is discrete (a binary 1 or 0) or a 
# categorical number (which is a whole number)

# Regression methods have become an integral component of any data analysis concerned with
# describing the relationship between a response variable and one or more explanatory variables.

# The goal of an analysis using this model is the same as that of any regression model used
# in statistics, that is to find the best fitting and most parsimonious, clinically interpretable
# model to describe the relationship between the outcome (dependent or response) variable 
# and a set of independent (predictor or explanatory) variables.

# What distinguishes a logistic regression model from the linear model is that the
# outcome variable in logistic regression is binary or dichotomous.

<h3>Performing a logistic regression</h3>

In [None]:
# Performing a logistic regression
# The logistic function is an S-shaped curve (also known as a sigmoid curve)
# that, for a given set of input variables, produces an output variable between 0 and 1.
# Because the output variable is between 0 and 1 it can be used to represent a probability.

# The (b0 + b1x) component in the logistic function is known as a log-odds function.
# Like linear regression, we can also extend the logistic regression to more
# than one input variable (x1, x2, ..., xn)

In [None]:
# A logistic function
import math
def predict_probability(x, b0, b1):
    """Return the logistic (sigmoid) probability for a single input ``x``.

    Evaluates ``1 / (1 + exp(-(b0 + b1*x)))`` in a numerically stable way.

    Parameters:
        x: input value.
        b0: intercept of the log-odds line.
        b1: slope of the log-odds line.

    Returns:
        float between 0.0 and 1.0.
    """
    log_odds = b0 + b1 * x
    if log_odds >= 0:
        # exp of a non-positive number cannot overflow.
        return 1.0 / (1.0 + math.exp(-log_odds))
    # For negative log-odds, exp(-log_odds) can exceed the float range and
    # raise OverflowError, so use the equivalent form exp(z) / (1 + exp(z)).
    odds = math.exp(log_odds)
    return odds / (1.0 + odds)

# Linear regression
# f(x) = b_0 + b_1*x

In [None]:
# Assume b0 = -2.823 and b1 = 0.62
# Plot the resulting logistic curve symbolically with SymPy.
# Only the names used in this cell are imported; a wildcard import would
# pull every public SymPy name into the notebook namespace.
from sympy import exp, plot, symbols

b0, b1, x = symbols('b0 b1 x')
p = 1.0/(1.0 + exp(-(b0 + b1*x)))

# Substitute both coefficients at once, leaving x as the only free symbol.
p = p.subs({b0: -2.823, b1: 0.620})

print(p)
plot(p);

In [None]:
# Fitting the logistic curve
# The data can have any mix of decimal, integer and binary variables,
# but the output variable must be binary (0 or 1)

In [None]:
# When we do a prediction, the output variable values will be between 0 and 1, resembling a probability.

In [None]:
# Fit a logistic regression with scikit-learn
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Load the data
df = pd.read_csv('https://bit.ly/33ebs2R', delimiter=',')

# Extract input variables (all rows, all columns but last column)
X = df.values[:, :-1]

# Extract output column (all rows, last column)
Y = df.values[:, -1]

# Perform logistic regression with the penalty (regularisation) turned off,
# so the coefficients are the plain fit to the data.
# TODO: explore the other LogisticRegression arguments.
model = LogisticRegression(
    penalty=None
)

# Fit the model to the inputs and the output column
model.fit(X, Y)

# Print the fitted slope coefficient(s) (beta1)
print(model.coef_.flatten())

# Print the fitted intercept (beta0)
print(model.intercept_.flatten())

In [None]:
# Flattening means collapsing a multi-dimensional array of numbers into a one-dimensional one.
# Here coef_ is stored as a 2-D array with a single row, so flatten() turns it into a plain 1-D array.

<h3>Maximum Likelihood and Gradient Descent</h3>

In [None]:
# MAXIMUM LIKELIHOOD AND GRADIENT DESCENT
# Maximum Likelihood Estimation (MLE) maximizes the likelihood that a given
# logistic curve would output the observed data. Essentially, the
# idea is to find the b0 and b1 coefficients that bring our logistic curve
# to those points as closely as possible,
# indicating it is the most likely to have produced those points.

<h4>Joint Likelihood</h4>

In [None]:
# Joint likelihood
# Calculating the joint likelihood of observing all the points for a given
# logistic regression

# This example uses a patient dataset that contains data on the chemical
# exposure duration and the symptomatic status of a person.

import math
import pandas as pd

# Materialise the rows as a list: itertuples() on its own returns a one-shot
# iterator, which is empty for any later cell that loops over patient_data again.
patient_data = list(pd.read_csv('https://bit.ly/33ebs2R', delimiter=',').itertuples())

# Fitted coefficients. b0 previously read -3.1756395 here, which disagreed with
# the value -3.17576395 used for the same fit in the later cells of this notebook.
b0 = -3.17576395
b1 = 0.69267212

def logistic_function(x):
    """Return the fitted probability 1 / (1 + exp(-(b0 + b1*x))) for input x."""
    p = 1.0 / (1.0 + math.exp(-(b0 + b1*x)))
    return p

# Calculate the joint likelihood: the product, over all patients, of the
# probability the curve assigns to the outcome that was actually observed.
joint_likelihood = 1.0

for patient in patient_data:
    if patient.y == 1.0:
        # Probability of y = 1 is the value on the logistic curve
        joint_likelihood *= logistic_function(patient.x)

    elif patient.y == 0.0:
        # Probability of y = 0 is the complement of the curve value
        joint_likelihood *= (1.0 - logistic_function(patient.x))

print(joint_likelihood)

In [None]:
# modified version
# Since a point is either y=0 or y=1, exponents select the relevant factor:
# p**y * (1 - p)**(1 - y) equals p when y = 1 and (1 - p) when y = 0.

# Reload the rows as a list and restart the product at 1.0 so this cell is
# self-contained: looping over an already-consumed itertuples() iterator does
# nothing, and continuing from a previous cell's product would multiply twice.
patient_data = list(pd.read_csv('https://bit.ly/33ebs2R', delimiter=',').itertuples())
joint_likelihood = 1.0
for p in patient_data:
    joint_likelihood *= (logistic_function(p.x) ** p.y) * ((1.0 - logistic_function(p.x))**(1.0-p.y))
print(joint_likelihood)

In [None]:
# Using logarithmic addition
# Calculate the joint likelihood by summing log-probabilities and
# exponentiating the total, instead of multiplying many small numbers.

# This used to print 1.0 (= exp(0)) because patient_data was an itertuples()
# iterator that an earlier loop had already consumed, so the sum stayed at 0.
# Reloading the rows as a list makes the loop see every patient.
import numpy as np
patient_data = list(pd.read_csv('https://bit.ly/33ebs2R', delimiter=',').itertuples())

log_joint_likelihood = 0.0
for p in patient_data:
    # log(p**y * (1 - p)**(1 - y)) = y*log(p) + (1 - y)*log(1 - p)
    log_joint_likelihood += math.log(logistic_function(p.x))*(p.y) + \
                            math.log(1.0 - logistic_function(p.x))*(1.0 - p.y)

# Convert the summed log-likelihood back to a likelihood
joint_likelihood = math.exp(log_joint_likelihood)
print(joint_likelihood)

In [None]:
# Using gradient descent on logistic regression
from sympy import *
import pandas as pd

points = list(pd.read_csv('https://tinyurl.com/y2cocoo7').itertuples())

# The intercept symbol is named 'b0' (it was misspelled 'bo'), so rendered
# expressions and derivatives use the same name as the Python variable.
b1, b0, i, n = symbols('b1 b0 i n')
# x(i) and y(i) are symbolic functions standing for the i-th data point.
x, y = symbols('x y', cls=Function)

# Log-odds of the i-th point
z = (b0 + b1*x(i))

# Sum over i = 0..n of log(p_i**y_i * (1 - p_i)**(1 - y_i)),
# where p_i = 1 / (1 + exp(-z)).
joint_likelihood = Sum(
    log((1.0 / (1.0 + exp(-z)))**y(i)
        * (1.0 - (1.0 / (1.0 + exp(-z)))) ** (1 - y(i))),
    (i, 0, n)
)

joint_likelihood

# Reference implementation of the remaining steps (left disabled):
#
# # Partial derivative for b1, with points substituted
# d_b1 = diff(joint_likelihood, b1) \
#         .subs(n, len(points) - 1).doit() \
#         .replace(x, lambda i: points[i].x) \
#         .replace(y, lambda i: points[i].y)

# # Partial derivative for b0, with points substituted
# d_b0 = diff(joint_likelihood, b0) \
#         .subs(n, len(points) - 1).doit() \
#         .replace(x, lambda i: points[i].x) \
#         .replace(y, lambda i: points[i].y)

# # Compile using lambdify for faster computation
# d_b1 = lambdify([b1, b0], d_b1)
# d_b0 = lambdify([b1, b0], d_b0)

# # Perform gradient descent
# b1 = 0.01
# b0 = 0.01
# L = 0.01

# for i in range(10_000):
#     b1 += d_b1(b1, b0) * L
#     b0 += d_b0(b1, b0) * L

# print(b1, b0)

In [None]:
# Symbolic partial derivative of the summed log-likelihood with respect to
# the intercept b0 (data points are not substituted here).
d_b0 = diff(joint_likelihood, b0)
d_b0

In [None]:
# Symbolic partial derivative of the summed log-likelihood with respect to
# the slope b1 (data points are not substituted here).
d_b1 = diff(joint_likelihood, b1)
d_b1

In [None]:
# d_b0 = diff(joint_likelihood, b0) 
# d_b0

<h3>Multivariable Logistic Regression</h3>

In [None]:
# Multivariable Logistic Regression
# Example: Predict employee quit
import pandas as pd
from sklearn.linear_model import LogisticRegression
employee_data = pd.read_csv('https://tinyurl.com/y6r7qjrp')

# grab independent variable columns (every column except the last)
inputs = employee_data.iloc[:, :-1]

# grab dependent 'did_quit' variable column (the last column)
output = employee_data.iloc[:, -1]

# Build logistic regression with the penalty (regularisation) turned off.
# NOTE: `fit` here is the fitted model object, not a method.
fit = LogisticRegression(
    penalty=None
).fit(inputs,
      output)

# print coefficients (one per input column) and the intercept
print('Coefficients: {0}'.format(fit.coef_.flatten()))
print('Intercept: {0}'.format(fit.intercept_.flatten()))

# Interact and test new employee data
def predict_employee_will_stay(sex, age, promotions, years_employed):
    """Predict whether one employee will leave or stay, using the fitted model ``fit``.

    Parameters:
        sex, age, promotions, years_employed: feature values for a single
            employee, passed to the model in this order.

    Returns:
        str: 'WILL LEAVE: ...' when the predicted class is 1, otherwise
        'WILL STAY: ...', each followed by the output of ``predict_proba``
        for this employee.
    """
    # A single-row 2-D input, built once and shared by both model calls.
    employee = [[sex, age, promotions, years_employed]]
    prediction = fit.predict(employee)
    probabilities = fit.predict_proba(employee)

    # predict() returns one label per input row; compare that single label
    # directly instead of relying on broadcasting against a nested list.
    if prediction[0] == 1:
        return 'WILL LEAVE: {0}'.format(probabilities)
    else:
        return 'WILL STAY: {0}'.format(probabilities)

# Test a prediction
# while True:
    # n = input('Predict employee will stay or leave {sex}, {age}, {promotions}, {years_employed}: ')
    # (sex, age, promotions, years_employed) = n.split(', ')
    # print(predict_employee_will_stay(int(sex), int(age), int(promotions), int(years_employed)
    #                                 ))

<h3>Understanding log-odds</h3>

In [None]:
# Understanding log-odds
# Explore more on this.

<h3>R-Squared Value</h3>

In [None]:
# R-Squared value for logistic Regression

In [None]:
# Calculating the log likelihood of the fit
from math import log, exp
import pandas as pd

# itertuples() returns a one-shot iterator; it is consumed exactly once below.
patient_data = pd.read_csv('https://bit.ly/33ebs2R',
                           delimiter=',').itertuples()

# Fitted logistic regression coefficients
b0 = -3.17576395
b1 = 0.69267212

def logistic_function(x):
    """Return the fitted probability 1 / (1 + exp(-(b0 + b1*x))) for input x."""
    p = 1.0 / (1.0 + exp(-(b0 + b1*x)))
    return p

# sum the log-likelihoods
log_likelihood_fit = 0.0

# Long-hand version of the sum below, kept for reference:
# for p in patient_data:
#     if p.y == 1.0:
#         log_likelihood_fit += log(logistic_function(p.x))
#     elif p.y == 0.0:
#         log_likelihood_fit += log(1.0 - logistic_function(p.x))

# Consolidating the log likelihood logic into a single line:
# multiplying by y and (1 - y) keeps only the term matching the observed outcome.
log_likelihood_fit = sum(log(logistic_function(p.x))*(p.y) + log(1.0 - logistic_function(p.x))*(1.0 - p.y) for p in patient_data)

print(log_likelihood_fit)

In [None]:
# Log likelihood of the patients without a fitted curve
import pandas as pd
from math import log, exp
patient_data = list(pd.read_csv('https://bit.ly/33ebs2R',
                                delimiter=',').itertuples())

# Without a fit, every patient gets the same probability: the mean of y.
likelihood = sum(p.y for p in patient_data) / len(patient_data)
log_likelihood = 0.0
# Long-hand version of the sum below, kept for reference:
# for p in patient_data:
#     if p.y == 1.0:
#         log_likelihood += log(likelihood)
#     elif p.y == 0.0:
#         log_likelihood += log(1-likelihood)

# consolidating the log likelihood into a single line
log_likelihood = sum(log(likelihood)*p.y + log(1.0-likelihood)*(1.0 - p.y) for p in patient_data)

print(log_likelihood)

In [None]:
# Calculating the R2 for a logistic Regression
import pandas as pd
from math import log, exp
patient_data = list(pd.read_csv('https://bit.ly/33ebs2R',
                                delimiter=',').itertuples())

# Declare fitted logistic regression
b0 = -3.17576395
b1 = 0.69267212

def logistic_function(x):
    """Return the fitted probability 1 / (1 + exp(-(b0 + b1*x))) for input x."""
    p = 1.0 / (1.0 + exp(-(b0 + b1*x)))
    return p


# Calculate the log likelihood of the fit
log_likelihood_fit = sum(log(logistic_function(p.x))*p.y +
                         log(1.0-logistic_function(p.x))*(1.0-p.y)
                         for p in patient_data)

# Calculate the log likelihood without the fit, where every patient gets the
# same probability (the mean of y). It is computed here rather than taken
# from an earlier cell so this cell runs correctly on its own.
likelihood = sum(p.y for p in patient_data) / len(patient_data)
log_likelihood = sum(log(likelihood)*p.y +
                     log(1.0 - likelihood)*(1.0 - p.y)
                     for p in patient_data)

# Calculate R-Squared (McFadden's pseudo R-squared):
# R2 = (log likelihood - log likelihood fit) / log likelihood
r2 = (log_likelihood - log_likelihood_fit) / log_likelihood
print(r2)

In [None]:
# R2 is too low thus hours of exposure is mediocre for predicting symptoms.

In [None]:
# P-value
# P-value helps investigate how likely we would have seen this data by chance
# rather than because of actual relationship

In [None]:
# chi-Square distribution, X2 is a probability distribution.

# In our case, X2 = 2 * ((log likelihood fit) - (log likelihood))
# We then take that value and look up the probability from the X2 distribution.
# p_value = chi(2 * ((log likelihood fit) - (log likelihood)))

In [None]:
# Calculating a p-value for a given logistic regression
import pandas as pd
from math import log, exp
from scipy.stats import chi2

patient_data = list(pd.read_csv('https://bit.ly/33ebs2R',
                                delimiter=',').itertuples())

# Declare fitted logistic regression
b0 = -3.17576395
b1 = 0.69267212

def logistic_function(x):
    """Return the fitted probability 1 / (1 + exp(-(b0 + b1*x))) for input x."""
    p = 1.0 / (1.0 + exp(-(b0 + b1*x)))
    return p

# Calculate the log likelihood of the fit
log_likelihood_fit = sum(log(logistic_function(p.x))*p.y +
                         log(1.0 - logistic_function(p.x))*(1.0 - p.y)
                         for p in patient_data)

# Calculate the log likelihood without fit
likelihood = sum(p.y for p in patient_data) / len(patient_data)
log_likelihood = sum(log(likelihood)*p.y +
                     log(1.0 - likelihood) * (1.0-p.y)
                     for p in patient_data)

# calculate p-value
chi2_input = 2*(log_likelihood_fit - log_likelihood)

# A p-value is the probability of a statistic at least this large, i.e. the
# upper tail of the chi-square distribution: chi2.sf (survival function,
# 1 - cdf). chi2.pdf returns the density at that point, which is not a
# probability; any figure quoted from the pdf-based calculation (such as
# 0.000166) is therefore not the p-value printed here.
# 1 degree of freedom: the fitted model has one more parameter than the
# model without a fit.
p_value = chi2.sf(chi2_input, 1)

print(p_value)

In [None]:
# The p-value printed above is far below our significance threshold of 0.05, so we say
# this relationship is statistically significant and unlikely to be the result of random chance.

<h3>Train/Test Split</h3>

In [None]:
# Train/Test Split
# Three-Fold Cross-validation

In [None]:
# Perform a logistic regression on the employee-retention dataset,
# split the data into thirds. We then alternate each third as the testing data.
# We report the mean and standard deviation of the three per-fold scores.

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

# Load the data
df =pd.read_csv('https://tinyurl.com/y6r7qjrp',
                 delimiter=',')

# Inputs (all columns but the last) and output (last column)
X = df.values[:, :-1]
Y = df.values[:, -1]

# 'random_state' is the random seed, which we fix to 7
# so the shuffled folds are the same on every run
kfold = KFold(n_splits=3,
              random_state=7,
              shuffle=True)

model = LogisticRegression(penalty=None)
# Accuracy-based alternative, kept for reference:
# results = cross_val_score(
#     model, X, Y, cv=kfold)
# print('Accuracy Mean: % .3f stdev: %.3f'%(results.mean(), results.std()))

# Score each fold by the area under the ROC curve (AUC)
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print('AUC: %.3f(%.3f)'%(results.mean(), results.std()))

In [None]:
# Creating a confusion matrix
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv('https://bit.ly/3cManTi',
                 delimiter=',')

# Extract input variables (all rows, all columns but last column)
X = df.values[:, :-1]

# Extract output column (all rows, last column)
Y = df.values[:, -1]

model = LogisticRegression(solver='liblinear')

# stratify=Y keeps the class proportions of Y the same in the training and
# testing sets, so an imbalanced class is represented in both. It does not
# rebalance the classes or duplicate samples.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.33,
                                                    random_state=10,
                                                    stratify=Y
                                                   )
model.fit(X_train, Y_train)
prediction = model.predict(X_test)

# The confusion matrix evaluates accuracy within each category.
# scikit-learn puts the actual classes on the rows and the predicted classes
# on the columns, with labels in sorted order, so for a 0/1 outcome:
# [[truenegatives  falsepositives]
#  [falsenegatives truepositives]]
#
# The diagonal represents correct predictions, so we want those to be higher
matrix = confusion_matrix(y_true=Y_test,
                          y_pred=prediction)
print(matrix)

In [None]:
# NOTE: Read and learn more about Bayes Theorem and conditional probabilities

<h3>Receiver Operating Characteristic | Area Under Curve</h3>

In [None]:
# RECEIVER OPERATING CHARACTERISTIC | AREA UNDER CURVE
# When we are evaluating different machine learning configurations, we may end up
# with dozens, hundreds or thousands of confusion matrices

# A Receiver Operating Characteristic (ROC) curve helps us to see each testing
# instance (each represented by a black dot) and find an agreeable balance between
# the true positives and false positives

# We can also compare different machine learning models by creating 
# separate ROC for each.
# The area under the curve (AUC) is a good metric for choosing which model to use.

# CLASS IMBALANCE
# class imbalance happens when data is not equally represented across every outcome class.
# Unfortunately, many problems of interest are imbalanced, such as disease prediction, security
# breaches, fraud detection, etc

# OVERCOMING CLASS IMBALANCE
# Collect more data
# Try different models as well as confusion matrices, ROC/AUC curves
# Duplicate samples in the minority class until it is equally represented in the dataset. This is also known as oversampling (a simple form of data augmentation).

# In scikit-learn, the stratify option of train_test_split keeps the class proportions of the
# chosen column the same in the training and testing sets; it does not duplicate samples or balance the classes.

<h3>Coronary Heart Disease (CHD) and AGE</h3>
<p>It is of interest to explore the relationship between AGE and the
presence (1) or absence (0) of CHD in a group.</p>

In [None]:
# Load the CHD dataset from the local Data folder and preview the first rows.
dfCHD = pd.read_csv('../Data/CHDdata.csv')
dfCHD.head()

In [None]:
# Column names, dtypes and non-null counts of the CHD data
dfCHD.info()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# Scatter plot of the CHD outcome against age
# (the trailing semicolon hides the returned Axes repr)
sns.scatterplot(dfCHD, x='age', y='chd');

In [None]:
# Oldest age in the data. Series.max() needs no unique() first and, unlike
# max() on the NumPy array that unique() returns, it skips missing values.
dfCHD['age'].max()

In [None]:
# The column 'Expected Mean' estimates the conditional mean E(chd | age):
# the proportion of people with CHD inside each age group.

# Bin ages into left-closed intervals [a, b). Ages outside [10, 65) fall in
# no bin and get a missing AGEGRP.
dfCHD['AGEGRP'] = pd.cut(dfCHD['age'],
                         bins=[10, 20, 25, 30, 35, 40, 45, 50, 55, 65],
                         right=False,
                         # retbins=True,
                         include_lowest=False,
                        )

# transform('mean') returns one value per original row, aligned with dfCHD's
# index, so every row receives the mean of its own age group. Assigning the
# reset-index group means instead would fill only the first few rows (one
# per group, matched by row number) and leave the rest missing.
dfCHD['Expected Mean'] = dfCHD.groupby(['AGEGRP'],
                                       observed=True)['chd'].transform('mean')
dfCHD

In [None]:
# Mean CHD outcome (proportion with CHD) per age group, one row per group
data2 = dfCHD.groupby(['AGEGRP'], observed=True)['chd'].mean().reset_index()

In [None]:
# Preview the per-group CHD means
data2.head()

In [None]:
# Mean age per age group, one row per group
data3 = dfCHD.groupby(['AGEGRP'], observed=True)['age'].mean().reset_index()
# Combine the two summaries; with no `on` given, merge joins on the columns
# the two frames share (AGEGRP).
data = data2.merge(data3, how='inner')

In [None]:
# Preview the merged per-group summary (mean chd and mean age)
data.head()

In [None]:
# Plot the proportion with CHD against the mean age of each age group.
# marker='o' draws a point at every group; `markers=True` only takes effect
# together with a `style` variable, so on its own it drew no markers.
sns.lineplot(data,
             x='age',
             y='chd',
             marker='o',
            );
sns.despine();
plt.ylabel('Coronary Heart Disease (mean)');
plt.xlabel('Age (Years)');

# It can be seen that the Expected mean approaches zero and 1 gradually.
# The change in the E(Y|x) per unit change in x becomes progressively smaller as the conditional
# mean gets closer to zero or one. This is the nature of a logistic (S-shaped) function.

In [None]:
# Using age as the only independent variable (input);
# scikit-learn expects a 2-D array, hence the reshape to a single column.
X = dfCHD['age'].values.reshape((-1, 1))

# Target value
y = dfCHD['chd'].values

# cross validation splitter (used by the disabled AUC code below)
kfold = KFold(n_splits=3,
              random_state=7,
              shuffle=True
             )

# Only the non-default setting is passed. The long argument list that was
# here repeated the defaults reported by get_params(), including
# multi_class='deprecated', which is a placeholder for a parameter being
# phased out rather than a real option and is not accepted by every
# scikit-learn version.
model = LogisticRegression(penalty=None).fit(X, y)

# Using AUC value Score
# results = cross_val_score(model, X, y, cv=kfold,
#                           scoring='roc_auc')
# print('AUC: %.3f (%.3f)'%(results.mean(), results.std()))

# Show both fitted parameters; only the last expression of a cell is
# displayed, so they are returned together as a tuple.
# Values noted from an earlier run: intercept -3.52170053, coefficient 0.06410789
model.intercept_.flatten(), model.coef_.flatten()

# model.get_params()

In [None]:
# Plot regression
def log_regplot(X, y):
    """Draw a logistic regression plot of ``y`` against ``X``.

    Opens a new 10x5 matplotlib figure and calls seaborn's ``regplot`` on it
    with ``logistic=True``, drawn in green. Returns ``None``.
    """
    plt.figure(figsize=(10, 5))
    regplot_options = {'logistic': True, 'color': 'green'}
    sns.regplot(x=X, y=y, **regplot_options)

# Draw the logistic fit of CHD on age, then tidy and label the axes.
log_regplot(X, y)
sns.despine();
plt.ylabel('Coronary Heart Disease (mean)');
plt.xlabel('Age (Years)');
plt.show()

<h3>Student Dropout and Logistic Regression</h3>

In [80]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [81]:
# Import the student dropout dataset from the local Data folder
df = pd.read_csv('../Data/studentDropout.csv')
# List the column names
df.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance', 'Previous qualification', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Displaced',
       'Educational special needs', 'Debtor', 'Tuition fees up to date',
       'Gender', 'Scholarship holder', 'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)

In [87]:
# Preview the first rows of the student data
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,0
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,1
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,0
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,1
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,1


In [82]:
# Column names, dtypes and non-null counts of the student data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Nacionality                                     4424 non-null   int64  
 7   Mother's qualification                          4424 non-null   int64  
 8   Father's qualification                          4424 non-null   int64  
 9   Mother's occupation                      

In [83]:
# Target variable, y: list the distinct labels it contains
df.Target.unique()

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [86]:
# Encode the target as binary: 1 for 'Graduate', 0 for every other label.
from sklearn.preprocessing import OneHotEncoder

# Encode only while the column still holds the original text labels.
# Re-running an unconditional encoding on the already-encoded column would
# compare the numbers 0/1 with 'Graduate' and turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['Target']):
    df['Target'] = (df['Target'] == 'Graduate').astype('int64')
df.Target.head()

# One-hot alternative, kept for reference (not run):
# encoder = OneHotEncoder()

# # Fit and transform the target variable
# encoded_data = encoder.fit_transform(df[['Target']].reshape(-1, 1))

# # Create a DataFrame from the encoded data
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['Target']))
# encoded_df

# ENCODING
# Hot Encoding
# Label Encoding


0    0
1    1
2    0
3    1
4    1
Name: Target, dtype: int64

In [None]:
# Logistic Regression Model and Accuracy
# Inputs: every column except the target. drop() returns a new frame, so df
# itself still contains the Target column.
X = df.drop('Target', axis=1)
# Output: the encoded target column
y = df.Target

In [None]:
# Split data for training and testing: 33% of the rows are held out for
# testing, with a fixed random_state so the split is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=.33,
    random_state=42
)

# What are some of the techniques to avoid overfitting?
# Cross-validation, k-fold, random k-fold

In [89]:
# Logistic model on all input columns, with the solver's iteration limit
# raised to 1000
from sklearn.linear_model import LogisticRegression
lm = LogisticRegression(
    max_iter=1000
)
lm.fit(X_train, y_train)

In [91]:
# Show the settings the model was created with
lm.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 1000,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [92]:
# Predicted class for every row of the held-out test set
predictions = lm.predict(X_test)

In [93]:
# Scores: precision, recall and f1 per class, comparing the test labels
# with the predictions
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.88      0.80      0.84       755
           1       0.80      0.88      0.84       705

    accuracy                           0.84      1460
   macro avg       0.84      0.84      0.84      1460
weighted avg       0.84      0.84      0.84      1460



In [None]:
# Principal Component Analysis
# Standardize data
# NOTE(review): the scaler is fitted on the whole of df, which still contains
# the Target column at this point — confirm that including the target in the
# PCA input is intended.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)

In [None]:
# Apply the fitted scaler to every column of df
scaledData = scaler.transform(df)

In [None]:
# PCA that keeps the first two principal components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)

In [None]:
# Fit the PCA on the standardized data
pca.fit(scaledData)

In [None]:
# Loadings: one row per principal component, one column per input column
componentMatrix = pca.components_
# componentMatrix

In [None]:
# Loadings of the first principal component
firstComponentLoadings = componentMatrix[0, :]
# firstComponentLoadings

In [None]:
# Column names ordered from largest to smallest absolute loading on the
# first component (argsort is ascending, [::-1] reverses it)
influentialColumnsFirstComponent = df.columns[np.abs(firstComponentLoadings).argsort()[::-1]]
# influentialColumnsFirstComponent

In [None]:
# Loadings of the second principal component, and the column names ordered
# from largest to smallest absolute loading on it
secondComponentLoadings = componentMatrix[1, :]
influentialColumnsSecondComponent = df.columns[np.abs(secondComponentLoadings).argsort()[::-1]]
# influentialColumnsSecondComponent

In [None]:
# Project the standardized data onto the two principal components
X_pca = pca.transform(scaledData)
# X_pca

In [None]:
# (rows, columns) of the standardized data
scaledData.shape

In [None]:
# Scatter plot of the data in the space of the two principal components,
# with each point coloured by its Target value
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0],
            X_pca[:, 1],
            c=df.Target,
            cmap='plasma',
           )
plt.xlabel('First Principal Component')
plt.ylabel('Second PC')


In [None]:
# pca.components_

In [None]:
# Heatmap of the loadings: one row per principal component, one column per
# original column of df
dfComp = pd.DataFrame(pca.components_,
                      columns=df.columns.tolist())
plt.figure(figsize=(10, 8))
sns.heatmap(dfComp,
            cmap='plasma',
           )

In [None]:
# Start from the five columns with the largest absolute loading on the first component
columns = influentialColumnsFirstComponent[0:5].tolist()
# Candidates to add: all columns, ordered by absolute loading on the second component
columnsToAdd = influentialColumnsSecondComponent.tolist()

In [None]:
# Columns to add
def addUniqueElements(firstList, secondList, maxLength=10):
    """Append new elements of ``firstList`` to ``secondList``, up to a size limit.

    Elements are taken in order from ``firstList``. An element is appended
    only if it is not already in ``secondList`` and is not ``'Target'``.
    Appending stops once ``secondList`` holds ``maxLength`` elements.

    Parameters:
        firstList: candidate elements, in priority order.
        secondList: list to extend; it is modified in place.
        maxLength: largest size ``secondList`` may grow to (default 10).

    Returns:
        The same ``secondList`` object, after the additions.
    """
    for element in firstList:
        # Check the limit first so a list that is already full never grows.
        if len(secondList) >= maxLength:
            break
        # Membership is tested against secondList. Testing against firstList
        # is always true for an element taken from it, so nothing was added.
        if element not in secondList and element != 'Target':
            secondList.append(element)
    return secondList

In [None]:
# Extend the first-component columns with second-component candidates.
# addUniqueElements returns the same list object as `columns`.
finalColumns = addUniqueElements(columnsToAdd, columns)

# Add the target exactly once. It may already be present: `columns` is taken
# from all of df's columns (Target included), and re-running this cell would
# otherwise append it again, giving a duplicated 'Target' column.
if 'Target' not in finalColumns:
    finalColumns.append('Target')

In [None]:
# Keep only the selected columns (plus Target)
filteredDf = df[finalColumns]

In [None]:
# Preview the reduced data
filteredDf.head()

In [None]:
# Inputs of the reduced model: the selected columns without the target
X_filtered = filteredDf.drop('Target', axis=1)

In [None]:
# Output of the reduced model: the target column
y_filtered = filteredDf.Target

In [None]:
# Same split settings as for the full model: 33% test set, random_state 42
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_filtered,
    y_filtered,
    test_size=.33,
    random_state=42)


In [None]:
# Logistic model on the reduced set of columns. It uses the same iteration
# limit as the full model (max_iter=1000) so the two are fitted under the
# same settings and the solver is not cut off at the default limit.
logmodel_f = LogisticRegression(max_iter=1000)
logmodel_f.fit(
    X_train_f,
    y_train_f
)

In [None]:
# Predicted class for every row of the reduced test set
predictions_f = logmodel_f.predict(X_test_f)

In [None]:
# Precision, recall and f1 per class for the reduced model
print(classification_report(y_test_f,
                            predictions_f)
     )