In [None]:
#IMPORTANT RUN AT START
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection as skl_ms
from sklearn import metrics

Getting the data

In [None]:
# Location of the training CSV (raw file hosted on GitHub).
url = 'https://raw.githubusercontent.com/Ari-vu/SML/main/Given_data/train.csv'
# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

Splitting test and train data sets

In [None]:
# Split df into a training set and a held-out test set.
# Fraction of the rows assigned to the training set.
perc = 0.70
# Fixed seed so that the random split is reproducible.
random_state = 10
np.random.seed(random_state)
# Randomly pick 70% of the row positions (without replacement) for training.
n_rows = df.shape[0]
trainIndex = np.random.choice(n_rows, size=int(perc * n_rows), replace=False)
train = df.iloc[trainIndex]
# Every row that was not drawn goes to the test set. trainIndex holds row
# positions, so the mask is built positionally as well; comparing positions
# against index labels would only be correct for a default 0..n-1 index.
is_test = np.ones(n_rows, dtype=bool)
is_test[trainIndex] = False
test = df.iloc[is_test]

In [None]:
# Separate the feature columns from the 'Lead' target in both subsets.
target_col = 'Lead'
# training subset
X_train, Y_train = train.drop(columns=[target_col]), train[target_col]
# flattened 1-D array of the training target, the form passed to fit()
Y_train_np = np.ravel(Y_train, order='C')
# test subset
X_test, Y_test = test.drop(columns=[target_col]), test[target_col]

Preprocessing the data by normalising the features

In [None]:
# NOTE(review): consider moving this import up to the imports cell at the top.
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaling to both the training and the test features.
norm = MinMaxScaler()
norm.fit(X_train)
X_train_norm, X_test_norm = norm.transform(X_train), norm.transform(X_test)

In [None]:
# Choosing the best solver.
# One LogisticRegression per candidate solver; all other settings are identical
# so that only the optimisation algorithm differs between the candidates.
solver = [
    LogisticRegression(solver=name, random_state=1, max_iter=200)
    for name in ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga')
]

In [None]:
# Fit every candidate solver on the normalised training data and print its
# accuracy on the normalised test data.
# NOTE(review): the accuracy here is measured on the test set, so the solver
# is being selected on the same data that is used for the final evaluation.
for solve in solver:
    solve.fit(X_train_norm, Y_train_np)
    Y_pred = solve.predict(X_test_norm)
    print(solve)
    acc = metrics.accuracy_score(Y_test, Y_pred)
    print(acc)


LogisticRegression(max_iter=200, random_state=1, solver='newton-cg')
0.8205128205128205
LogisticRegression(max_iter=200, random_state=1)
0.8205128205128205
LogisticRegression(max_iter=200, random_state=1, solver='liblinear')
0.8237179487179487
LogisticRegression(max_iter=200, random_state=1, solver='sag')
0.8205128205128205
LogisticRegression(max_iter=200, random_state=1, solver='saga')
0.8205128205128205


The liblinear solver is chosen because it has the highest test accuracy (0.8237, versus 0.8205 for each of the other four solvers).

Tuning hyperparameters for Liblinear

In [None]:
# Search space for the first grid search over the liblinear model.
# C: 100 evenly spaced values in [0.1, 300]; in scikit-learn C is the inverse
# of the regularisation strength (smaller C means stronger regularisation).
C = np.linspace(0.1, 300, num=100)
# liblinear supports both l1 and l2 regularisation
penalty = ['l1', 'l2']
# fit with and without an intercept term
intercept = [True, False]

# mapping from LogisticRegression parameter name to the values to try
grid_logReg = dict(penalty=penalty, C=C, fit_intercept=intercept)

# base estimator whose parameters the grid search will vary
solver = LogisticRegression(solver='liblinear', random_state=1, max_iter=200)


In [None]:
# Build the grid search over grid_logReg for the liblinear estimator and fit
# it on the normalised training data.
model = skl_ms.GridSearchCV(solver, grid_logReg)

model.fit(X_train_norm, Y_train_np)

GridSearchCV(estimator=LogisticRegression(max_iter=200, random_state=1,
                                          solver='liblinear'),
             param_grid={'C': array([1.00000000e-01, 3.12929293e+00, 6.15858586e+00, 9.18787879e+00,
       1.22171717e+01, 1.52464646e+01, 1.82757576e+01, 2.13050505e+01,
       2.43343434e+01, 2.73636364e+01, 3.03929293e+01, 3.34222222e+01,
       3.64515152e+01, 3.94808081e+01, 4.25101010e+01, 4.55393939e+...
       2.42443434e+02, 2.45472727e+02, 2.48502020e+02, 2.51531313e+02,
       2.54560606e+02, 2.57589899e+02, 2.60619192e+02, 2.63648485e+02,
       2.66677778e+02, 2.69707071e+02, 2.72736364e+02, 2.75765657e+02,
       2.78794949e+02, 2.81824242e+02, 2.84853535e+02, 2.87882828e+02,
       2.90912121e+02, 2.93941414e+02, 2.96970707e+02, 3.00000000e+02]),
                         'fit_intercept': [True, False],
                         'penalty': ['l1', 'l2']})

In [None]:
# Output the best parameters; the best score is the accuracy.
best_score, best_params = model.best_score_, model.best_params_
print('Best Score :', best_score)
print('Best Parameters :', best_params)

Best Score : 0.8707132735002361
Best Parameters : {'C': 72.8030303030303, 'fit_intercept': True, 'penalty': 'l1'}


In [None]:
# Carry the intercept choice and the penalty type selected by the first grid
# search forward for later use.
para_intercept, para_penalty = (
    model.best_params_[key] for key in ('fit_intercept', 'penalty')
)

investigating further tuning options

In [None]:
# Second, finer search around the best C found by the first grid search.
# C2: 50 evenly spaced values in [69, 75]; the first search selected C of about
# 72.8, so this range covers its neighbourhood on the coarse grid.
C2 = np.linspace(69, 75, 50)
# intercept_scaling candidates: 20 evenly spaced values in [0.1, 2] (default = 1)
int_scale = np.linspace(0.1, 2, 20)
# parameter grid for the new grid search
grid_logReg_2 = {'C': C2, 'intercept_scaling': int_scale}
# Reuse the penalty and intercept choice selected by the first search rather
# than hard-coding them, so this cell stays consistent if that result changes.
solver_2 = LogisticRegression(max_iter=200, random_state=1, solver='liblinear',
                              penalty=para_penalty, fit_intercept=para_intercept)


In [None]:
# Build the second grid search (over C and intercept_scaling) and fit it on
# the normalised training data.
model_2 = skl_ms.GridSearchCV(solver_2, grid_logReg_2)

model_2.fit(X_train_norm, Y_train_np)

GridSearchCV(estimator=LogisticRegression(max_iter=200, penalty='l1',
                                          random_state=1, solver='liblinear'),
             param_grid={'C': array([69.        , 69.12244898, 69.24489796, 69.36734694, 69.48979592,
       69.6122449 , 69.73469388, 69.85714286, 69.97959184, 70.10204082,
       70.2244898 , 70.34693878, 70.46938776, 70.59183673, 70.71428571,
       70.83673469, 70.95918367, 71.08163265, 71.20408163, 71.326...
       72.06122449, 72.18367347, 72.30612245, 72.42857143, 72.55102041,
       72.67346939, 72.79591837, 72.91836735, 73.04081633, 73.16326531,
       73.28571429, 73.40816327, 73.53061224, 73.65306122, 73.7755102 ,
       73.89795918, 74.02040816, 74.14285714, 74.26530612, 74.3877551 ,
       74.51020408, 74.63265306, 74.75510204, 74.87755102, 75.        ]),
                         'intercept_scaling': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ])})

In [None]:
# Output the best score and parameters found by the second grid search.
best_score_2, best_params_2 = model_2.best_score_, model_2.best_params_
print('Best Score :', best_score_2)
print('Best Parameters :', best_params_2)

Best Score : 0.8707132735002361
Best Parameters : {'C': 70.59183673469387, 'intercept_scaling': 0.8999999999999999}


In [None]:
# Store the refined C and intercept_scaling selected by the second grid search.
refined = model_2.best_params_
para_C = refined['C']
para_int_scale = refined['intercept_scaling']

Predicting 'Lead' column from model

In [None]:
# Define the final model from the hyperparameters selected by both grid searches.
# intercept_scaling was tuned jointly with C in the second search, so it is
# passed here too; leaving it out would pair the tuned C with the default
# intercept_scaling instead of the value it was selected with.
final_solver = LogisticRegression(
    solver='liblinear',
    penalty=para_penalty,
    fit_intercept=para_intercept,
    C=para_C,
    intercept_scaling=para_int_scale,
    max_iter=200,
    random_state=1,
)

In [None]:
# Fit the final model on the normalised training data, then predict 'Lead'
# for the normalised test data (fit() returns the fitted estimator itself).
Y_pred = final_solver.fit(X_train_norm, Y_train_np).predict(X_test_norm)

Getting confusion matrix

In [None]:
# Confusion matrix of the true test labels against the final model's predictions.
cnf_matrix = metrics.confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(cnf_matrix)

[[ 44  26]
 [ 12 230]]


calculating accuracy

In [None]:
# Fraction of test rows whose predicted 'Lead' matches the true value.
acc = metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(acc)

0.8782051282051282
