In [1]:
# Problem statement: create a new binary variable `affair` (did the woman have at least one affair?) and predict that classification for each woman.

In [2]:
# Importing all the packages

In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
# Load the `fair` dataset shipped with statsmodels and keep its pandas
# DataFrame (`.data`) as the working table for the rest of the notebook.
fair_bundle = sm.datasets.fair.load_pandas()
dta = fair_bundle.data

In [4]:
# Binary target: 1 when the recorded `affairs` value is positive, 0 otherwise.
dta['affair'] = dta['affairs'].gt(0).astype(int)

# Patsy formula: numeric predictors enter as-is, the two occupation codes are
# wrapped in C(...) so they are treated as categorical terms.
affair_formula = (
    'affair ~ rate_marriage + age + yrs_married + children + '
    'religious + educ + C(occupation) + C(occupation_husb)'
)
y, X = dmatrices(affair_formula, dta, return_type="dataframe")

# Replace the verbose patsy-generated dummy column names with short ones.
occupation_column_names = {
    'C(occupation)[T.2.0]': 'occ_2',
    'C(occupation)[T.3.0]': 'occ_3',
    'C(occupation)[T.4.0]': 'occ_4',
    'C(occupation)[T.5.0]': 'occ_5',
    'C(occupation)[T.6.0]': 'occ_6',
    'C(occupation_husb)[T.2.0]': 'occ_husb_2',
    'C(occupation_husb)[T.3.0]': 'occ_husb_3',
    'C(occupation_husb)[T.4.0]': 'occ_husb_4',
    'C(occupation_husb)[T.5.0]': 'occ_husb_5',
    'C(occupation_husb)[T.6.0]': 'occ_husb_6',
}
X = X.rename(columns=occupation_column_names)

# Flatten the single-column target frame into a 1-D array for scikit-learn.
y = np.asarray(y).ravel()

In [5]:
# Instantiate a logistic regression model and fit it on the full design
# matrix X and label vector y.
#
# The solver is pinned explicitly: the implicit default changed from
# 'liblinear' to 'lbfgs' in scikit-learn 0.22, so leaving it unset makes the
# fitted coefficients (and every score derived from them) depend on the
# installed version. 'liblinear' is what the old default resolved to.
model = LogisticRegression(solver='liblinear')
model = model.fit(X, y)



In [6]:
# Accuracy of `model` on the same X / y it was fitted on, expressed as a
# rounded percentage.
train_accuracy_pct = round(model.score(X, y)*100)
print('')
print('Accuracy of model is')
print(train_accuracy_pct)


Accuracy of model is
73.0


In [7]:
# Share of positive labels in y (mean of the 0/1 target), as a rounded
# percentage — useful as a baseline when reading the accuracy figures.
affair_rate_pct = round(y.mean()*100)
print('')
print('Percentage of woman having affair')
print(affair_rate_pct)


Percentage of woman having affair
32.0


In [8]:
# Evaluate on held-out data: keep 30% of the rows for testing
# (test_size=0.3) with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Pin the solver explicitly: the implicit default changed from 'liblinear'
# to 'lbfgs' in scikit-learn 0.22, which would otherwise make the fitted
# model (and the metrics computed from it) depend on the installed version.
model2 = LogisticRegression(solver='liblinear')
model2.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
# Hard class predictions from model2 for every row of the test split.
predicted = model2.predict(X_test)

# Emit a blank spacer line, a heading, then the predicted label array.
for output_line in ('', 'predict class labels for the test set', predicted):
    print(output_line)


predict class labels for the test set
[1. 0. 0. ... 0. 0. 0.]


In [11]:
# Per-class probabilities from model2 for the test split: one row per test
# sample, one column per class.
probs = model2.predict_proba(X_test)

# Emit a blank spacer line, a heading, then the probability matrix.
for output_line in ('', 'generate class probabilities', probs):
    print(output_line)


generate class probabilities
[[0.3514634  0.6485366 ]
 [0.90955084 0.09044916]
 [0.72567333 0.27432667]
 ...
 [0.55727385 0.44272615]
 [0.81207043 0.18792957]
 [0.74734601 0.25265399]]


In [12]:
# Evaluation metrics for model2 on the test split, collected as
# (heading, value) pairs and printed in order.
# The ROC AUC is computed from the second column of `probs`
# (presumably the probability of the positive class, affair = 1).
evaluation_report = [
    ('Accuracy score of matrix', metrics.accuracy_score(y_test, predicted)),
    ('ROC and AUC score of matrix', metrics.roc_auc_score(y_test, probs[:, 1])),
    ('Confusion matrix value', metrics.confusion_matrix(y_test, predicted)),
    ('Classification report value', metrics.classification_report(y_test, predicted)),
]
for heading, value in evaluation_report:
    print(heading)
    print(value)

Accuracy score of matrix
0.7298429319371728
ROC and AUC score of matrix
0.745950606950631
Confusion matrix value
[[1169  134]
 [ 382  225]]
Classification report value
              precision    recall  f1-score   support

         0.0       0.75      0.90      0.82      1303
         1.0       0.63      0.37      0.47       607

   micro avg       0.73      0.73      0.73      1910
   macro avg       0.69      0.63      0.64      1910
weighted avg       0.71      0.73      0.71      1910



In [13]:
# Evaluate the model using 10-fold cross-validation on the full X / y,
# scoring each fold by accuracy.
#
# The solver is pinned explicitly: the implicit default changed from
# 'liblinear' to 'lbfgs' in scikit-learn 0.22, so an unparameterised
# LogisticRegression() would give version-dependent fold scores.
scores = cross_val_score(
    LogisticRegression(solver='liblinear'), X, y, scoring='accuracy', cv=10
)
print('Score using 10 fold')
print(scores)
print('Mean Score using 10 fold')
print(scores.mean())



Score using 10 fold
[0.72100313 0.70219436 0.73824451 0.70597484 0.70597484 0.72955975
 0.7327044  0.70440252 0.75157233 0.75      ]
Mean Score using 10 fold
0.7241630685514876
