In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import pickle

In [2]:
dataset = pd.read_excel('./divorce.xlsx')

In [3]:
X = dataset.iloc[:,:-1].values / 4.0
Y = dataset.iloc[:,-1].values

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size = 0.1)

In [5]:
classifier = LogisticRegression(solver='lbfgs')

In [6]:
classifier.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
Y_prediction = classifier.predict(X_test)

In [8]:
Y_prediction

array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1], dtype=int64)

In [9]:
Y_test

array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1], dtype=int64)

In [10]:
classifier.score(X_train, Y_train)

0.9803921568627451

In [11]:
classifier.score(X_test, Y_test)

1.0

In [12]:
print('Test set results: ')
Y_prediction_proba = classifier.predict_proba(X_test)
for i in range(17):
    print(f"probability: {round(Y_prediction_proba[i][1], 3)}\tprediction: {Y_prediction[i]}\treal: {Y_test[i]}")
print(f"Prediction score: {classifier.score(X_test, Y_test)}")

Test set results: 
probability: 0.022	prediction: 0	real: 0
probability: 1.0	prediction: 1	real: 1
probability: 0.031	prediction: 0	real: 0
probability: 0.045	prediction: 0	real: 0
probability: 0.014	prediction: 0	real: 0
probability: 0.047	prediction: 0	real: 0
probability: 1.0	prediction: 1	real: 1
probability: 0.033	prediction: 0	real: 0
probability: 0.036	prediction: 0	real: 0
probability: 0.03	prediction: 0	real: 0
probability: 0.049	prediction: 0	real: 0
probability: 1.0	prediction: 1	real: 1
probability: 0.026	prediction: 0	real: 0
probability: 1.0	prediction: 1	real: 1
probability: 1.0	prediction: 1	real: 1
probability: 0.006	prediction: 0	real: 0
probability: 1.0	prediction: 1	real: 1
Prediction score: 1.0


In [13]:
classifier.coef_

array([[ 0.45108249,  0.42319767,  0.58420285,  0.24624905,  0.24595966,
         0.50712978,  0.05511807,  0.20903475,  0.25657284,  0.13259142,
         0.39195488,  0.28254399, -0.02218604,  0.38401127,  0.49916152,
         0.29478715,  0.53695084,  0.52758467,  0.42711962,  0.47143002,
         0.17066095, -0.00178435,  0.01716318,  0.03456551,  0.25837033,
         0.68580587,  0.22073559,  0.45439583,  0.31365272,  0.34996988,
         0.5395662 ,  0.26744902,  0.29354868,  0.33788263,  0.13428324,
         0.32761248,  0.18081928,  0.39059526,  0.57704369,  0.95452147,
         0.42506077,  0.24079544,  0.21219657,  0.68544836,  0.0914232 ,
         0.07345375,  0.05686638, -0.08125813,  0.66463504,  0.26190911,
        -0.0083323 ,  0.39756421,  0.41743334,  0.0488967 ]])

In [14]:
coefs = abs(classifier.coef_[0])

In [15]:
questions_to_remove = coefs.argsort()[:24]
print(questions_to_remove)

[21 50 22 12 23 53  6 46 45 47 44  9 34 20 36  7 42 26 41  4  3  8 24 49]


In [16]:
dataset2 = dataset.copy()
for question in questions_to_remove:
    dataset2 = dataset2.drop(f"Atr{question + 1}", axis=1)

In [17]:
X2 = dataset2.iloc[:,:-1].values / 4.0
Y2 = dataset2.iloc[:,-1].values

In [21]:
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y2, test_size=0.1)

In [22]:
Y2_train.shape

(153,)

In [23]:
classifier2 = LogisticRegression(solver='lbfgs')
classifier2.fit(X2_train, Y2_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
Y2_prediction = classifier2.predict(X2_test)

In [26]:
classifier2.score(X2_train, Y2_train)

0.9803921568627451

In [27]:
classifier2.score(X2_test, Y2_test)

0.9411764705882353

In [31]:
print("Test set results:")
Y2_prediction_proba = classifier2.predict_proba(X2_test)
for i in range(17):
    print(f"probability: {round(Y2_prediction_proba[i][1], 3)}\tprediction: {Y2_prediction[i]}\treal: {Y2_test[i]}")
print(f"Prediction score: {classifier2.score(X2_test, Y2_test)}")

Test set results:
probability: 0.019	prediction: 0	real: 0
probability: 1.0	prediction: 1	real: 1
probability: 0.024	prediction: 0	real: 0
probability: 0.017	prediction: 0	real: 0
probability: 0.999	prediction: 1	real: 1
probability: 0.023	prediction: 0	real: 0
probability: 1.0	prediction: 1	real: 1
probability: 0.014	prediction: 0	real: 0
probability: 0.067	prediction: 0	real: 0
probability: 0.083	prediction: 0	real: 0
probability: 0.999	prediction: 1	real: 1
probability: 0.063	prediction: 0	real: 0
probability: 0.013	prediction: 0	real: 0
probability: 0.361	prediction: 0	real: 1
probability: 0.014	prediction: 0	real: 0
probability: 1.0	prediction: 1	real: 1
probability: 0.035	prediction: 0	real: 0
Prediction score: 0.9411764705882353


In [32]:
questions_to_remove = coefs.argsort()[:44]
print(questions_to_remove)

[21 50 22 12 23 53  6 46 45 47 44  9 34 20 36  7 42 26 41  4  3  8 24 49
 31 11 32 15 28 35 33 29 13 37 10 51 52  1 40 18  0 27 19 14]


In [33]:
dataset3 = dataset.copy()
for question in questions_to_remove:
    dataset3 = dataset3.drop(f"Atr{question + 1}", axis=1)

In [34]:
X3 = dataset3.iloc[:, :-1].values / 4.0
Y3 = dataset3.iloc[:, -1].values
X3_train, X3_test, Y3_train, Y3_test = train_test_split(X3, Y3, test_size=0.1)
classifier3 = LogisticRegression(solver='lbfgs')
classifier3.fit(X3_train, Y3_train)
y3_prediction = classifier3.predict(X3_test)

In [35]:
classifier3.score(X3_train, Y3_train)


0.9934640522875817

In [36]:
print("Test set results:")
y3_prediction_proba = classifier3.predict_proba(X3_test)
for i in range(17):
    print(f"probability: {round(y3_prediction_proba[i][1], 3)}\tprediction: {y3_prediction[i]}\treal: {Y3_test[i]}")
print(f"Prediction score: {classifier3.score(X3_test, Y3_test)}")

Test set results:
probability: 0.993	prediction: 1	real: 1
probability: 0.044	prediction: 0	real: 0
probability: 0.03	prediction: 0	real: 0
probability: 0.993	prediction: 1	real: 1
probability: 0.986	prediction: 1	real: 1
probability: 0.997	prediction: 1	real: 1
probability: 0.994	prediction: 1	real: 1
probability: 0.016	prediction: 0	real: 0
probability: 0.053	prediction: 0	real: 0
probability: 0.96	prediction: 1	real: 1
probability: 0.986	prediction: 1	real: 1
probability: 0.041	prediction: 0	real: 0
probability: 0.993	prediction: 1	real: 1
probability: 0.986	prediction: 1	real: 1
probability: 0.052	prediction: 0	real: 0
probability: 0.995	prediction: 1	real: 1
probability: 0.021	prediction: 0	real: 0
Prediction score: 1.0


In [37]:
dataset3

Unnamed: 0,Atr3,Atr6,Atr17,Atr18,Atr26,Atr31,Atr39,Atr40,Atr44,Atr49,Class
0,4,0,0,0,0,1,3,3,2,3,1
1,4,0,4,4,1,0,2,4,4,4,1
2,2,3,3,3,2,3,3,3,3,1,1
3,3,3,3,3,1,2,4,4,2,3,1
4,1,1,1,1,2,1,2,1,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...
165,0,0,0,0,0,4,1,0,0,1,0
166,0,0,0,0,0,0,1,1,1,2,0
167,0,0,0,0,1,1,1,2,0,0,0
168,0,0,0,0,0,0,4,1,0,2,0
