In [1]:
import numpy as np
import pandas as pd
import xlrd
# Read the divorce dataset downloaded from http://archive.ics.uci.edu/ml/datasets/Divorce+Predictors+data+set
df = pd.read_csv("divorce_data/divorce.csv")
df

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
0,2,2,4,1,0,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,3,4,4,4,4,2,2,1
2,2,2,2,2,1,3,2,1,1,2,...,3,2,3,1,1,1,2,2,2,1
3,3,2,3,2,3,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,2,2,1,1,1,1,0,0,0,0,...,2,1,2,3,2,2,2,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,0,0,0,0,0,0,0,0,0,0,...,1,0,4,1,1,4,2,2,2,0
166,0,0,0,0,0,0,0,0,0,0,...,4,1,2,2,2,2,3,2,2,0
167,1,1,0,0,0,0,0,0,0,1,...,3,0,2,0,1,1,3,0,0,0
168,0,0,0,0,0,0,0,0,0,0,...,3,3,2,2,3,2,4,3,1,0


In [2]:
# The values have various ranges. Normalizing Dataset so that the values are distributed between 0 to 1.
X = df.values[:,0:54]
Y = df.values[:,54]
standard_deviation = np.std(X,axis = 0)
mean = np.mean(X,axis = 0)
X = X-mean/standard_deviation
standard_deviation.shape

(54,)

In [3]:
# Split the Divirce dataset Into Training And Testing Datasets to help determine the accuracy of the model later.
# X values are the attribute values (answers to the 54 questions)
# Y values are the divorce outcome (Yes/No or 1/0) provided in the 55th column in the Divorce dataset
# training values are the training dataset values
# testing values are for testing datasets values
#
# X_training - Attribute values in the training dataset
# Y_training - Divorce outcomes for the training dataset
# X_testing - Attribute values in the testing dataset
# Y_testing - Divorce outcomes for the testing dataset

from sklearn.model_selection import train_test_split
X_training, X_testing, y_training, y_testing = train_test_split( X, Y, test_size=0.25, random_state=42)

In [4]:
# Logistic Regression Algorithm
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression(random_state=0,solver = "liblinear")
# Create Model
lg.fit(X_training,y_training)
# Predict outcome using the Attribute values from the testing dataset
y_predict = lg.predict(X_testing)
# Compute Accuracy of the model
print("Accuracy = ",((np.sum(y_predict==y_testing)/y_testing.shape[0])*100),"%",sep="")


Accuracy = 100.0%


In [5]:
print (y_testing)

[0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1
 0 0 0 0 0 1]


In [6]:
print (y_predict)

[0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1
 0 0 0 0 0 1]


In [7]:
# Naive Bayes Algorithm: Using the training and testing datasets, find the accuracy of the algorithm
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_training, y_training)
y_predict = nb.predict(X_testing)
print("Model Accuracy = ",((np.sum(y_predict==y_testing)/y_testing.shape[0])*100),"%",sep="")

Model Accuracy = 95.34883720930233%


In [8]:
print(y_predict)

[0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 0 1
 0 0 0 0 0 1]


In [9]:
# K-Nearest Neighbors (KNN) Algorithm: Using the training and testing datasets, rind the accuracy of the algorithm
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 15)
knn.fit(X_training, y_training)
y_predict = knn.predict(X_testing)
print("Accuracy (K-Nearest Neighbors (KKN) Algorithm) = ",((np.sum(y_predict==y_testing)/y_testing.shape[0])*100),"%",sep="")

Accuracy (K-Nearest Neighbors (KKN) Algorithm) = 97.67441860465115%


In [10]:
# Decision Tree Algorithm: Using the training and testing datasets, rind the accuracy of the algorithm
from sklearn.tree import DecisionTreeClassifier
decisiontree = DecisionTreeClassifier(max_depth = 10, random_state = 101, max_features = None, min_samples_leaf = 15)
decisiontree.fit(X_training,y_training)
y_predict = decisiontree.predict(X_testing)
print("Accuracy (Decision Tree Algorithm) = ",((np.sum(y_predict==y_testing)/y_testing.shape[0])*100),"%",sep="")

Accuracy (Decision Tree Algorithm) = 95.34883720930233%


In [11]:
# Random Forest Algorithm: Using the training and testing datasets, rind the accuracy of the algorithm
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(n_estimators = 70, oob_score = True, n_jobs = -1, random_state = 101, max_features = None, min_samples_leaf = 30)
randomforest.fit(X_training, y_training)
y_predict = randomforest.predict(X_testing)
print("Accuracy (Rabdom Forest Classifier Algorithm) = ",((np.sum(y_predict==y_testing)/y_testing.shape[0])*100),"%",sep="")

Accuracy (Rabdom Forest Classifier Algorithm) = 97.67441860465115%


In [12]:
# Support Vector Machine (SVM) Algorithm: Using the training and testing datasets, rind the accuracy of the algorithm
from sklearn.svm import SVC
svm = SVC(kernel = "linear", C = 0.025, random_state = 101)
svm.fit(X_training,y_training)
y_predict = svm.predict(X_testing)
print("Accuracy (Support Vector Machine (SVM) Algorithm) = ",((np.sum(y_predict==y_testing)/y_testing.shape[0])*100),"%",sep="")

Accuracy (Support Vector Machine (SVM) Algorithm) = 97.67441860465115%


In [13]:
# Neural Networks Algorithm: Using the training and testing datasets, rind the accuracy of the algorithm
from keras.models import Sequential
from keras.layers import Dense
neuralnet = Sequential()
neuralnet.add(Dense(128, activation = "relu", input_dim=X_training.shape[1]))
neuralnet.add(Dense(64, activation = "relu"))
neuralnet.add(Dense(32, activation = "relu"))
neuralnet.add(Dense(1, activation = "sigmoid"))
neuralnet.summary()
neuralnet.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])
neuralnet.fit(X_training, y_training, epochs=10, batch_size=2000)
accuracy = neuralnet.evaluate(X_testing, y_testing)[1]
print('Accuracy (Neural Network Algorithm): %.2f' % (accuracy*100))

Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               7040      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 17,409
Trainable params: 17,409
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy (Neural Network Algorithm): 100.00


In [14]:
# Further enhancement: Rank the algorithms according to their accuracy metrics.
#
# Compute the Confusion Matrix based on the actual and predicted divorce outcomes
# Find the accuracy score taking the actual and predicted values
# Generate a Classification report
#
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
actual = y_testing
predicted = y_predict 
results = confusion_matrix(actual, predicted) 
print('Confusion Matrix :')
print(results) 
print('Accuracy Score :',accuracy_score(actual, predicted)) 
print('Report : ')
print(classification_report(actual, predicted)) 

Confusion Matrix :
[[20  0]
 [ 1 22]]
Accuracy Score : 0.9767441860465116
Report : 
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       1.00      0.96      0.98        23

    accuracy                           0.98        43
   macro avg       0.98      0.98      0.98        43
weighted avg       0.98      0.98      0.98        43



In [15]:
# PRODUCTION Divorce Prediction
#
# Predict Divorce outcome for one subject based on the production values entered on the website.
#
# A new user goes to the Divorce Prediction webpage and answers the questionaire of 54 questions.
# The HTML5 code of the webpage writes the values into a CSV file. This CSV file read here and the
# Divorce outcome is predicted based on the production values provided.
#

# Read Production Data of one subject
pdf = pd.read_csv("divorce_data/production.csv")
pdf

Unnamed: 0,1Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
0,2,0,0,0,0,0,0,0,0,0,...,2,1,0,3,3,2,3,2,1,0


In [16]:
# Normalizing Production Data and splitting it into Attributes and Outcome
Xp = pdf.values[:,0:54]
Yp = pdf.values[:,54]

In [17]:
# Since Logistic Regression Algorithm provided the highest accuracy (100%), we will use it to predict production data
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression(random_state=0,solver = "liblinear")
lg.fit(X_training,y_training)
y_predict = lg.predict(Xp)
print("Prediction (0 - not divorced, 1 - divorced) = ", y_predict)
print("Actual Divorced or not (0 - not divorced, 1 - divorced) = ", Yp)
print("Accuracy (Logistic Regression Algorithm) = ",((np.sum(y_predict==Yp)/y_testing.shape[0])*100),"%",sep="")

Prediction (0 - not divorced, 1 - divorced) =  [1]
Actual Divorced or not (0 - not divorced, 1 - divorced) =  [0]
Accuracy (Logistic Regression Algorithm) = 0.0%
