In [7]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import pickle

In [2]:
# Column names applied to the CSV; header=0 tells pandas that the file's own
# header row is present and should be replaced by these names.
column_names = [
    'season', 'age', 'child_disease', 'trauma', 'surgical',
    'fevers', 'alcohol', 'smoking', 'sitting_hours', 'fertility',
]
df = pd.read_csv('data/fertility.csv', header=0, names=column_names)
df.head()

Unnamed: 0,season,age,child_disease,trauma,surgical,fevers,alcohol,smoking,sitting_hours,fertility
0,spring,30,no,yes,yes,more than 3 months ago,once a week,occasional,16,Normal
1,spring,35,yes,no,yes,more than 3 months ago,once a week,daily,6,Altered
2,spring,27,yes,no,no,more than 3 months ago,hardly ever or never,never,9,Normal
3,spring,32,no,yes,yes,more than 3 months ago,hardly ever or never,never,7,Normal
4,spring,30,yes,yes,no,more than 3 months ago,once a week,never,9,Altered


In [3]:
'''
ATTRIBUTE INFORMATION

Season in which the analysis was performed. 1) winter, 2) spring, 3) Summer, 4) fall. (-1, -0.33, 0.33, 1) #OH
Age at the time of analysis. 18-36
Childish diseases (ie , chicken pox, measles, mumps, polio) 1) yes, 2) no. (0, 1) #OH
Accident or serious trauma 1) yes, 2) no. (0, 1) #OH
Surgical intervention 1) yes, 2) no. (0, 1) #OH
High fevers in the last year 1) less than three months ago, 2) more than three months ago, 3) no. (-1, 0, 1) #label
Frequency of alcohol consumption 1) several times a day, 2) every day, 3) several times a week, 4) once a week, 
            5) hardly ever or never (0, 1) 
Smoking habit 1) never, 2) occasional 3) daily. (-1, 0, 1)
Number of hours spent sitting per day 1-16 (0, 1)
Output: Diagnosis normal (N), altered (O)
'''

'\nATTRIBUTE INFORMATION\n\nSeason in which the analysis was performed. 1) winter, 2) spring, 3) Summer, 4) fall. (-1, -0.33, 0.33, 1) #OH\nAge at the time of analysis. 18-36\nChildish diseases (ie , chicken pox, measles, mumps, polio) 1) yes, 2) no. (0, 1) #OH\nAccident or serious trauma 1) yes, 2) no. (0, 1) #)H\nSurgical intervention 1) yes, 2) no. (0, 1) #OH\nHigh fevers in the last year 1) less than three months ago, 2) more than three months ago, 3) no. (-1, 0, 1) #label\nFrequency of alcohol consumption 1) several times a day, 2) every day, 3) several times a week, 4) once a week, \n            5) hardly ever or never (0, 1) \nSmoking habit 1) never, 2) occasional 3) daily. (-1, 0, 1)\nNumber of hours spent sitting per day ene-16 (0, 1)\nOutput: Diagnosis normal (N), altered (O)\n'

In [3]:
# Remove the season, age and fevers columns; the remaining columns are the
# ones used from here on.
unused_columns = ['season', 'age', 'fevers']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,child_disease,trauma,surgical,alcohol,smoking,sitting_hours,fertility
0,no,yes,yes,once a week,occasional,16,Normal
1,yes,no,yes,once a week,daily,6,Altered
2,yes,no,no,hardly ever or never,never,9,Normal
3,no,yes,yes,hardly ever or never,never,7,Normal
4,yes,yes,no,once a week,never,9,Altered


In [None]:
# One horizontal count plot per column, for every column except sitting_hours.
cat_col = df.drop(columns='sitting_hours')

for col in cat_col.columns:
    ax = sns.countplot(y=cat_col[col])
    ax.set_title(str(col))  # title each figure with the column it shows
    plt.show()

In [9]:
# Print how often each value occurs in the three yes/no columns,
# with a dashed rule after each column's counts.
separator = '-' * 20
for col in ('child_disease', 'trauma', 'surgical'):
    counts = df[col].value_counts()
    print(counts)
    print(separator)

yes    87
no     13
Name: child_disease, dtype: int64
--------------------
no     56
yes    44
Name: trauma, dtype: int64
--------------------
yes    51
no     49
Name: surgical, dtype: int64
--------------------


In [10]:
# Manual label encoding of the categorical features.
#
# alcohol and smoking are ordinal, so their integer codes must follow the real
# order of the categories (heaviest consumption -> 0, lightest -> highest code).
# 'several times a day' is more frequent than 'every day', so it takes the
# lower code.
encodings = {
    'alcohol': {
        'several times a day': 0,
        'every day': 1,
        'several times a week': 2,
        'once a week': 3,
        'hardly ever or never': 4,
    },
    'smoking': {'daily': 0, 'occasional': 1, 'never': 2},
    'child_disease': {'no': 0, 'yes': 1},
    'trauma': {'no': 0, 'yes': 1},
    'surgical': {'no': 0, 'yes': 1},
}

# Replace on the whole frame (nested dict = per-column mapping) and rebind the
# result. Calling `df[col].replace(..., inplace=True)` would act on a column
# selection, which pandas deprecates as chained assignment and which does not
# update `df` under copy-on-write.
df = df.replace(encodings)

# Cast explicitly so the encoded columns are integers regardless of how
# `replace` infers dtypes; the cast also fails loudly if a category was left
# unmapped. Re-running this cell is a no-op because the values are already ints.
df = df.astype({col: 'int64' for col in encodings})

In [11]:
df.head()

Unnamed: 0,child_disease,trauma,surgical,alcohol,smoking,sitting_hours,fertility
0,0,1,1,3,1,16,Normal
1,1,0,1,3,0,6,Altered
2,1,0,0,4,2,9,Normal
3,0,1,1,4,2,7,Normal
4,1,1,0,3,2,9,Altered


In [13]:
# Separate the target column (fertility) from the feature matrix.
target_col = 'fertility'
y = df[target_col]
X = df.drop(columns=target_col)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
split = train_test_split(X, y, test_size=0.2, random_state=18)
X_train, X_test, y_train, y_test = split

In [15]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training split. `fit` returns the
# estimator itself, so construction and fitting are chained.
model = LogisticRegression(random_state=18).fit(X_train, y_train)
model  # last expression, so the notebook displays the fitted estimator

LogisticRegression(random_state=18)

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the model on the held-out split.
y_pred = model.predict(X_test)

# Confusion matrix: rows are true labels, columns are predicted labels, both
# in sorted label order.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print(cm)
print(f'The model accuracy is {acc}')

[[ 0  2]
 [ 0 18]]
The model accuracy is 0.9


In [17]:
from sklearn.metrics import precision_score, recall_score

# Precision and recall with 'Normal' treated as the positive class.
binary_kwargs = dict(average='binary', pos_label='Normal')
precision = precision_score(y_test, y_pred, **binary_kwargs)
recall = recall_score(y_test, y_pred, **binary_kwargs)

print(f'the precision score is {precision}')
print(f'the recall score is {recall}')

the precision score is 0.9
the recall score is 1.0


In [14]:
model_name = 'logreg_model.pkl'

# pickle.dump needs a writable binary file object, so the file is opened with
# 'wb'. The context manager also guarantees the handle is flushed and closed
# before the file is read back.
with open(model_name, 'wb') as model_file:
    pickle.dump(model, model_file)

In [42]:
# Reload the persisted model, closing the file handle once it has been read.
# NOTE: only unpickle files you created yourself -- pickle.load can execute
# arbitrary code embedded in an untrusted file.
with open("logreg_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Two hand-written samples; only the first one is scored below.
# Feature order: child_disease, trauma, surgical, alcohol, smoking, sitting_hours
a = [[0, 0, 0, 0, 0, 11], [1, 2, 3, 1, 0, 11]]
sample = np.array(a[0]).reshape(1, -1)  # one sample -> shape (1, n_features)

print('KLASIFIKASI')  # "classification"
print(model.predict(sample))

print('PELUANG')  # "probability"
# The columns of predict_proba follow model.classes_, so the 'Altered' column
# is looked up by label instead of assuming it is column 0.
altered_idx = list(model.classes_).index('Altered')
peluang_altered = round(model.predict_proba(sample)[0][altered_idx] * 100, 2)
# Alias under the earlier variable name, in case another cell reads it; note
# that the value is the probability (in percent) of 'Altered', not 'Normal'.
peluang_normal = peluang_altered
print(peluang_altered)
print('You Have {} Probabilities of Having a Altered Fertility'.format(peluang_altered))

KLASIFIKASI
['Altered']
PELUANG
57.18
You Have 57.18 Probabilities of Having a Altered Fertility
