# Pregnancy Outcome Prediction using LinearSVC and Logistic Regression

In [1]:
#Importing necessary libraries

from matplotlib import style
import numpy as np
style.use('ggplot')

from sklearn.svm import LinearSVC

from sklearn import preprocessing

import pandas as pd

In [2]:
#Removing unnecessary columns and cleaning the data for further use in modeling

def scrubData(df):
    
    #Assuming that 1 corresponds to still birth
    dfStill = df[df.outcome_pregnancy == 1]
    print(dfStill.shape)
    #Assuming that 2 corresponds to live birth
    dfLve = df[df.outcome_pregnancy == 2]
    dfSpontArb = df[df.outcome_pregnancy == 4 ]
    
    totalStill = len(dfStill.index)
    totalSpontArb = len(dfSpontArb.index)
    
    totalOK = len(dfLve.index)
    
    dfStillAndLive = dfStill.append(dfLve)
    dAll = dfStillAndLive.append(dfSpontArb)
    print(dAll.shape)
    df = dAll
    
    df = df.sample(frac=1.0)
    
    #Removing unnecessary columns
    df.drop(['w_id','hl_id', 'client_w_id','state','district'], 1, inplace=True)
    df.drop(['psu_id', 'house_no','house_hold_no','year_of_intr', 'month_of_intr', 'date_of_intr'], 1, inplace=True)    
    df.drop(['other_int_code','identifcation_code', 'w_expall_status','w_status','twsi_id', 'client_twsi_id'], 1, inplace=True)
    df.drop(['fid','hh_id', 'client_hh_id','member_identity','father_serial_no', 'mother_serial_no'], 1, inplace=True)
    df.drop(['client_hl_id','building_no', 'hl_expall_status','sn'], 1, inplace=True)
    df.drop(['headname','ever_born', 'wt'], 1, inplace=True)
    df.drop(['fidx','as', 'as_binned'], 1, inplace=True)
    df.drop(['fidh','cdoi', 'anym','catage1','respondentname', 'rtelephoneno'], 1, inplace=True)
    df.drop(['healthscheme_1','healthscheme_2'], 1, inplace=True)
    df.drop(['new_born_alive_female','new_born_alive_male', 'new_born_alive_total','new_surviving_female'], 1, inplace=True)  
    df.drop(['new_surviving_male','new_surviving_total'], 1, inplace=True) 
    df.drop(['isdeadmigrated'], 1, inplace=True)
    df.drop(['isnewrecord','recordupdatedcount', 'recordupdatedcount','schedule_id','year', 'id'], 1, inplace=True)
    df.drop(['date_of_marriage', 'month_of_marriage', 'year_of_marriage'],1, inplace=True)
    df.drop(['year_of_birth', 'month_of_birth', 'date_of_birth'],1, inplace=True)
    df.drop(['compensation_after_ster', 'received_compensation_after_ster', 'received_compensation_ster_rs'],1, inplace=True)
    df.drop(['is_tubectomy','delivered_any_baby','born_alive_female','born_alive_male','born_alive_total'],1, inplace=True)
    df.drop(['surviving_female','surviving_male','surviving_total'],1, inplace=True)
    df.drop(['last_preg_no','previous_last_preg_no','second_last_preg_no', 'third_last_preg_no'],1, inplace=True)
    df.drop(['edt','occupation','marital', 'modern', 'traditional'],1, inplace=True)
    df.drop(['recordstatus'],1, inplace=True)
    df.drop(['hh_serial_no'],1, inplace=True)
    df.drop(['sex'],1, inplace=True)
    df.drop(['religion'],1, inplace=True) 
    df.drop(['currently_attending_school'],1, inplace=True) 
    df.drop(['reason_for_not_attending_school'],1, inplace=True)
    df.drop(['highest_qualification'],1, inplace=True)
    df.drop(['serial_no'],1, inplace=True)
    df.drop(['disability_status'],1, inplace=True)
    df.drop(['occupation_status'],1, inplace=True)
    df.drop(['house_status'],1, inplace=True)
    df.drop(['householdstatus'],1, inplace=True)
    df.drop(['isheadchanged'],1, inplace=True)
    df.drop(['land_possessed'],1, inplace=True)
    df.drop(['cart'],1, inplace=True)
    df.drop(['current_mar_status'],1, inplace=True)
    df.drop(['counselled_for_menstrual_hyg'],1, inplace=True)
    df.drop(['aware_abt_haf'],1, inplace=True)
    df.drop(['aware_abt_ort_ors'],1, inplace=True)
    df.drop(['aware_abt_ort_ors_zinc'],1, inplace=True)
    df.drop(['aware_abt_danger_signs_new_born'],1, inplace=True)  
    df.drop(['residancial_status'],1, inplace=True)  
    df.drop(['iscoveredbyhealthscheme'],1, inplace=True)  
    df.drop(['housestatus'],1, inplace=True)  
    df.drop(['health_prob_afters_fp_use'],1, inplace=True)  
    df.drop(['is_husband_living_with_you'],1, inplace=True)  
    df.drop(['months_of_preg_first_anc'],1, inplace=True)
    df.drop(['age_at_first_conception'],1, inplace=True)
    df.drop(['regular_treatment_source'],1, inplace=True) 
    df.drop(['regular_treatment'],1, inplace=True) 
    df.drop(['diagnosis_source'],1, inplace=True)
    df.drop(['diagnosed_for'],1, inplace=True)
    df.drop(['sought_medical_care'],1, inplace=True)
    df.drop(['symptoms_pertaining_illness'],1, inplace=True)
    df.drop(['treatment_source'],1, inplace=True)
    df.drop(['illness_type'],1, inplace=True)
    df.drop(['injury_treatment_type'],1, inplace=True)
    df.drop(['currently_dead_or_out_migrated'],1, inplace=True)
    df.drop(['twsi_expall_status'],1, inplace=True)
    df.drop(['currently_widow'],1, inplace=True)
    
    
    df.replace('', np.nan)
    
    #Converting to numeric form and filling 0 in place of NaN values
    df.apply(pd.to_numeric, errors='coerce')
    df.fillna(0, inplace=True)
    
    
    df = df[df.result_of_interview == 1]
    df.drop(['result_of_interview'],1, inplace=True)
    
    print(df.shape)
    return df

In [3]:
#Importing the larger dataset on which predictions are to be made 

df2 = pd.read_csv('5.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df2

Unnamed: 0,w_id,hl_id,client_w_id,state,district,rural,stratum_code,psu_id,house_no,house_hold_no,...,anym,respondentname,rtelephoneno,isnewrecord,recordupdatedcount,recordstatus,schedule_id,year,id,v204
0,,,,5,13,1,2,100426933,4,1,...,,,,,1.0,2.0,2,3,1490121,
1,,,,5,13,1,2,100426778,8,1,...,,,,,1.0,2.0,2,3,1490122,
2,,,,5,13,1,2,100427331,11,1,...,,,,,1.0,2.0,2,3,1490123,
3,,,,5,13,1,2,100427695,20,1,...,,,,,1.0,2.0,2,3,1490124,
4,,,,5,13,1,2,100427501,21,1,...,,,,,1.0,2.0,2,3,1490125,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941126,,,,5,2,2,0,100505198,36,1,...,,,,,1.0,2.0,2,2,9023060,
941127,,,,5,2,2,0,100505166,43,1,...,,,,,1.0,2.0,2,2,9023061,
941128,,,,5,2,2,0,100505164,44,1,...,,,,1.0,1.0,2.0,2,2,9023062,
941129,,,,5,2,2,0,100505183,46,1,...,,,,,1.0,2.0,2,2,9023063,


In [5]:
#Generating a smaller dataset from the larger one. On this smaller dataset, the model will be trained.

alpha = df2.sample(frac = 0.02656378)

In [6]:
alpha

Unnamed: 0,w_id,hl_id,client_w_id,state,district,rural,stratum_code,psu_id,house_no,house_hold_no,...,anym,respondentname,rtelephoneno,isnewrecord,recordupdatedcount,recordstatus,schedule_id,year,id,v204
69250,,,,5,6,1,1,100393666,85,1,...,,,,,1.0,2.0,2,3,1345644,
505382,,,,5,8,1,2,100698925,219,2,...,,,,,1.0,2.0,2,2,9204259,
31505,,,,5,5,1,2,100188641,476,1,...,,,,0.0,1.0,2.0,2,3,1333501,
347159,,,,5,4,1,1,100042663,5,2,...,,,,,1.0,2.0,2,2,9101061,
937112,45201.0,71727,,5,5,2,0,100177244,35,1,...,,,,,,,2,1,8872609,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494928,,,,5,7,1,1,100687874,192,1,...,,,,,1.0,2.0,2,2,9170596,
422821,,,,5,5,1,1,100278244,46,1,...,,,,,1.0,2.0,2,2,9141045,
118632,,,,5,4,1,1,100136521,192,1,...,,,,,1.0,2.0,2,3,1269098,
384982,,,,5,5,2,0,100298341,58,1,...,,,,,1.0,2.0,2,2,9129921,


In [7]:
#Running the data cleaning function on the smaller sample dataset

df = scrubData(alpha)

(3570, 202)
(23997, 202)
(23703, 82)


In [8]:
df

Unnamed: 0,rural,stratum_code,age,marital_status,mother_age_when_baby_was_born,outcome_pregnancy,is_vasectomy,is_copper_t,is_pills_daily,is_piils_weekly,...,is_sewing_machine,is_bicycle,is_scooter,is_car,is_tractor,is_water_pump,ever_conceived,no_of_times_conceived,is_injectable_contraceptive,v204
409213,1,1,29,3,20.0,2.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,0.0
247631,2,0,41,3,20.0,2.0,1.0,1.0,1.0,1.0,...,1.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,0.0
475158,1,1,37,3,18.0,2.0,1.0,1.0,1.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,5.0,1.0,0.0
446891,2,0,39,3,26.0,2.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,0.0
904102,2,0,36,3,27.0,2.0,1.0,1.0,1.0,1.0,...,2.0,1.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478520,1,1,33,3,23.0,2.0,1.0,1.0,1.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,4.0,1.0,0.0
320581,1,1,26,3,18.0,2.0,1.0,2.0,1.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,2.0,0.0
374521,1,1,33,3,21.0,2.0,1.0,1.0,1.0,1.0,...,1.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,1.0,0.0
272237,2,0,32,3,21.0,2.0,1.0,1.0,1.0,1.0,...,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,0.0


In [14]:
#Features for model training

X = np.array(df.drop(['outcome_pregnancy'], 1))
print(X.shape)

(23703, 81)


In [15]:
for col in df.columns:
    print(col, df[col].unique())

rural [1 2]
stratum_code [1 0 2]
age [29 41 37 39 36 27 26 47 40 49 22 23 35 28 32 34 31 42 24 19 48 20 33 44
 46 30 43 38 25 45 21 17 18 16 15]
marital_status [3 4 5 7 6]
mother_age_when_baby_was_born [20. 18. 26. 27. 21. 29. 22. 28.  0. 23. 19. 25. 17. 35. 24. 16. 15. 31.
 30. 14. 39. 33. 36. 13. 32. 34. 37. 40. 38. 12. 41.]
outcome_pregnancy [2. 1.]
is_vasectomy [1. 2. 0.]
is_copper_t [1. 2. 0.]
is_pills_daily [1. 2. 0.]
is_piils_weekly [1. 2. 0.]
is_emergency_contraceptive [1. 2. 0.]
is_condom [1. 2. 0.]
is_moder_methods [2. 1. 0.]
is_contraceptive [2. 1. 0.]
is_periodic_abstinence [1. 2. 0.]
is_withdrawal [1. 2. 0.]
is_amenorrahoea [1. 2. 0.]
is_other_traditional_method [2. 1. 0.]
is_currently_pregnant [2. 1. 0. 3.]
pregnant_month [0. 6. 7. 9. 2. 3. 8. 4. 5. 1.]
is_anc_registered [0. 2. 1.]
willing_to_get_pregnant [0. 1. 4. 3. 2.]
is_currently_menstruating [1. 4. 2. 0. 5. 3. 6.]
when_you_bcome_mother_last_time [1. 0. 3. 2. 4.]
is_any_fp_methos_used [1. 0. 2.]
fp_method_used [ 4.  

In [16]:
df.iloc[:,40] = df.iloc[:,40].replace(' ',0)
df.iloc[:,41] = df.iloc[:,41].replace(' ',0)
df.iloc[:,58] = df.iloc[:,58].replace(' ',0)

In [17]:
df.iloc[:,40]

409213    0
247631    0
475158    0
446891    0
904102    0
         ..
478520    0
320581    0
374521    0
272237    0
887441    0
Name: during_pregnancy, Length: 23703, dtype: object

In [18]:
#Standardizing the feature dataset
X = preprocessing.scale(X)

#Prediction variable
y = np.array(df['outcome_pregnancy'])

##  LinearSVC Model

In [19]:
#Generating the object for LinearSVC method and fitting it on the smaller sample dataset for training

clf = LinearSVC(max_iter=90000)
clf.fit(X,y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=90000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [20]:
correct = 0

In [21]:
#Prediction on the training dataset

for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
   
    if prediction[0] == y[i]:
        correct += 1

In [22]:
#Accuracy check

print("result: ", correct/len(X))

result:  0.9105598447453909


In [23]:
#Running the cleaning function on the actual larger dataset

df2 = scrubData(df2)

(135796, 202)
(903180, 202)
(891589, 82)


In [28]:
#Feature dataset

X2 = np.array(df2.drop(['outcome_pregnancy'], 1))
print(X2.shape)

(891589, 81)


In [29]:
for col in df2.columns:
    print(col, df2[col].unique())

rural [1 2]
stratum_code [1 0 2]
age [30 40 27 46 22 42 39 44 32 36 35 31 28 37 29 49 24 47 25 18 41 38 26 21
 43 23 20 34 45 33 48 19 17 15 16]
marital_status [3 5 7 4 6 2]
mother_age_when_baby_was_born [16. 20.  0. 23. 17. 19. 22. 18. 27. 25. 24. 15. 21. 32. 26. 28. 29. 31.
 33. 14. 30. 13. 34. 35. 12. 37. 41. 38. 36. 39. 42. 40. 49. 43. 46. 44.
 48. 45. 47.]
outcome_pregnancy [2. 1.]
is_vasectomy [1. 0. 2.]
is_copper_t [2. 1. 0.]
is_pills_daily [2. 1. 0.]
is_piils_weekly [2. 1. 0.]
is_emergency_contraceptive [2. 1. 0.]
is_condom [1. 2. 0.]
is_moder_methods [2. 1. 0.]
is_contraceptive [2. 1. 0.]
is_periodic_abstinence [2. 1. 0.]
is_withdrawal [2. 1. 0.]
is_amenorrahoea [2. 1. 0.]
is_other_traditional_method [2. 1. 0.]
is_currently_pregnant [2. 1. 0. 3.]
pregnant_month [0. 9. 6. 7. 5. 4. 2. 3. 1. 8.]
is_anc_registered [0. 1. 2.]
willing_to_get_pregnant [0. 1. 2. 4. 3.]
is_currently_menstruating [1. 2. 0. 4. 5. 3. 6.]
when_you_bcome_mother_last_time [1. 0. 2. 4. 3.]
is_any_fp_methos_us

In [30]:
df2.iloc[:,40] = df2.iloc[:,40].replace(' ',0)
df2.iloc[:,41] = df2.iloc[:,41].replace(' ',0)
df2.iloc[:,58] = df2.iloc[:,58].replace(' ',0)

In [31]:
#Standardizing the feature dataset
X2 = preprocessing.scale(X2)

#Prediction variable
y2 = np.array(df2['outcome_pregnancy'])

In [32]:
correct = 0

In [33]:
#Prediction (Larger dataset)

for i in range(len(X2)):
    predict_me = np.array(X2[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
   
    if prediction[0] == y2[i]:
        correct += 1

In [34]:
#Accuracy check

print("2nd result: ", correct/len(X2))

2nd result:  0.9102961117734741


## Logistic Regression Model

In [40]:
#Importing Logistic Regression

from sklearn.linear_model import LogisticRegression

In [42]:
#Generating the object for Logistic Regression method and fitting it on the smaller sample dataset for training

log_reg = LogisticRegression()
log_reg.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
#Prediction on the training dataset

correct2 = 0

for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = log_reg.predict(predict_me)
   
    if prediction[0] == y[i]:
        correct2 += 1

In [44]:
#Accuracy Check

print("1st result for logistic model: ", correct2/len(X))

1st result for logistic model:  0.9118676960722272


In [48]:
#Prediction on the larger dataset

correct2 = 0

for i in range(len(X2)):
    predict_me = np.array(X2[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = log_reg.predict(predict_me)
   
    if prediction[0] == y2[i]:
        correct2 += 1

In [50]:
#Accuracy Check

print("2nd result for logistic model: ", correct2/len(X2))

2nd result for logistic model:  0.9120244866188345
