In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): silences every warning category for the whole kernel session,
# including pandas deprecation / FutureWarning messages — confirm this is intended.
warnings.filterwarnings('ignore')

In [12]:
# Read the FedCycle CSV (path is relative to the working directory) into the working frame.
data_file = "data/FedCycleData.csv"
df = pd.read_csv(data_file)
# Show five randomly drawn rows; no seed is set, so the rows differ from run to run.
df.sample(n=5)

Unnamed: 0,ClientID,CycleNumber,Group,CycleWithPeakorNot,ReproductiveCategory,LengthofCycle,MeanCycleLength,EstimatedDayofOvulation,LengthofLutealPhase,FirstDayofHigh,...,Method,Prevmethod,Methoddate,Whychart,Nextpreg,NextpregM,Spousesame,SpousesameM,Timeattemptpreg,BMI
239,nfp8058,10,1,1,0,30,,15,15,11,...,,,,,,,,,,
762,nfp8168,9,0,1,0,29,,16,13,13,...,,,,,,,,,,
42,nfp8122,43,0,1,0,28,,15,13,8,...,,,,,,,,,,
1063,nfp8233,4,1,1,1,29,,25,4,18,...,,,,,,,,,,
851,nfp8177,4,0,1,0,25,,11,14,8,...,,,,,,,,,,


In [11]:
# Print the frame summary: row range, and for each column its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1665 entries, 0 to 1664
Data columns (total 80 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   ClientID                    1665 non-null   object
 1   CycleNumber                 1665 non-null   int64 
 2   Group                       1665 non-null   int64 
 3   CycleWithPeakorNot          1665 non-null   int64 
 4   ReproductiveCategory        1665 non-null   int64 
 5   LengthofCycle               1665 non-null   int64 
 6   MeanCycleLength             1665 non-null   object
 7   EstimatedDayofOvulation     1665 non-null   object
 8   LengthofLutealPhase         1665 non-null   object
 9   FirstDayofHigh              1665 non-null   object
 10  TotalNumberofHighDays       1665 non-null   object
 11  TotalHighPostPeak           1665 non-null   object
 12  TotalNumberofPeakDays       1665 non-null   object
 13  TotalDaysofFertility        1665 non-null   obje

In [20]:
# Per-column NaN tally, computed once and reused for the grand total and the display.
nulls_per_column = df.isna().sum()
print("Total null values", sum(nulls_per_column))
nulls_per_column

Total null values 0


ClientID                0
CycleNumber             0
Group                   0
CycleWithPeakorNot      0
ReproductiveCategory    0
                       ..
NextpregM               0
Spousesame              0
SpousesameM             0
Timeattemptpreg         0
BMI                     0
Length: 80, dtype: int64

## Data types of each column

In [24]:
# Column dtypes and column labels, bound to the same names as before.
types, columns = df.dtypes, df.columns

# Pair each column's dtype with its name, in column order.
[(col_dtype, col_name) for col_dtype, col_name in zip(types, columns)]

[(dtype('O'), 'ClientID'),
 (dtype('int64'), 'CycleNumber'),
 (dtype('int64'), 'Group'),
 (dtype('int64'), 'CycleWithPeakorNot'),
 (dtype('int64'), 'ReproductiveCategory'),
 (dtype('int64'), 'LengthofCycle'),
 (dtype('O'), 'MeanCycleLength'),
 (dtype('O'), 'EstimatedDayofOvulation'),
 (dtype('O'), 'LengthofLutealPhase'),
 (dtype('O'), 'FirstDayofHigh'),
 (dtype('O'), 'TotalNumberofHighDays'),
 (dtype('O'), 'TotalHighPostPeak'),
 (dtype('O'), 'TotalNumberofPeakDays'),
 (dtype('O'), 'TotalDaysofFertility'),
 (dtype('O'), 'TotalFertilityFormula'),
 (dtype('O'), 'LengthofMenses'),
 (dtype('O'), 'MeanMensesLength'),
 (dtype('O'), 'MensesScoreDayOne'),
 (dtype('O'), 'MensesScoreDayTwo'),
 (dtype('O'), 'MensesScoreDayThree'),
 (dtype('O'), 'MensesScoreDayFour'),
 (dtype('O'), 'MensesScoreDayFive'),
 (dtype('O'), 'MensesScoreDaySix'),
 (dtype('O'), 'MensesScoreDaySeven'),
 (dtype('O'), 'MensesScoreDayEight'),
 (dtype('O'), 'MensesScoreDayNine'),
 (dtype('O'), 'MensesScoreDayTen'),
 (dtype('O')

## BMI
Many values in this column are missing; they are encoded as a single-space string `" "` rather than as NaN.

In [26]:
# Treat blank BMI entries (empty or whitespace-only strings, e.g. " ") as missing,
# then cast the column to float. Matching any all-whitespace string rather than
# exactly one space keeps astype(float) from raising
# "ValueError: could not convert string to float" on variants such as "" or "  ".
df['BMI'] = df['BMI'].replace(r"^\s*$", np.nan, regex=True).astype(float)

In [29]:
# Share of rows with no BMI value: NaN count divided by the total number of rows.
df["BMI"].isna().sum() / df.shape[0]

0.9213213213213213

__92% null values in BMI column__



# Replace all the `" "` values with nan

In [32]:
# Frame-wide pass: every cell that is exactly a single space becomes NaN.
df = df.replace(to_replace=" ", value=np.nan)

In [40]:
# Re-print the frame summary; after the " " -> NaN replacement the non-null
# counts now reflect the blanks as missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1665 entries, 0 to 1664
Data columns (total 80 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ClientID                    1665 non-null   object 
 1   CycleNumber                 1665 non-null   int64  
 2   Group                       1665 non-null   int64  
 3   CycleWithPeakorNot          1665 non-null   int64  
 4   ReproductiveCategory        1665 non-null   int64  
 5   LengthofCycle               1665 non-null   int64  
 6   MeanCycleLength             141 non-null    object 
 7   EstimatedDayofOvulation     1515 non-null   object 
 8   LengthofLutealPhase         1514 non-null   object 
 9   FirstDayofHigh              1407 non-null   object 
 10  TotalNumberofHighDays       1653 non-null   object 
 11  TotalHighPostPeak           1662 non-null   object 
 12  TotalNumberofPeakDays       1649 non-null   object 
 13  TotalDaysofFertility        1634 

## Features with no more than 90% missing values

In [58]:
# Highest share of missing values (in percent) a column may have and still be kept.
MAX_NULL_PERCENT = 90

# Percentage of NaN entries per column.
null_percentages = df.isnull().mean() * 100
# Names of the columns at or below the cut-off.
selected_features = null_percentages[null_percentages <= MAX_NULL_PERCENT].index.tolist()
# .copy() gives an independent frame, so later assignments on selected_df do not
# go through pandas' SettingWithCopy path (whose warning is silenced in this notebook).
selected_df = df[selected_features].copy()
selected_df.shape

(1665, 26)

In [63]:
# Fraction (0-1) of missing entries in each retained column.
selected_df.isna().mean()

ClientID                      0.000000
CycleNumber                   0.000000
Group                         0.000000
CycleWithPeakorNot            0.000000
ReproductiveCategory          0.000000
LengthofCycle                 0.000000
EstimatedDayofOvulation       0.090090
LengthofLutealPhase           0.090691
FirstDayofHigh                0.154955
TotalNumberofHighDays         0.007207
TotalHighPostPeak             0.001802
TotalNumberofPeakDays         0.009610
TotalDaysofFertility          0.018619
TotalFertilityFormula         0.001201
LengthofMenses                0.002402
MensesScoreDayOne             0.002402
MensesScoreDayTwo             0.002402
MensesScoreDayThree           0.015015
MensesScoreDayFour            0.052252
MensesScoreDayFive            0.260661
MensesScoreDaySix             0.637838
MensesScoreDaySeven           0.865465
TotalMensesScore              0.002402
NumberofDaysofIntercourse     0.000601
IntercourseInFertileWindow    0.000601
UnusualBleeding          