In [1]:
import pandas as pd

In [2]:
# Load the raw data file. Column names are supplied explicitly via `names`,
# so the first line of the file is read as data rather than as a header.
# NOTE(review): the path is absolute and machine-specific.
df_contraceptive = pd.read_csv(
    'C:/Users/shama/Documents/Thesis/implementation/data/contraceptive/cmc.data',
    names=[
        'wage',
        'weducation',
        'heducation',
        'numborn',
        'wismuslim',
        'wwork',
        'hocc',
        'sol',
        'good_media_exposure',
        'contraceptive',
    ],
)
df_contraceptive

Unnamed: 0,wage,weducation,heducation,numborn,wismuslim,wwork,hocc,sol,good_media_exposure,contraceptive
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1
...,...,...,...,...,...,...,...,...,...,...
1468,33,4,4,2,1,0,2,4,0,3
1469,33,4,4,3,1,1,1,4,0,3
1470,39,3,3,8,1,0,1,4,0,3
1471,33,3,3,4,1,0,2,2,0,3


# Translate relevant columns to boolean

In [3]:
# Turn the 0/1 indicator columns into booleans.
# 'wismuslim' keeps its polarity: any non-zero raw value becomes True.
df_contraceptive['wismuslim'] = df_contraceptive['wismuslim'].ne(0)
# 'wwork' and 'good_media_exposure' are inverted: a raw 0 becomes True and
# any non-zero raw value becomes False.
# NOTE(review): presumably the raw file encodes 0 = "yes"/"good" for these two
# columns -- confirm against the dataset description.
# Caution: re-running this cell on already-converted data flips both columns again.
df_contraceptive['wwork'] = df_contraceptive['wwork'].eq(0)
df_contraceptive['good_media_exposure'] = df_contraceptive['good_media_exposure'].eq(0)

In [4]:
# Show the frame to check the boolean columns (wismuslim, wwork, good_media_exposure).
df_contraceptive

Unnamed: 0,wage,weducation,heducation,numborn,wismuslim,wwork,hocc,sol,good_media_exposure,contraceptive
0,24,2,3,3,True,False,2,3,True,1
1,45,1,3,10,True,False,3,4,True,1
2,43,2,3,7,True,False,3,4,True,1
3,42,3,2,9,True,False,3,3,True,1
4,36,3,3,8,True,False,3,2,True,1
...,...,...,...,...,...,...,...,...,...,...
1468,33,4,4,2,True,True,2,4,True,3
1469,33,4,4,3,True,False,1,4,True,3
1470,39,3,3,8,True,True,1,4,True,3
1471,33,3,3,4,True,True,2,2,True,3


# Next, the categorical features that are encoded as integers need to be translated to categories, i.e. strings. The relevant features are: weducation, heducation, hocc, sol, contraceptive

### For weducation, heducation, sol the same mapping is used so we do those first

In [5]:
# One shared four-level scale: integer codes 1..4 become string labels.
mapping = dict(enumerate(("low", "low-moderate", "moderate-high", "high"), start=1))

# Apply the same code -> label mapping to each of the three columns, in place.
df_contraceptive.replace(
    to_replace=dict.fromkeys(['weducation', 'heducation', 'sol'], mapping),
    inplace=True,
)

In [6]:
# Show the frame to check the relabelled weducation, heducation and sol columns.
df_contraceptive

Unnamed: 0,wage,weducation,heducation,numborn,wismuslim,wwork,hocc,sol,good_media_exposure,contraceptive
0,24,low-moderate,moderate-high,3,True,False,2,moderate-high,True,1
1,45,low,moderate-high,10,True,False,3,high,True,1
2,43,low-moderate,moderate-high,7,True,False,3,high,True,1
3,42,moderate-high,low-moderate,9,True,False,3,moderate-high,True,1
4,36,moderate-high,moderate-high,8,True,False,3,low-moderate,True,1
...,...,...,...,...,...,...,...,...,...,...
1468,33,high,high,2,True,True,2,high,True,3
1469,33,high,high,3,True,False,1,high,True,3
1470,39,moderate-high,moderate-high,8,True,True,1,high,True,3
1471,33,moderate-high,moderate-high,4,True,True,2,low-moderate,True,3


# It is unclear what the categories 1, 2, 3, 4 of husband's occupation (hocc) are supposed to mean. We keep them for now (?) and map them to their string equivalents: {1: "one", 2: "two", ...}

In [7]:
# Spell the integer codes 1..4 out as words so 'hocc' holds strings.
mapping = dict(enumerate(("one", "two", "three", "four"), start=1))

# Only the 'hocc' column is touched; the frame is modified in place.
df_contraceptive.replace(to_replace={'hocc': mapping}, inplace=True)

In [8]:
# Show the frame to check the relabelled hocc column.
df_contraceptive

Unnamed: 0,wage,weducation,heducation,numborn,wismuslim,wwork,hocc,sol,good_media_exposure,contraceptive
0,24,low-moderate,moderate-high,3,True,False,two,moderate-high,True,1
1,45,low,moderate-high,10,True,False,three,high,True,1
2,43,low-moderate,moderate-high,7,True,False,three,high,True,1
3,42,moderate-high,low-moderate,9,True,False,three,moderate-high,True,1
4,36,moderate-high,moderate-high,8,True,False,three,low-moderate,True,1
...,...,...,...,...,...,...,...,...,...,...
1468,33,high,high,2,True,True,two,high,True,3
1469,33,high,high,3,True,False,one,high,True,3
1470,39,moderate-high,moderate-high,8,True,True,one,high,True,3
1471,33,moderate-high,moderate-high,4,True,True,two,low-moderate,True,3


# Lastly we map contraceptive, where 1 = no-use, 2 = long-term, 3 = short-term

In [9]:
# Integer codes 1..3 of 'contraceptive' become descriptive string labels.
mapping = dict(enumerate(("no-use", "long-term", "short-term"), start=1))

# Only the 'contraceptive' column is touched; the frame is modified in place.
df_contraceptive.replace(to_replace={'contraceptive': mapping}, inplace=True)

In [10]:
# Show the frame to check the relabelled contraceptive column.
df_contraceptive

Unnamed: 0,wage,weducation,heducation,numborn,wismuslim,wwork,hocc,sol,good_media_exposure,contraceptive
0,24,low-moderate,moderate-high,3,True,False,two,moderate-high,True,no-use
1,45,low,moderate-high,10,True,False,three,high,True,no-use
2,43,low-moderate,moderate-high,7,True,False,three,high,True,no-use
3,42,moderate-high,low-moderate,9,True,False,three,moderate-high,True,no-use
4,36,moderate-high,moderate-high,8,True,False,three,low-moderate,True,no-use
...,...,...,...,...,...,...,...,...,...,...
1468,33,high,high,2,True,True,two,high,True,short-term
1469,33,high,high,3,True,False,one,high,True,short-term
1470,39,moderate-high,moderate-high,8,True,True,one,high,True,short-term
1471,33,moderate-high,moderate-high,4,True,True,two,low-moderate,True,short-term


In [11]:
# Columns treated as targets; they are removed from the feature list below.
targets = ['contraceptive', 'wismuslim']
features = [
    'weducation', 'heducation', 'wismuslim', 'wwork',
    'hocc', 'sol', 'good_media_exposure', 'contraceptive',
]
# Drop every target column while keeping the remaining columns in their original order.
features = list(filter(lambda name: name not in targets, features))
features

['weducation', 'heducation', 'wwork', 'hocc', 'sol', 'good_media_exposure']

# Save to CSV

In [12]:
# Write the transformed frame to disk.
# NOTE(review): the destination is an absolute, machine-specific path. With
# to_csv's default index=True the row index is written as an unnamed first column.
df_contraceptive.to_csv('C:/Users/shama/PycharmProjects/thesis/data/contraceptive_02_oct.csv')