In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import preprocessing

In [74]:
# Read the filtered insurance table; the path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="Medical Costs/insurance_filtered.csv")

In [4]:
from rpy2.robjects.packages import importr
# Import R's "utils" package through rpy2 and keep a Python handle to it.
utils = importr("utils")

In [37]:
import rpy2.ipython
%load_ext rpy2.ipython
import warnings
warnings.filterwarnings('ignore')

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [38]:
%%R
# Attach the R packages used by the R cells.
library(dplyr)
library(ggplot2)
# Load the raw CSV into a data frame, then show how many rows it has.
raw_data <- read.csv(file = "./COMPASS Scores/compas-scores-two-years-violent.csv")
nrow(raw_data)

[1] 4743


In [39]:
%%R
# Keep only the columns needed downstream, then drop rows where:
#   - days_b_screening_arrest is outside [-30, 30],
#   - is_recid is -1,
#   - c_charge_degree is "O",
#   - score_text is 'N/A'.
df <- raw_data %>%
        dplyr::select(age, c_charge_degree, race, age_cat, score_text, sex, priors_count,
                      days_b_screening_arrest, decile_score, is_recid, two_year_recid,
                      c_jail_in, c_jail_out) %>%
        dplyr::filter(days_b_screening_arrest <= 30,
                      days_b_screening_arrest >= -30) %>%
        dplyr::filter(is_recid != -1) %>%
        dplyr::filter(c_charge_degree != "O") %>%
        dplyr::filter(score_text != 'N/A')
# Explicit print(): this is not the last expression of the cell, so a bare
# nrow(df) would be evaluated and its value silently discarded.
print(nrow(df))
# Export the filtered rows as a ';'-separated text file (write.table defaults
# otherwise, i.e. row names and quoting are left as they were).
write.table(df, "compas-scores-two-years-violent-filtered.csv", sep=";")


In [11]:
from rpy2.robjects import r, pandas2ri
# Switch on rpy2's pandas conversion support.
# NOTE(review): presumably required so DataFrames can be passed between Python and R;
# nothing visible in this notebook uses `r` or the conversion directly — confirm it is still needed.
pandas2ri.activate()

In [58]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

In [32]:
# Display the frame's dimensions as (rows, columns).
df.shape

(6172, 13)

In [66]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,age_under_27,age_under_39,age_under_51,age_51_more,sex,bmi_under_26,bmi_under_30,bmi_under_34,bmi_under_53_more,children_covered_0,children_covered_1,children_covered_2,children_covered_3_more,smoker,charges_Low,charges_High
0,1,0,0,0,female,0,1,0,0,1,0,0,0,yes,0,1
1,1,0,0,0,male,0,0,1,0,0,1,0,0,no,1,0
2,0,1,0,0,male,0,0,1,0,0,0,0,1,no,1,0
3,0,1,0,0,male,1,0,0,0,1,0,0,0,no,0,1
4,0,1,0,0,male,0,1,0,0,1,0,0,0,no,1,0


In [34]:
# Remove the two bookkeeping columns when they are present.
# errors="ignore" makes the cell safe to re-run: without it, DataFrame.drop
# raises KeyError as soon as either label is missing (for example on a second
# execution, after the columns have already been removed).
df = df.drop(columns=["Unnamed: 0", "Employee Name"], errors="ignore")

In [76]:
# Frequency of each distinct value in the "sex" column.
df.sex.value_counts()

male      676
female    662
Name: sex, dtype: int64

In [75]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age_under_27,age_under_39,age_under_51,age_51_more,bmi_under_26,bmi_under_30,bmi_under_34,bmi_under_53_more,children_covered_0,children_covered_1,children_covered_2,children_covered_3_more,charges_Low,charges_High
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,0.270553,0.233184,0.251868,0.244395,0.250374,0.252616,0.246637,0.250374,0.428999,0.242152,0.179372,0.149477,0.5,0.5
std,0.444412,0.423016,0.434248,0.429888,0.43339,0.434675,0.431215,0.43339,0.495118,0.428546,0.383807,0.356691,0.500187,0.500187
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5
75%,1.0,0.0,1.0,0.0,0.75,1.0,0.0,0.75,1.0,0.0,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [51]:
import time

In [52]:
# Format a duration of 60 seconds as HH:MM:SS by treating it as seconds since the epoch (UTC).
time.strftime("%H:%M:%S", time.gmtime(60))

'00:01:00'

In [67]:
# Names of all object-dtype columns; these are handled as categorical features below.
categorical = list(df.select_dtypes(include="object").columns)
categorical

['sex', 'smoker']

In [68]:
# Encode into an independent (deep) copy so the source frame is left untouched.
data_encoded = df.copy()
# One scikit-learn LabelEncoder is reused and re-fitted for each text column in turn.
lab_enc = preprocessing.LabelEncoder()
for col in categorical:
    data_encoded[col] = lab_enc.fit_transform(df[col])
    # Pair every original label with the integer code the encoder assigned to it.
    le_name_mapping = {
        label: code
        for label, code in zip(lab_enc.classes_, lab_enc.transform(lab_enc.classes_))
    }
    print('Feature', col)
    print('mapping', le_name_mapping)

Feature sex
mapping {'female': 0, 'male': 1}
Feature smoker
mapping {'no': 0, 'yes': 1}


In [69]:
# Collect the object-dtype column names that will be expanded into dummy indicators.
categorical_feats = list(df.select_dtypes(include="object").columns)
print(categorical_feats)

['sex', 'smoker']


In [70]:
# One-hot encode the categorical columns (one indicator column per category).
# dtype is pinned to uint8 — the default before pandas 2.0 — so the indicators
# stay 0/1 on every pandas version; pandas >= 2.0 would otherwise produce bool
# columns, which are written to CSV as True/False.
df = pd.get_dummies(df, columns=categorical_feats, dtype=np.uint8)

In [71]:
# Preview the first five rows after the dummy expansion.
df.head(n=5)

Unnamed: 0,age_under_27,age_under_39,age_under_51,age_51_more,bmi_under_26,bmi_under_30,bmi_under_34,bmi_under_53_more,children_covered_0,children_covered_1,children_covered_2,children_covered_3_more,charges_Low,charges_High,sex_female,sex_male,smoker_no,smoker_yes
0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1
1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,1,0
2,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0
3,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,0
4,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0


In [72]:
# Persist the frame as a ';'-separated file without writing the row index.
df.to_csv(path_or_buf="Insurance_Filtered_Binary.csv", sep=";", index=None)