Congratulations – you have been hired as Chief Data Scientist of MedCamp, a not-for-profit organization dedicated to improving the health of working professionals. MedCamp was started because its founders saw their families suffer from poor work-life balance and neglected health.

MedCamp organizes health camps in several cities where work-life balance is poor. They reach out to working people and ask them to register for these health camps. Those who attend can undergo health checks or increase their awareness by visiting various stalls (depending on the format of the camp).

MedCamp has conducted 65 such events over a period of 4 years, and they see a high drop-off between “Registration” and the number of people actually taking tests at the camps. Over these 4 years, they have stored data on ~110,000 registrations.

One of the largest costs in arranging these camps is the amount of inventory that must be carried. Carrying more inventory than required incurs unnecessarily high costs. On the other hand, carrying less inventory than is needed to conduct the medical checks leaves people with a bad experience.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Registration records: the labelled training rows and the rows to score.
train, test = pd.read_csv("Train.csv"), pd.read_csv("test_l0Auv8Q.csv")

# Attendance files for the first, second and third health camps.
fhc = pd.read_csv("First_Health_Camp_Attended.csv")
shc = pd.read_csv("Second_Health_Camp_Attended.csv")
thc = pd.read_csv("Third_Health_Camp_Attended.csv")

# Lookup tables joined on later: patient profiles and camp details.
pp, hc = pd.read_csv("Patient_Profile.csv"), pd.read_csv("Health_Camp_Detail.csv")

In [4]:
train.columns,hc.columns

(Index(['Patient_ID', 'Health_Camp_ID', 'Registration_Date', 'Var1', 'Var2',
        'Var3', 'Var4', 'Var5'],
       dtype='object'),
 Index(['Health_Camp_ID', 'Camp_Start_Date', 'Camp_End_Date', 'Category1',
        'Category2', 'Category3'],
       dtype='object'))

In [5]:
# Stack train on top of test (fresh 0..n-1 index), then left-join every other
# table so each registration row carries the patient profile, the three
# attendance records and the camp details.
combined = (
    pd.concat([train, test], ignore_index=True)
    .merge(pp, on="Patient_ID", how="left")
    .merge(fhc, on=["Patient_ID", "Health_Camp_ID"], how="left")
    .merge(shc, on=["Patient_ID", "Health_Camp_ID"], how="left")
    .merge(thc, on=["Patient_ID", "Health_Camp_ID"], how="left")
    .merge(hc, on="Health_Camp_ID", how="left")
)

In [6]:
pd.set_option("display.max_columns", 50)
combined.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category,Donation,Health_Score,Unnamed: 4,Health Score,Number_of_stall_visited,Last_Stall_Visited_Number,Camp_Start_Date,Camp_End_Date,Category1,Category2,Category3
0,489652,6578,10-Sep-05,4,0,0,0,2,0,0,0,0,,,,06-Dec-04,,,,,,,2.0,1.0,16-Aug-05,14-Oct-05,Third,G,2
1,507246,6578,18-Aug-05,45,5,0,0,7,0,0,0,0,1.0,75.0,40.0,08-Sep-04,C,Others,,,,,,,16-Aug-05,14-Oct-05,Third,G,2
2,523729,6534,29-Apr-06,0,0,0,0,0,0,0,0,0,,,,22-Jun-04,,,,,,0.402054,,,17-Oct-05,07-Nov-07,Second,A,2
3,524931,6535,07-Feb-04,0,0,0,0,0,0,0,0,0,,,,07-Feb-04,I,,,,,,,,01-Feb-04,18-Feb-04,First,E,2
4,521364,6529,28-Feb-06,15,1,0,0,7,0,0,0,1,1.0,70.0,40.0,04-Jul-03,I,Technology,,,,0.845597,,,30-Mar-06,03-Apr-06,Second,A,2


In [7]:
# Parse the patient-side date strings into datetimes (day comes first).
combined = combined.assign(
    Registration_Date=pd.to_datetime(combined["Registration_Date"], dayfirst=True),
    First_Interaction=pd.to_datetime(combined["First_Interaction"], dayfirst=True),
)


In [8]:
# Parse the camp start/end date strings into datetimes (day comes first).
combined = combined.assign(
    Camp_Start_Date=pd.to_datetime(combined["Camp_Start_Date"], dayfirst=True),
    Camp_End_Date=pd.to_datetime(combined["Camp_End_Date"], dayfirst=True),
)

In [9]:
# Time elapsed between a patient's first interaction and this registration
# (Registration_Date minus First_Interaction), kept as a timedelta for now.
combined["days_before_int"] = combined["Registration_Date"].sub(
    combined["First_Interaction"]
)

In [10]:
# Reduce the timedelta to a whole number of days.
combined["days_before_int"] = combined["days_before_int"].dt.days

In [11]:
# Camp length: Camp_End_Date minus Camp_Start_Date, kept as a timedelta for now.
combined["Camp_Duration"] = combined["Camp_End_Date"].sub(
    combined["Camp_Start_Date"]
)

In [12]:
# Reduce the camp-length timedelta to a whole number of days.
combined["Camp_Duration"] = combined["Camp_Duration"].dt.days

In [13]:
# Days from camp start to registration (negative when the patient registered
# before the camp opened).
combined["Reg_Camp_Start"] = (
    combined["Registration_Date"].sub(combined["Camp_Start_Date"]).dt.days
)

In [14]:
# Days from camp end to registration (negative when the patient registered
# before the camp closed).
combined["Reg_Camp_End"] = (
    combined["Registration_Date"].sub(combined["Camp_End_Date"]).dt.days
)

In [15]:
combined.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category,Donation,Health_Score,Unnamed: 4,Health Score,Number_of_stall_visited,Last_Stall_Visited_Number,Camp_Start_Date,Camp_End_Date,Category1,Category2,Category3,days_before_int,Camp_Duration,Reg_Camp_Start,Reg_Camp_End
0,489652,6578,2005-09-10,4,0,0,0,2,0,0,0,0,,,,2004-12-06,,,,,,,2.0,1.0,2005-08-16,2005-10-14,Third,G,2,278.0,59,25.0,-34.0
1,507246,6578,2005-08-18,45,5,0,0,7,0,0,0,0,1.0,75.0,40.0,2004-09-08,C,Others,,,,,,,2005-08-16,2005-10-14,Third,G,2,344.0,59,2.0,-57.0
2,523729,6534,2006-04-29,0,0,0,0,0,0,0,0,0,,,,2004-06-22,,,,,,0.402054,,,2005-10-17,2007-11-07,Second,A,2,676.0,751,194.0,-557.0
3,524931,6535,2004-02-07,0,0,0,0,0,0,0,0,0,,,,2004-02-07,I,,,,,,,,2004-02-01,2004-02-18,First,E,2,0.0,17,6.0,-11.0
4,521364,6529,2006-02-28,15,1,0,0,7,0,0,0,1,1.0,70.0,40.0,2003-07-04,I,Technology,,,,0.845597,,,2006-03-30,2006-04-03,Second,A,2,970.0,4,-30.0,-34.0


In [16]:
# Split the registration date into day-of-month, month and year features.
combined = combined.assign(
    Reg_Date=combined["Registration_Date"].dt.day,
    Reg_Month=combined["Registration_Date"].dt.month,
    Reg_Year=combined["Registration_Date"].dt.year,
)

In [17]:
# Per-patient activity spread: for every patient, the number of distinct
# registration day-of-month, month and year values, broadcast back onto each
# of that patient's rows.
combined = combined.assign(
    PP_Day=combined.groupby("Patient_ID")["Reg_Date"].transform("nunique"),
    PP_Month=combined.groupby("Patient_ID")["Reg_Month"].transform("nunique"),
    PP_Year=combined.groupby("Patient_ID")["Reg_Year"].transform("nunique"),
)

In [18]:
# Number of distinct patients registered for each health camp, broadcast back
# onto every row of that camp.
combined["PP_HealthCamp"] = (
    combined.groupby("Health_Camp_ID")["Patient_ID"].transform("nunique")
)

In [19]:
# Calendar year of the first interaction and of the camp start / end dates.
combined = combined.assign(
    First_Int_Year=combined["First_Interaction"].dt.year,
    Camp_Start_Year=combined["Camp_Start_Date"].dt.year,
    Camp_End_Year=combined["Camp_End_Date"].dt.year,
)

In [20]:
#pd.DataFrame((combined.Camp_End_Year-combined.Camp_Start_Year)).describe()
combined.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category,Donation,Health_Score,Unnamed: 4,Health Score,Number_of_stall_visited,Last_Stall_Visited_Number,Camp_Start_Date,Camp_End_Date,Category1,Category2,Category3,days_before_int,Camp_Duration,Reg_Camp_Start,Reg_Camp_End,Reg_Date,Reg_Month,Reg_Year,PP_Day,PP_Month,PP_Year,PP_HealthCamp,First_Int_Year,Camp_Start_Year,Camp_End_Year
0,489652,6578,2005-09-10,4,0,0,0,2,0,0,0,0,,,,2004-12-06,,,,,,,2.0,1.0,2005-08-16,2005-10-14,Third,G,2,278.0,59,25.0,-34.0,10.0,9.0,2005.0,9,7,3,2837,2004,2005,2005
1,507246,6578,2005-08-18,45,5,0,0,7,0,0,0,0,1.0,75.0,40.0,2004-09-08,C,Others,,,,,,,2005-08-16,2005-10-14,Third,G,2,344.0,59,2.0,-57.0,18.0,8.0,2005.0,16,12,4,2837,2004,2005,2005
2,523729,6534,2006-04-29,0,0,0,0,0,0,0,0,0,,,,2004-06-22,,,,,,0.402054,,,2005-10-17,2007-11-07,Second,A,2,676.0,751,194.0,-557.0,29.0,4.0,2006.0,5,4,2,3597,2004,2005,2007
3,524931,6535,2004-02-07,0,0,0,0,0,0,0,0,0,,,,2004-02-07,I,,,,,,,,2004-02-01,2004-02-18,First,E,2,0.0,17,6.0,-11.0,7.0,2.0,2004.0,4,4,3,1882,2004,2004,2004
4,521364,6529,2006-02-28,15,1,0,0,7,0,0,0,1,1.0,70.0,40.0,2003-07-04,I,Technology,,,,0.845597,,,2006-03-30,2006-04-03,Second,A,2,970.0,4,-30.0,-34.0,28.0,2.0,2006.0,17,9,4,3823,2003,2006,2006


In [21]:
# Ordinal encoding for Category1; labels not listed here become NaN.
mapped = dict(First=1, Second=2, Third=3)

combined["Category1"] = combined["Category1"].map(mapped)

In [22]:
# Integer-encode Category2 in order of first appearance (missing values -> -1).
combined["Category2"] = combined["Category2"].factorize()[0]

In [23]:
# Fill missing employer categories with the most frequent category.
combined["Employer_Category"] = combined["Employer_Category"].fillna(
    combined["Employer_Category"].mode()[0]
)

In [24]:
# Integer-encode Employer_Category in order of first appearance.
combined["Employer_Category"] = combined["Employer_Category"].factorize()[0]

In [25]:
# Integer-encode City_Type in order of first appearance; missing city types
# come out as -1.
combined["City_Type"] = combined["City_Type"].factorize()[0]

In [26]:
# Replace the literal string "None" in Income, Education_Score and Age with NaN.
#
# The result is assigned back instead of calling replace(..., inplace=True) on
# combined["col"]: that form mutates an intermediate Series and is not
# guaranteed to update `combined` (it does not under pandas Copy-on-Write).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
combined["Income"] = combined["Income"].replace("None", np.nan)

combined["Education_Score"] = combined["Education_Score"].replace("None", np.nan)

combined["Age"] = combined["Age"].replace("None", np.nan)

In [27]:
combined.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category,Donation,Health_Score,Unnamed: 4,Health Score,Number_of_stall_visited,Last_Stall_Visited_Number,Camp_Start_Date,Camp_End_Date,Category1,Category2,Category3,days_before_int,Camp_Duration,Reg_Camp_Start,Reg_Camp_End,Reg_Date,Reg_Month,Reg_Year,PP_Day,PP_Month,PP_Year,PP_HealthCamp,First_Int_Year,Camp_Start_Year,Camp_End_Year
0,489652,6578,2005-09-10,4,0,0,0,2,0,0,0,0,,,,2004-12-06,-1,0,,,,,2.0,1.0,2005-08-16,2005-10-14,3,0,2,278.0,59,25.0,-34.0,10.0,9.0,2005.0,9,7,3,2837,2004,2005,2005
1,507246,6578,2005-08-18,45,5,0,0,7,0,0,0,0,1.0,75.0,40.0,2004-09-08,0,1,,,,,,,2005-08-16,2005-10-14,3,0,2,344.0,59,2.0,-57.0,18.0,8.0,2005.0,16,12,4,2837,2004,2005,2005
2,523729,6534,2006-04-29,0,0,0,0,0,0,0,0,0,,,,2004-06-22,-1,0,,,,0.402054,,,2005-10-17,2007-11-07,2,1,2,676.0,751,194.0,-557.0,29.0,4.0,2006.0,5,4,2,3597,2004,2005,2007
3,524931,6535,2004-02-07,0,0,0,0,0,0,0,0,0,,,,2004-02-07,1,0,,,,,,,2004-02-01,2004-02-18,1,2,2,0.0,17,6.0,-11.0,7.0,2.0,2004.0,4,4,3,1882,2004,2004,2004
4,521364,6529,2006-02-28,15,1,0,0,7,0,0,0,1,1.0,70.0,40.0,2003-07-04,1,0,,,,0.845597,,,2006-03-30,2006-04-03,2,1,2,970.0,4,-30.0,-34.0,28.0,2.0,2006.0,17,9,4,3823,2003,2006,2006


In [28]:
# Total social-media activity: sum of the follower flag and the three
# "shared" flags.
combined["Online_Activity"] = (
    combined["Online_Follower"]
    .add(combined["LinkedIn_Shared"])
    .add(combined["Twitter_Shared"])
    .add(combined["Facebook_Shared"])
)

In [29]:
# The individual social-media columns are now summarised by Online_Activity;
# remove them together with "Unnamed: 4" and "Donation".
combined = combined.drop(
    columns=[
        "Online_Follower",
        "LinkedIn_Shared",
        "Twitter_Shared",
        "Facebook_Shared",
        "Unnamed: 4",
        "Donation",
    ]
)

In [30]:
combined.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category,Health_Score,Health Score,Number_of_stall_visited,Last_Stall_Visited_Number,Camp_Start_Date,Camp_End_Date,Category1,Category2,Category3,days_before_int,Camp_Duration,Reg_Camp_Start,Reg_Camp_End,Reg_Date,Reg_Month,Reg_Year,PP_Day,PP_Month,PP_Year,PP_HealthCamp,First_Int_Year,Camp_Start_Year,Camp_End_Year,Online_Activity
0,489652,6578,2005-09-10,4,0,0,0,2,,,,2004-12-06,-1,0,,,2.0,1.0,2005-08-16,2005-10-14,3,0,2,278.0,59,25.0,-34.0,10.0,9.0,2005.0,9,7,3,2837,2004,2005,2005,0
1,507246,6578,2005-08-18,45,5,0,0,7,1.0,75.0,40.0,2004-09-08,0,1,,,,,2005-08-16,2005-10-14,3,0,2,344.0,59,2.0,-57.0,18.0,8.0,2005.0,16,12,4,2837,2004,2005,2005,0
2,523729,6534,2006-04-29,0,0,0,0,0,,,,2004-06-22,-1,0,,0.402054,,,2005-10-17,2007-11-07,2,1,2,676.0,751,194.0,-557.0,29.0,4.0,2006.0,5,4,2,3597,2004,2005,2007,0
3,524931,6535,2004-02-07,0,0,0,0,0,,,,2004-02-07,1,0,,,,,2004-02-01,2004-02-18,1,2,2,0.0,17,6.0,-11.0,7.0,2.0,2004.0,4,4,3,1882,2004,2004,2004,0
4,521364,6529,2006-02-28,15,1,0,0,7,1.0,70.0,40.0,2003-07-04,1,0,,0.845597,,,2006-03-30,2006-04-03,2,1,2,970.0,4,-30.0,-34.0,28.0,2.0,2006.0,17,9,4,3823,2003,2006,2006,1


In [31]:
def tgt(a, b, c, d):
    """Return 1 when at least one of the four values is above zero, else 0.

    A NaN never compares greater than zero, so a missing value counts the
    same as a non-positive one.
    """
    positive_flags = [a > 0, b > 0, c > 0, d > 0]
    return 1 if any(positive_flags) else 0

In [32]:
# Label a registration 1 when any of the four outcome columns is positive,
# otherwise 0.  NaN > 0 evaluates to False, so rows without any attendance
# record are labelled 0.
# This is a vectorized equivalent of applying tgt() row by row: the columns
# are compared as a whole instead of calling a Python function for every row.
combined["target"] = (
    combined[
        [
            "Health_Score",
            "Health Score",
            "Number_of_stall_visited",
            "Last_Stall_Visited_Number",
        ]
    ]
    .gt(0)
    .any(axis=1)
    .astype("int64")
)

In [33]:
# The raw outcome columns are now folded into "target"; remove them from the
# feature set.
combined = combined.drop(
    columns=[
        "Health_Score",
        "Health Score",
        "Number_of_stall_visited",
        "Last_Stall_Visited_Number",
    ]
)

In [34]:
combined.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category,Camp_Start_Date,Camp_End_Date,Category1,Category2,Category3,days_before_int,Camp_Duration,Reg_Camp_Start,Reg_Camp_End,Reg_Date,Reg_Month,Reg_Year,PP_Day,PP_Month,PP_Year,PP_HealthCamp,First_Int_Year,Camp_Start_Year,Camp_End_Year,Online_Activity,target
0,489652,6578,2005-09-10,4,0,0,0,2,,,,2004-12-06,-1,0,2005-08-16,2005-10-14,3,0,2,278.0,59,25.0,-34.0,10.0,9.0,2005.0,9,7,3,2837,2004,2005,2005,0,1
1,507246,6578,2005-08-18,45,5,0,0,7,1.0,75.0,40.0,2004-09-08,0,1,2005-08-16,2005-10-14,3,0,2,344.0,59,2.0,-57.0,18.0,8.0,2005.0,16,12,4,2837,2004,2005,2005,0,0
2,523729,6534,2006-04-29,0,0,0,0,0,,,,2004-06-22,-1,0,2005-10-17,2007-11-07,2,1,2,676.0,751,194.0,-557.0,29.0,4.0,2006.0,5,4,2,3597,2004,2005,2007,0,1
3,524931,6535,2004-02-07,0,0,0,0,0,,,,2004-02-07,1,0,2004-02-01,2004-02-18,1,2,2,0.0,17,6.0,-11.0,7.0,2.0,2004.0,4,4,3,1882,2004,2004,2004,0,0
4,521364,6529,2006-02-28,15,1,0,0,7,1.0,70.0,40.0,2003-07-04,1,0,2006-03-30,2006-04-03,2,1,2,970.0,4,-30.0,-34.0,28.0,2.0,2006.0,17,9,4,3823,2003,2006,2006,1,1


In [35]:
# Mark missing Income / Education_Score / Age values with the sentinel -999.
combined = combined.fillna(
    {"Income": -999, "Education_Score": -999, "Age": -999}
)

In [36]:
combined.isnull().sum()[combined.isnull().sum()!=0]

Registration_Date    334
days_before_int      334
Reg_Camp_Start       334
Reg_Camp_End         334
Reg_Date             334
Reg_Month            334
Reg_Year             334
dtype: int64

In [37]:
# Rows with no registration date leave these derived features empty; fill each
# one with the median of its own column.
combined = combined.fillna(
    {
        feature: combined[feature].median()
        for feature in (
            "days_before_int",
            "Reg_Camp_Start",
            "Reg_Camp_End",
            "Reg_Date",
            "Reg_Month",
            "Reg_Year",
        )
    }
)

In [38]:
# Identifiers and raw datetime columns are not used as model features.
combined = combined.drop(
    columns=[
        "Patient_ID",
        "Health_Camp_ID",
        "Registration_Date",
        "First_Interaction",
        "Camp_Start_Date",
        "Camp_End_Date",
    ]
)

In [39]:
# train and test
# `combined` holds the train rows followed by the test rows on a 0..n-1 index,
# so it is split back by position at len(train).
# Explicit copies make both frames independent of `combined`, so in-place
# edits made afterwards (such as dropping "target" from the test frame) act on
# real frames rather than on slices of `combined`.
newtrain = combined.iloc[:train.shape[0]].copy()
newtest = combined.iloc[train.shape[0]:].copy()

In [40]:
# The test rows carry no real label; remove the placeholder target column.
newtest = newtest.drop(columns="target")

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [42]:
# Model
# One default-parameter instance of each candidate classifier.
log, knn = LogisticRegression(), KNeighborsClassifier()
gnb, dtree = GaussianNB(), DecisionTreeClassifier()

# Features are every column except the label.
X = newtrain.drop(columns="target")
y = newtrain["target"]

# Prediction: fit the decision tree and get class probabilities for the test
# rows (one column per class).
dtree.fit(X, y)
pred = dtree.predict_proba(newtest)

# Keep the second probability column, labelled "One".
pred_frame = pd.DataFrame(pred, columns=["Zero", "One"])
pred_tree = pred_frame["One"]

In [45]:
# Logistic-regression class probabilities for the test rows.
# The "One" column is stored under its own name so that it does not overwrite
# `pred_tree`, which the submission cell writes out as the decision-tree result.
pred = log.fit(X, y).predict_proba(newtest)

pred_log = pd.DataFrame(pred, columns = ["Zero", "One"])["One"]

In [46]:
pred

array([[0.48675086, 0.51324914],
       [0.63235918, 0.36764082],
       [0.75168707, 0.24831293],
       ...,
       [0.64413498, 0.35586502],
       [0.77378772, 0.22621228],
       [0.70745916, 0.29254084]])

In [47]:
# K-nearest-neighbours class probabilities for the test rows.
# The "One" column is stored under its own name so that it does not overwrite
# `pred_tree`, which the submission cell writes out as the decision-tree result.
pred = knn.fit(X, y).predict_proba(newtest)

pred_knn = pd.DataFrame(pred, columns = ["Zero", "One"])["One"]

In [48]:
pred

array([[0. , 1. ],
       [0. , 1. ],
       [0.6, 0.4],
       ...,
       [0.8, 0.2],
       [1. , 0. ],
       [0. , 1. ]])

In [49]:
# Gaussian naive-Bayes class probabilities for the test rows.
# The "One" column is stored under its own name so that it does not overwrite
# `pred_tree`, which the submission cell writes out as the decision-tree result.
pred = gnb.fit(X, y).predict_proba(newtest)

pred_gnb = pd.DataFrame(pred, columns = ["Zero", "One"])["One"]

In [50]:
pred

array([[3.96956597e-05, 9.99960304e-01],
       [2.37717169e-02, 9.76228283e-01],
       [9.89678913e-01, 1.03210871e-02],
       ...,
       [3.77559039e-01, 6.22440961e-01],
       [9.35669399e-01, 6.43306012e-02],
       [4.17356649e-16, 1.00000000e+00]])

In [43]:
# Prepare the submission file.
# "Outcome" is read straight from the fitted decision tree (second column of
# predict_proba, i.e. the class the notebook labels "One") instead of from
# `pred_tree`: that name may be reassigned by the other model cells, so a
# top-to-bottom run could otherwise write another model's scores into the
# decision-tree file.
submission = pd.DataFrame({"Patient_ID": test.Patient_ID,
                          "Health_Camp_ID": test.Health_Camp_ID,
                          "Outcome": dtree.predict_proba(newtest)[:, 1]})

submission.to_csv("NayawalaDtree.csv", index = False) # 0.630

In [44]:
cd

C:\Users\sachi
