In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [3]:
df1 = pd.read_csv('dataset/First_Health_Camp_Attended.csv')
df2 = pd.read_csv('dataset/Second_Health_Camp_Attended.csv')
df3 = pd.read_csv('dataset/Third_Health_Camp_Attended.csv')
df1.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Donation,Health_Score,Unnamed: 4
0,506181,6560,40,0.439024,
1,494977,6560,20,0.097561,
2,518680,6560,10,0.04878,
3,509916,6560,30,0.634146,
4,488006,6560,20,0.02439,


In [4]:
df2.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Health Score
0,526631,6536,0.875136
1,509122,6536,0.7557
2,498864,6536,0.673181
3,515398,6536,0.722041
4,504624,6536,0.464712


In [5]:
df3.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Number_of_stall_visited,Last_Stall_Visited_Number
0,517875,6527,3,1
1,504692,6578,1,1
2,504692,6527,3,1
3,493167,6527,4,4
4,510954,6528,2,2


In [6]:
print(df1.shape)
print(df2.shape)
print(df3.shape)

(6218, 5)
(7819, 3)
(6515, 4)


In [7]:
df1.drop(['Donation', 'Unnamed: 4'], axis=1, inplace=True)
df3.drop(['Last_Stall_Visited_Number'], axis=1, inplace=True)

In [8]:
h1 = []
i = 0
for i in range(0, df1.shape[0]):
    if df1['Health_Score'][i] > df1['Health_Score'].mean():
        h1.append(1)
    else:
        h1.append(0)

In [9]:
h2 = []
j = 0
for j in range(0, df2.shape[0]):
    if df2['Health Score'][j] > df2['Health Score'].mean():
        h2.append(1)
    else:
        h2.append(0)

In [10]:
h3 = []
k = 0
for k in range(0, df3.shape[0]):
    if df3['Number_of_stall_visited'][k] == 0:
        h3.append(0)
    else:
        h3.append(1)

In [11]:
df1['hs_binary'] = h1
df2['hs_binary'] = h2
df3['hs_binary'] = h3

In [12]:
df1 = df1.drop('Health_Score', axis=1)
df1_copy = df1.copy()
df2 = df2.drop('Health Score', axis=1)
df2_copy = df2.copy()
df3 = df3.drop('Number_of_stall_visited', axis=1)
df3_copy = df3.copy()

In [13]:
df = pd.concat([df1_copy, df2_copy, df3_copy], axis=0).reset_index().drop('index', axis=1)

In [14]:
train_df = pd.read_csv('dataset/train_healthcare.csv')
train_df.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,489652,6578,10-Sep-05,4,0,0,0,2
1,507246,6578,18-Aug-05,45,5,0,0,7
2,523729,6534,29-Apr-06,0,0,0,0,0
3,524931,6535,07-Feb-04,0,0,0,0,0
4,521364,6529,28-Feb-06,15,1,0,0,7


In [15]:
df_new = pd.merge(df, train_df, on=['Patient_ID', 'Health_Camp_ID'], how='inner')
df_new.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,hs_binary,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,506181,6560,0,,37,20,0,0,11
1,494977,6560,0,,14,3,0,0,7
2,518680,6560,0,,5,0,0,0,1
3,509916,6560,1,,1,0,0,0,1
4,488006,6560,0,,0,0,0,0,0


In [16]:
df_new.isnull().sum()

Patient_ID             0
Health_Camp_ID         0
hs_binary              0
Registration_Date    196
Var1                   0
Var2                   0
Var3                   0
Var4                   0
Var5                   0
dtype: int64

In [17]:
df_ = df_new.dropna(axis=0).reset_index().drop('index', axis=1)

In [18]:
date = pd.to_datetime(df_['Registration_Date'], format='%d-%b-%y')

In [19]:
m = []
for i in range(0, df_.shape[0]):
    m.append(date[i].month)
df_['Months'] = m
df_.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,hs_binary,Registration_Date,Var1,Var2,Var3,Var4,Var5,Months
0,523586,6530,0,19-Oct-03,0,0,0,0,0,10
1,495377,6530,1,19-Oct-03,2,0,0,0,2,10
2,492765,6530,0,19-Oct-03,8,0,0,0,3,10
3,527827,6530,1,19-Oct-03,0,0,0,0,0,10
4,524857,6530,0,19-Oct-03,2,0,0,0,0,10


In [20]:
df_ = df_.drop('Registration_Date', axis=1)

In [21]:
data = df_[['Patient_ID', 'Health_Camp_ID', 'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Months', 'hs_binary']]

In [22]:
X = data.iloc[:, 2:-1]
y = data.iloc[:, -1:]

In [23]:
''' train test split '''
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=0)

In [24]:
lg = LogisticRegression()
lg.fit(X_train, y_train)

In [25]:
test_df = pd.read_csv('dataset/test_healthcare.csv')
test_df.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,505701,6548,21-May-06,1,0,0,0,2
1,500633,6584,02-Jun-06,0,0,0,0,0
2,506945,6582,10-Aug-06,0,0,0,0,0
3,497447,6551,27-Aug-06,0,0,0,0,0
4,496446,6533,19-Sep-06,0,0,0,0,0


In [26]:
date2 = pd.to_datetime(test_df['Registration_Date'], format='%d-%b-%y')
m2 = []
for i in range(0, test_df.shape[0]):
    m2.append(date2[i].month)
test_df.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,505701,6548,21-May-06,1,0,0,0,2
1,500633,6584,02-Jun-06,0,0,0,0,0
2,506945,6582,10-Aug-06,0,0,0,0,0
3,497447,6551,27-Aug-06,0,0,0,0,0
4,496446,6533,19-Sep-06,0,0,0,0,0


In [27]:
''' creating new column months '''
test_df['Months'] = m2

''' taking important features '''
test_df = test_df[['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Months']]

In [28]:
pred = lg.predict_proba(test_df)
pred = np.argmax(y_test, axis=1)
print(lg.score(X_train, y_train))

0.6484574572607585
