## 1. Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## 2. Reading the datasets

In [2]:
train = pd.read_csv('data/train.csv',na_values='None')
test = pd.read_csv('data/test_l0Auv8Q.csv',na_values='None')
first_camp = pd.read_csv('data/First_Health_Camp_Attended.csv',na_values='None')
second_camp = pd.read_csv('data/Second_Health_Camp_Attended.csv',na_values='None')
third_camp = pd.read_csv('data/Third_Health_Camp_Attended.csv',na_values='None')
camp_data = pd.read_csv('data/Health_Camp_Detail.csv',na_values='None')
patient_profile = pd.read_csv('data/Patient_Profile.csv',na_values='None')

## 3. Exploring the data

In [3]:
train.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,489652,6578,10-Sep-05,4,0,0,0,2
1,507246,6578,18-Aug-05,45,5,0,0,7
2,523729,6534,29-Apr-06,0,0,0,0,0
3,524931,6535,07-Feb-04,0,0,0,0,0
4,521364,6529,28-Feb-06,15,1,0,0,7


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75278 entries, 0 to 75277
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Patient_ID         75278 non-null  int64 
 1   Health_Camp_ID     75278 non-null  int64 
 2   Registration_Date  74944 non-null  object
 3   Var1               75278 non-null  int64 
 4   Var2               75278 non-null  int64 
 5   Var3               75278 non-null  int64 
 6   Var4               75278 non-null  int64 
 7   Var5               75278 non-null  int64 
dtypes: int64(7), object(1)
memory usage: 4.6+ MB


In [5]:
camp_data.head()

Unnamed: 0,Health_Camp_ID,Camp_Start_Date,Camp_End_Date,Category1,Category2,Category3
0,6560,16-Aug-03,20-Aug-03,First,B,2
1,6530,16-Aug-03,28-Oct-03,First,C,2
2,6544,03-Nov-03,15-Nov-03,First,F,1
3,6585,22-Nov-03,05-Dec-03,First,E,2
4,6561,30-Nov-03,18-Dec-03,First,E,1


In [6]:
camp_data.nunique()

Health_Camp_ID     65
Camp_Start_Date    58
Camp_End_Date      54
Category1           3
Category2           7
Category3           2
dtype: int64

In [7]:
merged = camp_data.merge(train,on='Health_Camp_ID').drop(['Camp_Start_Date','Camp_End_Date'],axis=1)
merged_t = camp_data.merge(test,on='Health_Camp_ID').drop(['Camp_Start_Date','Camp_End_Date'],axis=1)

In [8]:
merged.head()

Unnamed: 0,Health_Camp_ID,Category1,Category2,Category3,Patient_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,6560,First,B,2,524451,,0,0,0,0,0
1,6560,First,B,2,491597,,172,66,0,1,16
2,6560,First,B,2,502372,,3,0,0,0,1
3,6560,First,B,2,509797,,3,0,0,0,1
4,6560,First,B,2,515060,,0,0,0,0,0


In [9]:
merged_1 = merged[merged.Category1=='First'].merge(first_camp.drop('Unnamed: 4',axis=1),'left',on=['Patient_ID','Health_Camp_ID'])
merged_2 = merged[merged.Category1=='Second'].merge(second_camp,'left',on=['Patient_ID','Health_Camp_ID'])
merged_3 = merged[merged.Category1=='Third'].merge(third_camp,'left', on=['Patient_ID','Health_Camp_ID'])

In [10]:
merged_1.shape[0],merged_2.shape[0],merged_3.shape[0]

(49892, 15114, 10272)

In [11]:
merged_1.head()

Unnamed: 0,Health_Camp_ID,Category1,Category2,Category3,Patient_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Donation,Health_Score
0,6560,First,B,2,524451,,0,0,0,0,0,,
1,6560,First,B,2,491597,,172,66,0,1,16,,
2,6560,First,B,2,502372,,3,0,0,0,1,50.0,0.97561
3,6560,First,B,2,509797,,3,0,0,0,1,30.0,0.707317
4,6560,First,B,2,515060,,0,0,0,0,0,,


In [12]:
merged_1['favourable_outcome'] = 1
merged_1.loc[merged_1.Health_Score.isna(),'favourable_outcome']=0

In [13]:
merged_1.head()

Unnamed: 0,Health_Camp_ID,Category1,Category2,Category3,Patient_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Donation,Health_Score,favourable_outcome
0,6560,First,B,2,524451,,0,0,0,0,0,,,0
1,6560,First,B,2,491597,,172,66,0,1,16,,,0
2,6560,First,B,2,502372,,3,0,0,0,1,50.0,0.97561,1
3,6560,First,B,2,509797,,3,0,0,0,1,30.0,0.707317,1
4,6560,First,B,2,515060,,0,0,0,0,0,,,0


In [14]:
merged_2.head()

Unnamed: 0,Health_Camp_ID,Category1,Category2,Category3,Patient_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Health Score
0,6536,Second,D,2,502400,01-Feb-05,0,0,0,0,0,0.205212
1,6536,Second,D,2,490492,19-Jan-05,0,0,0,0,0,
2,6536,Second,D,2,520675,12-Jan-05,0,0,0,0,0,
3,6536,Second,D,2,491520,16-Feb-05,0,0,0,0,0,
4,6536,Second,D,2,498259,12-Feb-05,0,0,0,0,0,


In [15]:
merged_2['favourable_outcome'] = 1
merged_2.loc[merged_2['Health Score'].isna(),'favourable_outcome']=0
merged_2.head()

Unnamed: 0,Health_Camp_ID,Category1,Category2,Category3,Patient_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Health Score,favourable_outcome
0,6536,Second,D,2,502400,01-Feb-05,0,0,0,0,0,0.205212,1
1,6536,Second,D,2,490492,19-Jan-05,0,0,0,0,0,,0
2,6536,Second,D,2,520675,12-Jan-05,0,0,0,0,0,,0
3,6536,Second,D,2,491520,16-Feb-05,0,0,0,0,0,,0
4,6536,Second,D,2,498259,12-Feb-05,0,0,0,0,0,,0


In [16]:
merged_3.head()

Unnamed: 0,Health_Camp_ID,Category1,Category2,Category3,Patient_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Number_of_stall_visited,Last_Stall_Visited_Number
0,6527,Third,G,2,505849,19-Jun-05,0,0,0,0,0,5.0,4.0
1,6527,Third,G,2,489104,03-Jul-05,0,0,0,0,0,5.0,5.0
2,6527,Third,G,2,506902,27-Apr-05,0,0,0,0,0,,
3,6527,Third,G,2,490908,16-Jun-05,0,0,0,0,0,3.0,2.0
4,6527,Third,G,2,485901,08-May-05,0,0,0,0,0,,


In [17]:
merged_3['favourable_outcome'] = 1
merged_3.loc[merged_3.Number_of_stall_visited==0,'favourable_outcome']=0
merged_3.loc[merged_3.Number_of_stall_visited.isna(),'favourable_outcome']=0

In [18]:
merged_3.head()

Unnamed: 0,Health_Camp_ID,Category1,Category2,Category3,Patient_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Number_of_stall_visited,Last_Stall_Visited_Number,favourable_outcome
0,6527,Third,G,2,505849,19-Jun-05,0,0,0,0,0,5.0,4.0,1
1,6527,Third,G,2,489104,03-Jul-05,0,0,0,0,0,5.0,5.0,1
2,6527,Third,G,2,506902,27-Apr-05,0,0,0,0,0,,,0
3,6527,Third,G,2,490908,16-Jun-05,0,0,0,0,0,3.0,2.0,1
4,6527,Third,G,2,485901,08-May-05,0,0,0,0,0,,,0


In [19]:
frames = [merged_1,merged_2,merged_3]
common_cols = list(set.intersection(*(set(df.columns) for df in frames)))
merged_with_outcome = pd.concat([df[common_cols] for df in frames], ignore_index=True)

In [20]:
ordered_cols = list(merged.columns)
ordered_cols.append('favourable_outcome')
merged_with_outcome = merged_with_outcome[ordered_cols]

In [21]:
merged_with_outcome.head()

Unnamed: 0,Health_Camp_ID,Category1,Category2,Category3,Patient_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,favourable_outcome
0,6560,First,B,2,524451,,0,0,0,0,0,0
1,6560,First,B,2,491597,,172,66,0,1,16,0
2,6560,First,B,2,502372,,3,0,0,0,1,1
3,6560,First,B,2,509797,,3,0,0,0,1,1
4,6560,First,B,2,515060,,0,0,0,0,0,0


In [22]:
merged_t.head()

Unnamed: 0,Health_Camp_ID,Category1,Category2,Category3,Patient_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,6547,First,C,2,512007,03-Apr-06,0,0,0,0,0
1,6547,First,C,2,514006,03-Apr-06,0,0,0,0,0
2,6547,First,C,2,522667,03-Apr-06,0,0,0,0,0
3,6547,First,C,2,492554,03-Apr-06,0,0,0,0,0
4,6547,First,C,2,492577,03-Apr-06,2,0,0,0,2


In [23]:
patient_profile.head()

Unnamed: 0,Patient_ID,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category
0,516956,0,0,0,0,1.0,90.0,39.0,18-Jun-03,,Software Industry
1,507733,0,0,0,0,1.0,,40.0,20-Jul-03,H,Software Industry
2,508307,0,0,0,0,3.0,87.0,46.0,02-Nov-02,D,BFSI
3,512612,0,0,0,0,1.0,75.0,47.0,02-Nov-02,D,Education
4,521075,0,0,0,0,3.0,,80.0,24-Nov-02,H,Others


In [24]:
train_data = merged_with_outcome.merge(patient_profile,on='Patient_ID')
test_data = merged_t.merge(patient_profile,on='Patient_ID')

In [25]:
train_data['Registration_Date'] = pd.to_datetime(train_data['Registration_Date'])
train_data['First_Interaction'] = pd.to_datetime(train_data['First_Interaction'])
test_data['Registration_Date'] = pd.to_datetime(test_data['Registration_Date'])
test_data['First_Interaction'] = pd.to_datetime(test_data['First_Interaction'])

cat_attr = ['Health_Camp_ID','Category1','Category2','Category3','City_Type','Employer_Category']
num_attr = [i for i in test_data.columns if i not in cat_attr ]
num_attr.remove('First_Interaction')
num_attr.remove('Registration_Date')
num_attr.remove('Patient_ID')

train_data['Registration_DOW'] = train_data['Registration_Date'].dt.dayofweek.astype('category')
train_data['Registration_month'] = train_data['Registration_Date'].dt.month.astype('category')
train_data['Registration_day'] = train_data['Registration_Date'].dt.day.astype('category')

test_data['Registration_DOW'] = test_data['Registration_Date'].dt.dayofweek.astype('category')
test_data['Registration_month'] = test_data['Registration_Date'].dt.month.astype('category')
test_data['Registration_day'] = test_data['Registration_Date'].dt.day.astype('category')

train_data['Registration_DOW'] = train_data['Registration_Date'].dt.dayofweek.astype('category')
train_data['Registration_month'] = train_data['Registration_Date'].dt.month.astype('category')
train_data['Registration_day'] = train_data['Registration_Date'].dt.day.astype('category')
train_data['Days_Since_Registration'] = (dt.datetime.now().date() - train_data['First_Interaction'].dt.date).dt.days

test_data['Registration_DOW'] = test_data['Registration_Date'].dt.dayofweek.astype('category')
test_data['Registration_month'] = test_data['Registration_Date'].dt.month.astype('category')
test_data['Registration_day'] = test_data['Registration_Date'].dt.day.astype('category')
test_data['Days_Since_Registration'] = (dt.datetime.now().date() - test_data['First_Interaction'].dt.date).dt.days

train_data.drop(['First_Interaction','Registration_Date'],axis=1,inplace=True)
test_data.drop(['First_Interaction','Registration_Date'],axis=1,inplace=True)

train_data[cat_attr] = train_data[cat_attr].astype('category')
test_data[cat_attr] = test_data[cat_attr].astype('category')
train_data[num_attr] = train_data[num_attr].astype('float')
test_data[num_attr] = test_data[num_attr].astype('float')

In [26]:
train_data.dtypes

Health_Camp_ID             category
Category1                  category
Category2                  category
Category3                  category
Patient_ID                    int64
Var1                        float64
Var2                        float64
Var3                        float64
Var4                        float64
Var5                        float64
favourable_outcome            int64
Online_Follower             float64
LinkedIn_Shared             float64
Twitter_Shared              float64
Facebook_Shared             float64
Income                      float64
Education_Score             float64
Age                         float64
City_Type                  category
Employer_Category          category
Registration_DOW           category
Registration_month         category
Registration_day           category
Days_Since_Registration       int64
dtype: object

In [42]:
X_train, X_val, y_train, y_val = train_test_split( train_data.set_index('Patient_ID').drop('favourable_outcome',axis=1),train_data.favourable_outcome , test_size=0.2, random_state=42)
X_test = test_data.set_index('Patient_ID')

In [28]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_attr),
        ('cat', categorical_transformer, cat_attr)])

In [92]:
from sklearn.linear_model import LogisticRegression
lg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

In [93]:
lg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [94]:
lg.score(X_train, y_train),rf.score(X_val, y_val)

(0.7980638304938394, 0.8041312433581297)

In [100]:
output = test[['Patient_ID','Health_Camp_ID']]

In [105]:
output['Outcome'] = lg.predict_proba(X_test)[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [108]:
output.to_csv('output.csv',index=False)