In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
from datetime import datetime

## Data

In [2]:
# Load the activity records used for training (this table carries the `outcome` target column).
act_train = pd.read_csv('act_train.csv')

In [3]:
# Load the people table; it is joined to the activity records on `people_id` later on.
people = pd.read_csv('people.csv')

In [4]:
# Preview the first rows of the people table.
people.head()

Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,...,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,...,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,...,False,False,True,True,True,True,False,True,True,99
3,ppl_100004,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,...,True,True,True,True,True,True,True,True,True,76
4,ppl_100006,type 2,group 6534,type 3,2022-07-27,type 40,type 25,type 9,type 3,type 8,...,False,False,True,False,False,False,True,True,False,84


In [5]:
# Preview the first rows of the training activity table.
act_train.head()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,,,,type 1,0
3,ppl_100,act2_3651215,2023-08-04,type 2,,,,,,,,,,type 1,0
4,ppl_100,act2_4109017,2023-08-26,type 2,,,,,,,,,,type 1,0


## Data Analysis
Check the shape of each dataframe and the percentage of null values in every column.

In [6]:
# (rows, columns) of the people table
print(people.shape)

# Percentage of missing values in each column
people.isnull().sum().mul(100).div(people.shape[0])

(189118, 41)


people_id    0.0
char_1       0.0
group_1      0.0
char_2       0.0
date         0.0
char_3       0.0
char_4       0.0
char_5       0.0
char_6       0.0
char_7       0.0
char_8       0.0
char_9       0.0
char_10      0.0
char_11      0.0
char_12      0.0
char_13      0.0
char_14      0.0
char_15      0.0
char_16      0.0
char_17      0.0
char_18      0.0
char_19      0.0
char_20      0.0
char_21      0.0
char_22      0.0
char_23      0.0
char_24      0.0
char_25      0.0
char_26      0.0
char_27      0.0
char_28      0.0
char_29      0.0
char_30      0.0
char_31      0.0
char_32      0.0
char_33      0.0
char_34      0.0
char_35      0.0
char_36      0.0
char_37      0.0
char_38      0.0
dtype: float64

In [7]:
# (rows, columns) of the training activity table
print(act_train.shape)

# Percentage of missing values in each column
act_train.isnull().sum().mul(100).div(act_train.shape[0])

(2197291, 15)


people_id             0.000000
activity_id           0.000000
date                  0.000000
activity_category     0.000000
char_1               92.826849
char_2               92.826849
char_3               92.826849
char_4               92.826849
char_5               92.826849
char_6               92.826849
char_7               92.826849
char_8               92.826849
char_9               92.826849
char_10               7.173151
outcome               0.000000
dtype: float64

As we can see, about 93% of the values in columns char_1 to char_9 are null, so we will drop these columns.

## Data Preprocessing
Drop the unnecessary columns and preprocess the data further so that it can be used for model training.

In [8]:
def preprocess_activity_data(data, people_data=None):
    """Clean an activity table and join it with the people table.

    The activity frame passed in is modified in place (sparse columns
    dropped, nulls filled, columns renamed); this side effect is kept for
    backward compatibility. The merged result is returned as a new frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Activity records containing `people_id`, `date` and `char_1` ..
        `char_10` columns.
    people_data : pandas.DataFrame, optional
        People table to join on `people_id`. When omitted, the
        notebook-level `people` frame is used.

    Returns
    -------
    pandas.DataFrame
        Inner join of the cleaned activity records with the people table.
        Activities whose `people_id` has no match in the people table are
        not present in the result.
    """
    if people_data is None:
        people_data = people

    # char_1 .. char_9 are almost entirely null in the activity data, so they are removed
    sparse_columns = [f'char_{i}' for i in range(1, 10)]
    data.drop(columns=sparse_columns, inplace=True)

    # Fill missing char_10 values with the most frequent value of the column.
    # The result is assigned back explicitly: calling fillna(inplace=True) on
    # data['char_10'] is chained assignment and does not update the frame
    # when pandas Copy-on-Write is active.
    modes = data['char_10'].mode()
    if not modes.empty:  # empty / all-null column: nothing to fill with
        data['char_10'] = data['char_10'].fillna(modes.iloc[0])

    # Rename so the activity columns do not clash with the people columns of the same name
    data.rename(columns={'date': 'data_activity', 'char_10': 'activity_type'}, inplace=True)

    # Combine the activity data and people data into a single dataframe
    return data.merge(people_data, on='people_id', how='inner')

After preprocessing, the check below shows that there are no null values left in the data.

In [9]:
# Preprocessing activity training  data
train_data = preprocess_activity_data(act_train)

# Checking for null values after preprocessing
100*act_train.isnull().sum()/act_train.shape[0]

people_id            0.0
activity_id          0.0
data_activity        0.0
activity_category    0.0
activity_type        0.0
outcome              0.0
dtype: float64

In [11]:
# Preview the merged activity + people frame.
train_data.head()

Unnamed: 0,people_id,activity_id,data_activity,activity_category,activity_type,outcome,char_1,group_1,char_2,date,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,act2_1734928,2023-08-26,type 4,type 76,0,type 2,group 17304,type 2,2021-06-29,...,False,True,True,False,False,True,True,True,False,36
1,ppl_100,act2_2434093,2022-09-27,type 2,type 1,0,type 2,group 17304,type 2,2021-06-29,...,False,True,True,False,False,True,True,True,False,36
2,ppl_100,act2_3404049,2022-09-27,type 2,type 1,0,type 2,group 17304,type 2,2021-06-29,...,False,True,True,False,False,True,True,True,False,36
3,ppl_100,act2_3651215,2023-08-04,type 2,type 1,0,type 2,group 17304,type 2,2021-06-29,...,False,True,True,False,False,True,True,True,False,36
4,ppl_100,act2_4109017,2023-08-26,type 2,type 1,0,type 2,group 17304,type 2,2021-06-29,...,False,True,True,False,False,True,True,True,False,36


Now we will preprocess the combined dataframe

In [12]:
# Count how many columns there are of each dtype.
train_data.dtypes.value_counts()

bool      28
object    16
int64      2
Name: count, dtype: int64

In [23]:
# True when every activity_id occurs exactly once.
# This cell has to run before the column is dropped further down.
train_data['activity_id'].is_unique

KeyError: 'activity_id'

The activity id is unique for each activity, so it carries no predictive information and will be dropped.

In [15]:
def preprocess_data(data, encoders=None):
    """Convert the merged activity/people frame to numeric features, in place.

    Steps: boolean columns become 0/1, `activity_category` ('type N')
    becomes the integer N, the `date` column is parsed and expanded into
    `year`, `month`, `day` and `weekend`, and every remaining text column
    is replaced by integer codes (position of the value among the sorted
    unique values, the same numbering a label encoder fitted on that
    column would produce).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least `activity_category` and `date` columns.
        Modified in place.
    encoders : dict, optional
        Mapping of column name to the ordered categories learned by a
        previous call. Columns found in it are encoded with those
        categories, so the same text value gets the same code in both
        frames; values not seen before are encoded as -1. Columns missing
        from the mapping are fitted on `data`. When omitted, every text
        column is fitted on `data` alone.

    Returns
    -------
    dict
        Column name -> pandas.Index of categories used for that column.
        Pass it as `encoders` when preprocessing another frame that must
        share the same encoding.
    """
    # Booleans -> 0/1. Only bool-dtype columns are touched; an explicit
    # astype avoids a whole-frame replace() and its dtype downcasting.
    for column in data.select_dtypes(include='bool').columns:
        data[column] = data[column].astype('int64')

    # 'type N' -> N. The result is assigned back explicitly because an
    # inplace replace on data['activity_category'] is chained assignment.
    category_codes = {f'type {number}': number for number in range(1, 8)}
    mapped = data['activity_category'].map(category_codes)
    if mapped.notna().all():
        data['activity_category'] = mapped.astype('int64')
    elif mapped.notna().any():
        # Values without a known code are left untouched
        data['activity_category'] = (
            data['activity_category'].astype(object).where(mapped.isna(), mapped)
        )

    # Converting into datetime datatype so that new features can be generated
    data['date'] = pd.to_datetime(data['date'])

    # Creating new features
    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month
    data['day'] = data['date'].dt.day
    data['weekend'] = (data['date'].dt.weekday >= 5).astype(int)

    # Collecting all the categorical (text) features, whether stored as
    # object dtype or as a dedicated string dtype
    categorical_features = [
        feat for feat in data.columns
        if pd.api.types.is_object_dtype(data[feat].dtype)
        or pd.api.types.is_string_dtype(data[feat].dtype)
    ]

    # Converting the categorical features to integer codes
    fitted = dict(encoders) if encoders else {}
    for feat in categorical_features:
        if feat in fitted:
            categories = pd.Index(fitted[feat])
            codes = categories.get_indexer(data[feat])  # -1 for unseen values
        else:
            codes, uniques = pd.factorize(data[feat], sort=True)
            fitted[feat] = pd.Index(uniques)
        data[feat] = codes.astype('int64')

    return fitted


# Encode the training frame; preprocess_data modifies train_data in place.
preprocess_data(train_data)

In [16]:
# Every column whose dtype is not object takes part in the correlation matrix
numerical_features = [name for name, dtype in train_data.dtypes.items() if dtype != 'O']
correlations = train_data[numerical_features].corr()
correlations

Unnamed: 0,people_id,activity_id,data_activity,activity_category,activity_type,outcome,char_1,group_1,char_2,date,...,char_33,char_34,char_35,char_36,char_37,char_38,year,month,day,weekend
people_id,1.0,0.00322,-0.009011,-0.004887,-0.005223,0.001385,0.005734,0.011083,0.036055,0.012826,...,0.041938,0.02772,0.025553,0.037757,0.037999,0.01066,-0.000894,0.043564,-0.060316,-0.07595
activity_id,0.00322,1.0,-0.028132,0.189133,0.091169,0.007656,0.026232,0.012204,0.034769,-0.0391,...,0.002125,0.002305,0.001123,0.002409,0.002489,0.002939,-0.039264,0.006339,-0.001569,0.012281
data_activity,-0.009011,-0.028132,1.0,-0.187575,-0.037301,0.052349,-0.031817,0.016329,-0.009604,0.283452,...,-0.013769,-0.007056,-0.007091,-0.014664,-0.01743,0.069174,0.313649,-0.128025,0.001417,-0.05229
activity_category,-0.004887,0.189133,-0.187575,1.0,0.673256,-0.002709,0.161506,-0.019328,0.12387,0.149447,...,0.038061,0.069761,0.044875,0.076847,0.084112,0.002069,0.089172,0.147618,0.036847,-0.01344
activity_type,-0.005223,0.091169,-0.037301,0.673256,1.0,-0.014892,0.108936,-0.018399,0.073435,0.129605,...,0.018591,0.032,0.022684,0.037643,0.046357,-0.018978,0.092176,0.086995,0.014872,-0.005051
outcome,0.001385,0.007656,0.052349,-0.002709,-0.014892,1.0,-0.159851,0.361464,0.351574,0.035935,...,0.213261,0.313514,0.215065,0.316339,0.289306,0.676527,0.024475,0.028531,-0.006837,-0.061584
char_1,0.005734,0.026232,-0.031817,0.161506,0.108936,-0.159851,1.0,-0.221415,0.706502,0.145184,...,-0.010824,-0.020099,-0.011744,-0.017343,0.002253,-0.20844,0.132584,0.013683,0.013285,0.019244
group_1,0.011083,0.012204,0.016329,-0.019328,-0.018399,0.361464,-0.221415,1.0,0.178562,0.04601,...,0.074907,0.166255,0.07264,0.172975,0.157971,0.429181,0.050602,-0.016456,-0.03406,-0.037539
char_2,0.036055,0.034769,-0.009604,0.12387,0.073435,0.351574,0.706502,0.178562,1.0,0.138943,...,0.148362,0.218604,0.139853,0.217854,0.209572,0.406049,0.134633,-0.003341,-0.046548,-0.067837
date,0.012826,-0.0391,0.283452,0.149447,0.129605,0.035935,0.145184,0.04601,0.138943,1.0,...,-0.029343,0.008195,-0.015676,0.001054,-0.000144,0.035606,0.937094,0.036645,-0.021409,-0.010634


Since people_id, the date columns and activity_category have a negligible correlation with the outcome, we will drop them.

In [17]:
def dropping_col(data):
    """Remove the identifier, date and activity-category columns from `data` in place."""
    unused_columns = ['date', 'data_activity', 'activity_id', 'people_id', 'activity_category']
    data.drop(columns=unused_columns, inplace=True)

# Remove the unused columns from the training frame (train_data is modified in place).
dropping_col(train_data)

In [18]:
# Preview the training frame after encoding and dropping columns.
train_data.head()

Unnamed: 0,activity_type,outcome,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,...,char_33,char_34,char_35,char_36,char_37,char_38,year,month,day,weekend
0,5382,0,1,4691,1,38,20,4,2,2,...,0,1,1,1,0,36,2021,6,29,0
1,0,0,1,4691,1,38,20,4,2,2,...,0,1,1,1,0,36,2021,6,29,0
2,0,0,1,4691,1,38,20,4,2,2,...,0,1,1,1,0,36,2021,6,29,0
3,0,0,1,4691,1,38,20,4,2,2,...,0,1,1,1,0,36,2021,6,29,0
4,0,0,1,4691,1,38,20,4,2,2,...,0,1,1,1,0,36,2021,6,29,0


## Test data

In [19]:
# Load the activity records to predict on.
act_test = pd.read_csv('act_test.csv')

In [20]:
# Preview the first rows of the test activity table.
act_test.head()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10
0,ppl_100004,act1_249281,2022-07-20,type 1,type 5,type 10,type 5,type 1,type 6,type 1,type 1,type 7,type 4,
1,ppl_100004,act2_230855,2022-07-20,type 5,,,,,,,,,,type 682
2,ppl_10001,act1_240724,2022-10-14,type 1,type 12,type 1,type 5,type 4,type 6,type 1,type 1,type 13,type 10,
3,ppl_10001,act1_83552,2022-11-27,type 1,type 20,type 10,type 5,type 4,type 6,type 1,type 1,type 5,type 5,
4,ppl_10001,act2_1043301,2022-10-15,type 5,,,,,,,,,,type 3015


In [21]:
# Apply the same cleaning and join to the test activities.
# NOTE(review): missing char_10 values are filled with the mode of the frame
# passed in, so the fill value here comes from the test set, not from training.
# NOTE(review): the join is an inner join, so any test activity whose people_id
# is absent from `people` would be dropped — confirm no rows are lost.
test_data = preprocess_activity_data(act_test)

In [22]:
# Keep the activity ids for the submission file; the column is dropped from test_data below.
act_id = test_data['activity_id']

In [24]:
# Encode the test frame and drop the unused columns (both calls modify test_data in place).
# NOTE(review): preprocess_data is called here with nothing carried over from the
# training run, so the integer codes for text columns are derived from the test
# frame alone. The same text value can therefore get a different code than it did
# in train_data — verify before trusting the predictions.
preprocess_data(test_data)
dropping_col(test_data)

## Scaling Data
Some numerical features take very large values, so we will standardize the data using `StandardScaler`.

In [25]:
# Splitting into dependent and independent features
y_train = train_data['outcome']
X_train = train_data
X_train.drop('outcome', axis = 1, inplace=True)

In [26]:
# The test frame has no `outcome` column, so it is used as the feature matrix as is.
# X_test is another name for test_data here, not a copy.
X_test = test_data

In [27]:
# Learn the scaling statistics from the training features only,
# then apply the same transformation to both feature sets
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Model Training
We will train two classification models, a Random Forest and an XGBoost classifier, and use their predictions on the test data. (Logistic Regression and Decision Tree are imported above but are not trained in this notebook.)

In [33]:
# A fixed random_state makes the forest reproducible across runs;
# n_jobs=-1 builds the trees on all available cores.
classifiers = {'forest' : RandomForestClassifier(random_state=42, n_jobs=-1)}

In [28]:
# XGBoost classifier with the library's default hyperparameters.
booster = xgb.XGBClassifier()

In [29]:
# Fit the XGBoost classifier on the scaled training features.
booster.fit(X_train, y_train)

In [34]:
# Fit the random forest on the scaled training features.
classifiers['forest'].fit(X_train, y_train)

In [30]:
# XGBoost predictions for the test activities; these are the ones written to the submission file.
y_pred_xg = booster.predict(X_test)

In [35]:
# Random forest predictions for the test activities.
y_pred_forest = classifiers['forest'].predict(X_test)

In [36]:
# NOTE(review): this scores the forest on the same rows it was fitted on, so it
# measures fit to the training data, not how well the model generalizes. No
# held-out split is evaluated anywhere in this notebook.
print(f"accuracy on training data : {classifiers['forest'].score(X_train, y_train)}")

accuracy on training data : 0.9833768035276165


In [37]:
# Pair every test activity id with its XGBoost prediction and write the submission file
submission_columns = {'activity_id': act_id, 'outcome': y_pred_xg}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)