In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
pd.set_option('display.max_columns', None)

In [56]:
df_act = pd.read_csv('C:/Users/salizadeh/OneDrive/Documents/Sara/GitHub/Repo/Datasets/RedHat_Business_Value/act_train.csv')
df_ppl = pd.read_csv('C:/Users/salizadeh/OneDrive/Documents/Sara/GitHub/Repo/Datasets/RedHat_Business_Value/people.csv')
df_act_test = pd.read_csv('C:/Users/salizadeh/OneDrive/Documents/Sara/GitHub/Repo/Datasets/RedHat_Business_Value/act_test.csv')

__Data Description__

<br>There are two separate data files: a people file and an activity file.

<br>The people file contains all of the unique people (and the corresponding characteristics) that have performed activities over time. Each row in the people file represents a unique person. Each person has a unique people_id.

<br>The activity file contains all of the unique activities (and the corresponding activity characteristics) that each person has performed over time. Each row in the activity file represents a unique activity performed by a person on a certain date. Each activity has a unique activity_id.The activity file contains several different categories of activities. Type 1 activities are different from type 2-7 activities because there are more known characteristics associated with type 1 activities (nine in total) than type 2-7 activities (which have only one associated characteristic).

<br> This project is to predict the potential business value of a person who has performed a specific activity. The business value outcome is defined by a yes/no field attached to each unique activity in the activity file. The outcome field indicates whether or not each person has completed the outcome within a fixed window of time after each unique activity was performed.

<br>All variables are categorical, with the exception of 'char_38' in the people file, which is a continuous numerical variable.

In [5]:
df_act.head()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,,,,type 1,0
3,ppl_100,act2_3651215,2023-08-04,type 2,,,,,,,,,,type 1,0
4,ppl_100,act2_4109017,2023-08-26,type 2,,,,,,,,,,type 1,0


In [6]:
df_ppl.head()

Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,type 2,type 4,False,False,True,True,False,False,False,True,False,False,False,False,False,True,False,True,True,True,False,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,type 2,type 2,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,99
3,ppl_100004,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,True,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,76
4,ppl_100006,type 2,group 6534,type 3,2022-07-27,type 40,type 25,type 9,type 3,type 8,type 2,type 2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,True,False,84


In [58]:
def preprocess_act(data, train_set=True):
    
    if train_set:
        data_v2 = data.drop(['activity_id', 'date', 'outcome'], axis=1)
    else:
        data_v2 = data.drop(['activity_id', 'date'], axis=1)
    
    #fill the null values with type 0
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='constant', fill_value='type 0')
    data_v2 = pd.DataFrame(imputer.fit_transform(data_v2), index=data_v2.index, columns =data_v2.columns)
    
    #get the numbers only from people_id
    data_v2['people_id'] = data_v2['people_id'].apply(lambda x: int(float(x.split('_')[1])))
    
    #get the numbers only from characters
    for col in list(data_v2.columns)[1:]:
        data_v2[col] = data_v2[col].apply(lambda x: int(x.split(' ')[1]))
    
    #get the year, month and day from date
    data_v2['year'] = data.date.apply(lambda x: int(x.split('-')[0]))
    data_v2['month'] = data.date.apply(lambda x: int(x.split('-')[1]))
    data_v2['day'] = data.date.apply(lambda x: int(x.split('-')[2]))
    
    if train_set:
        data_v2 = pd.concat([data_v2, data.activity_id, data.outcome], axis=1)
    else:
        data_v2 = pd.concat([data_v2, data.activity_id], axis=1)

    return data_v2


def preprocess_ppl(data):
    
    data_v2 = data.drop('date', axis=1)
    
    #get the numbers only from people_id
    data_v2['people_id'] = data_v2['people_id'].apply(lambda x: int(float(x.split('_')[1])))
    
    #get the numbers only from characters
    for col in list(data_v2.select_dtypes('object').columns):
        data_v2[col] = data_v2[col].apply(lambda x: int(x.split(' ')[1]))
        
    #change the boolean columns to int
    for col in list(data_v2.select_dtypes('boolean').columns):
        data_v2[col] = data_v2[col].apply(lambda x: 1 if x==True else 0)
    
    #get the year, month and day from date
    data_v2['year'] = data.date.apply(lambda x: int(x.split('-')[0]))
    data_v2['month'] = data.date.apply(lambda x: int(x.split('-')[1]))
    data_v2['day'] = data.date.apply(lambda x: int(x.split('-')[2]))
    
    return data_v2    

In [57]:
df_act = preprocess_act(df_act) 
df_ppl = preprocess_ppl(df_ppl) 

In [59]:
df = df_act.merge(df_ppl, left_on='people_id', right_on='people_id', suffixes=('_act', '_ppl'))

In [60]:
df.head()

Unnamed: 0,people_id,activity_category,char_1_act,char_2_act,char_3_act,char_4_act,char_5_act,char_6_act,char_7_act,char_8_act,char_9_act,char_10_act,year_act,month_act,day_act,activity_id,outcome,char_1_ppl,group_1,char_2_ppl,char_3_ppl,char_4_ppl,char_5_ppl,char_6_ppl,char_7_ppl,char_8_ppl,char_9_ppl,char_10_ppl,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,year_ppl,month_ppl,day_ppl
0,100,4,0,0,0,0,0,0,0,0,0,76,2023,8,26,act2_1734928,0,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36,2021,6,29
1,100,2,0,0,0,0,0,0,0,0,0,1,2022,9,27,act2_2434093,0,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36,2021,6,29
2,100,2,0,0,0,0,0,0,0,0,0,1,2022,9,27,act2_3404049,0,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36,2021,6,29
3,100,2,0,0,0,0,0,0,0,0,0,1,2023,8,4,act2_3651215,0,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36,2021,6,29
4,100,2,0,0,0,0,0,0,0,0,0,1,2023,8,26,act2_4109017,0,2,17304,2,5,5,5,3,11,2,2,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36,2021,6,29


In [61]:
abs(df.corr().outcome).sort_values()

activity_category    0.002709
day_ppl              0.006837
char_5_act           0.010880
char_9_act           0.011768
char_3_act           0.012905
char_2_act           0.013320
char_8_act           0.014253
char_1_act           0.014664
char_7_act           0.016196
char_4_act           0.016641
char_6_act           0.016743
year_ppl             0.024475
char_10_act          0.027740
month_ppl            0.028531
people_id            0.032564
char_9_ppl           0.040704
day_act              0.052264
char_8_ppl           0.057736
month_act            0.059040
year_act             0.067263
char_5_ppl           0.080958
char_4_ppl           0.114526
char_7_ppl           0.142616
char_3_ppl           0.142767
char_1_ppl           0.159851
char_26              0.172723
char_29              0.187893
char_30              0.210634
char_18              0.211444
char_33              0.213261
char_35              0.215065
char_24              0.221353
char_12              0.224944
char_11   

In [62]:
X = df.drop(['activity_id', 'outcome'], axis=1)
y = df.outcome

In [63]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
#Train
randmodel = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=1)
#Evaluate
from sklearn.model_selection import cross_val_score
CVS = cross_val_score(randmodel, X, y, scoring='accuracy', cv=3)
print("="*60, "\n Cross Validation:\n", CVS, "\nMean accuracy of cross-validation:\n", CVS.mean())

 Cross Validation:
 [0.84683608 0.85229442 0.81676474] 
Mean accuracy of cross-validation:
 0.8386317478184108


In [65]:
df_act_test = preprocess_act(df_act_test, train_set=False)
df_test = df_act_test.merge(df_ppl, left_on='people_id', right_on='people_id', suffixes=('_act', '_ppl'))
X_test = df_test.drop('activity_id', axis=1)
X_test.head()

Unnamed: 0,people_id,activity_category,char_1_act,char_2_act,char_3_act,char_4_act,char_5_act,char_6_act,char_7_act,char_8_act,char_9_act,char_10_act,year_act,month_act,day_act,char_1_ppl,group_1,char_2_ppl,char_3_ppl,char_4_ppl,char_5_ppl,char_6_ppl,char_7_ppl,char_8_ppl,char_9_ppl,char_10_ppl,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,year_ppl,month_ppl,day_ppl
0,100004,1,5,10,5,1,6,1,1,7,4,0,2022,7,20,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,2022,7,20
1,100004,5,0,0,0,0,0,0,0,0,0,682,2022,7,20,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,2022,7,20
2,10001,1,12,1,5,4,6,1,1,13,10,0,2022,10,14,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,2022,10,14
3,10001,1,20,10,5,4,6,1,1,5,5,0,2022,11,27,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,2022,10,14
4,10001,5,0,0,0,0,0,0,0,0,0,3015,2022,10,15,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,2022,10,14


In [66]:
#Train
randmodel.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [67]:
#Predict
randmodel_proba = randmodel.predict_proba(X_test)
randmodel_predict = randmodel_proba[:,1]
print(randmodel_proba)
#save the predict output
output = pd.DataFrame({'activity_id': df_test.activity_id, 'outcome': randmodel_predict})
output.to_csv('RedHat-RanForest.csv', index=False)
print("Your submission was successfully saved!")

[[0.38440367 0.61559633]
 [0.38565961 0.61434039]
 [0.27312539 0.72687461]
 ...
 [0.97211396 0.02788604]
 [0.97211396 0.02788604]
 [0.9768816  0.0231184 ]]
Your submission was successfully saved!
