In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df_act = pd.read_csv('C:/Users/salizadeh/OneDrive/Documents/Sara/GitHub/Repo/Datasets/RedHat_Business_Value/act_train.csv')
df_ppl = pd.read_csv('C:/Users/salizadeh/OneDrive/Documents/Sara/GitHub/Repo/Datasets/RedHat_Business_Value/people.csv')
df_act_test = pd.read_csv('C:/Users/salizadeh/OneDrive/Documents/Sara/GitHub/Repo/Datasets/RedHat_Business_Value/act_test.csv')

__Data Description__

<br>There are two separate data files: a people file and an activity file.

<br>The people file contains all of the unique people (and the corresponding characteristics) that have performed activities over time. Each row in the people file represents a unique person. Each person has a unique people_id.

<br>The activity file contains all of the unique activities (and the corresponding activity characteristics) that each person has performed over time. Each row in the activity file represents a unique activity performed by a person on a certain date. Each activity has a unique activity_id.The activity file contains several different categories of activities. Type 1 activities are different from type 2-7 activities because there are more known characteristics associated with type 1 activities (nine in total) than type 2-7 activities (which have only one associated characteristic).

<br> This project is to predict the potential business value of a person who has performed a specific activity. The business value outcome is defined by a yes/no field attached to each unique activity in the activity file. The outcome field indicates whether or not each person has completed the outcome within a fixed window of time after each unique activity was performed.

<br>All variables are categorical, with the exception of 'char_38' in the people file, which is a continuous numerical variable.

In [5]:
df_act.head()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,,,,type 1,0
3,ppl_100,act2_3651215,2023-08-04,type 2,,,,,,,,,,type 1,0
4,ppl_100,act2_4109017,2023-08-26,type 2,,,,,,,,,,type 1,0


In [6]:
df_ppl.head()

Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,type 2,type 4,False,False,True,True,False,False,False,True,False,False,False,False,False,True,False,True,True,True,False,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,type 2,type 2,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,99
3,ppl_100004,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,True,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,76
4,ppl_100006,type 2,group 6534,type 3,2022-07-27,type 40,type 25,type 9,type 3,type 8,type 2,type 2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,True,False,84


In [11]:
list(df_act.columns)

['people_id',
 'activity_id',
 'date',
 'activity_category',
 'char_1',
 'char_2',
 'char_3',
 'char_4',
 'char_5',
 'char_6',
 'char_7',
 'char_8',
 'char_9',
 'char_10',
 'outcome']

In [19]:
def preprocess_act(data, train_set=True):
    
    if train_set:
        data_v2 = data.drop(['activity_id', 'date', 'outcome'], axis=1)
    else:
        data_v2 = data.drop(['activity_id', 'date'], axis=1)
    
    #fill the null values with type 0
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='constant', fill_value='type 0')
    data_v2 = pd.DataFrame(imputer.fit_transform(data_v2), index=data_v2.index, columns =data_v2.columns)
    
    #get the numbers only from people_id
    data_v2['people_id'] = data_v2['people_id'].apply(lambda x: int(float(x.split('_')[1])))
    
    #get the numbers only from characters
    for col in list(data_v2.columns)[1:]:
        print(col)
        data_v2[col] = data_v2[col].apply(lambda x: int(x.split(' ')[1]))
    
    data = pd.concat([data_v2, data.activity_id, data.date, data.outcome], axis=1)
    return data


def preprocess_ppl(data):
    
    data_v2 = data.drop('date', axis=1)
    
    #fill the null values with type 0
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='constant', fill_value='type 0')
    data_v2 = pd.DataFrame(imputer.fit_transform(data_v2), index=data_v2.index, columns =data_v2.columns)
    
    #get the numbers only from people_id
    data_v2['people_id'] = data_v2['people_id'].apply(lambda x: int(float(x.split('_')[1])))
    
    #get the numbers only from characters
    for col in list(data_v2.select columns)[1:]:
        print(col)
        data_v2[col] = data_v2[col].apply(lambda x: int(x.split(' ')[1]))
    
    data = pd.concat([data_v2, data.activity_id, data.date, data.outcome], axis=1)
    return data
        

In [20]:
df_act = preprocess_act(df_act)

activity_category
char_1
char_2
char_3
char_4
char_5
char_6
char_7
char_8
char_9
char_10


In [23]:
df_act.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197291 entries, 0 to 2197290
Data columns (total 15 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   people_id          int64 
 1   activity_category  int64 
 2   char_1             int64 
 3   char_2             int64 
 4   char_3             int64 
 5   char_4             int64 
 6   char_5             int64 
 7   char_6             int64 
 8   char_7             int64 
 9   char_8             int64 
 10  char_9             int64 
 11  char_10            int64 
 12  activity_id        object
 13  date               object
 14  outcome            int64 
dtypes: int64(13), object(2)
memory usage: 251.5+ MB


In [7]:
df = df_act.merge(df_ppl, left_on='people_id', right_on='people_id', suffixes=('_act', '_ppl'))

In [8]:
df.head()

Unnamed: 0,people_id,activity_id,date_act,activity_category,char_1_act,char_2_act,char_3_act,char_4_act,char_5_act,char_6_act,char_7_act,char_8_act,char_9_act,char_10_act,outcome,char_1_ppl,group_1,char_2_ppl,date_ppl,char_3_ppl,char_4_ppl,char_5_ppl,char_6_ppl,char_7_ppl,char_8_ppl,char_9_ppl,char_10_ppl,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,,,,type 1,0,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
3,ppl_100,act2_3651215,2023-08-04,type 2,,,,,,,,,,type 1,0,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
4,ppl_100,act2_4109017,2023-08-26,type 2,,,,,,,,,,type 1,0,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2197291 entries, 0 to 2197290
Data columns (total 55 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   people_id          object
 1   activity_id        object
 2   date_act           object
 3   activity_category  object
 4   char_1_act         object
 5   char_2_act         object
 6   char_3_act         object
 7   char_4_act         object
 8   char_5_act         object
 9   char_6_act         object
 10  char_7_act         object
 11  char_8_act         object
 12  char_9_act         object
 13  char_10_act        object
 14  outcome            int64 
 15  char_1_ppl         object
 16  group_1            object
 17  char_2_ppl         object
 18  date_ppl           object
 19  char_3_ppl         object
 20  char_4_ppl         object
 21  char_5_ppl         object
 22  char_6_ppl         object
 23  char_7_ppl         object
 24  char_8_ppl         object
 25  char_9_ppl         object
 26  char_10_ppl   

In [10]:
droped_df = df.drop(['activity_id', 'outcome'], axis=1)

In [11]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='constant', fill_value=0)
imputed_df = pd.DataFrame(imputer.fit_transform(droped_df), index=droped_df.index, columns =droped_df.columns)

In [12]:
imputed_df.head()

Unnamed: 0,people_id,date_act,activity_category,char_1_act,char_2_act,char_3_act,char_4_act,char_5_act,char_6_act,char_7_act,char_8_act,char_9_act,char_10_act,char_1_ppl,group_1,char_2_ppl,date_ppl,char_3_ppl,char_4_ppl,char_5_ppl,char_6_ppl,char_7_ppl,char_8_ppl,char_9_ppl,char_10_ppl,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,2023-08-26,type 4,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 76,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
1,ppl_100,2022-09-27,type 2,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 1,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
2,ppl_100,2022-09-27,type 2,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 1,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
3,ppl_100,2023-08-04,type 2,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 1,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
4,ppl_100,2023-08-26,type 2,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 0,type 1,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36


In [13]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
ordinal_df = pd.DataFrame(encoder.fit_transform(imputed_df), index=imputed_df.index, columns =imputed_df.columns)

In [14]:
ordinal_df.head()

Unnamed: 0,people_id,date_act,activity_category,char_1_act,char_2_act,char_3_act,char_4_act,char_5_act,char_6_act,char_7_act,char_8_act,char_9_act,char_10_act,char_1_ppl,group_1,char_2_ppl,date_ppl,char_3_ppl,char_4_ppl,char_5_ppl,char_6_ppl,char_7_ppl,char_8_ppl,char_9_ppl,char_10_ppl,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,0.0,405.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5383.0,1.0,4691.0,1.0,405.0,38.0,20.0,4.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,36.0
1,0.0,72.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4691.0,1.0,405.0,38.0,20.0,4.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,36.0
2,0.0,72.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4691.0,1.0,405.0,38.0,20.0,4.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,36.0
3,0.0,383.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4691.0,1.0,405.0,38.0,20.0,4.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,36.0
4,0.0,405.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4691.0,1.0,405.0,38.0,20.0,4.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,36.0


In [15]:
df = pd.concat([ordinal_df, df.activity_id, df.outcome], axis=1)

In [16]:
abs(df.corr().outcome).sort_values()

people_id            0.001385
activity_category    0.002709
char_4_ppl           0.003123
char_2_act           0.007398
char_9_act           0.008147
char_5_act           0.010880
char_8_act           0.012641
char_3_act           0.012976
char_1_act           0.014426
char_10_act          0.014889
char_7_act           0.016196
char_4_act           0.016641
char_6_act           0.016743
date_ppl             0.035940
char_3_ppl           0.038763
char_9_ppl           0.040704
char_7_ppl           0.047413
date_act             0.052349
char_8_ppl           0.057736
char_5_ppl           0.080958
char_1_ppl           0.159851
char_26              0.172723
char_29              0.187893
char_30              0.210634
char_18              0.211444
char_33              0.213261
char_35              0.215065
char_24              0.221353
char_12              0.224944
char_11              0.226259
char_6_ppl           0.234444
char_27              0.234790
char_14              0.247602
char_31   

In [17]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
df_drop = df.drop(['activity_id', 'outcome'], axis=1)
df_scaled = pd.DataFrame(scale.fit_transform(df_drop), index=df_drop.index, columns=df_drop.columns)
df_scaled.head()

Unnamed: 0,people_id,date_act,activity_category,char_1_act,char_2_act,char_3_act,char_4_act,char_5_act,char_6_act,char_7_act,char_8_act,char_9_act,char_10_act,char_1_ppl,group_1,char_2_ppl,date_ppl,char_3_ppl,char_4_ppl,char_5_ppl,char_6_ppl,char_7_ppl,char_8_ppl,char_9_ppl,char_10_ppl,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,-1.73083,2.063348,0.76295,-0.210644,-0.227097,-0.244006,-0.258705,-0.232454,-0.256202,-0.23411,-0.259013,-0.237667,2.475709,0.321498,-0.678519,-0.6809,-1.059867,0.806358,0.556501,-0.279203,0.181871,-1.214614,-0.706163,-0.76391,1.773358,-0.510172,-0.553939,1.364357,1.708662,-0.59582,1.651576,-0.623169,-0.464462,-0.610006,-0.51366,1.650164,-0.606963,-0.630055,-0.469734,-0.682679,-0.449736,1.812214,1.636666,-0.441534,1.976494,1.65069,-0.605081,-0.52191,1.375441,1.991719,1.430494,-0.609448,-0.387427
1,-1.73083,-0.960035,-0.759042,-0.210644,-0.227097,-0.244006,-0.258705,-0.232454,-0.256202,-0.23411,-0.259013,-0.237667,-0.730955,0.321498,-0.678519,-0.6809,-1.059867,0.806358,0.556501,-0.279203,0.181871,-1.214614,-0.706163,-0.76391,1.773358,-0.510172,-0.553939,1.364357,1.708662,-0.59582,1.651576,-0.623169,-0.464462,-0.610006,-0.51366,1.650164,-0.606963,-0.630055,-0.469734,-0.682679,-0.449736,1.812214,1.636666,-0.441534,1.976494,1.65069,-0.605081,-0.52191,1.375441,1.991719,1.430494,-0.609448,-0.387427
2,-1.73083,-0.960035,-0.759042,-0.210644,-0.227097,-0.244006,-0.258705,-0.232454,-0.256202,-0.23411,-0.259013,-0.237667,-0.730955,0.321498,-0.678519,-0.6809,-1.059867,0.806358,0.556501,-0.279203,0.181871,-1.214614,-0.706163,-0.76391,1.773358,-0.510172,-0.553939,1.364357,1.708662,-0.59582,1.651576,-0.623169,-0.464462,-0.610006,-0.51366,1.650164,-0.606963,-0.630055,-0.469734,-0.682679,-0.449736,1.812214,1.636666,-0.441534,1.976494,1.65069,-0.605081,-0.52191,1.375441,1.991719,1.430494,-0.609448,-0.387427
3,-1.73083,1.863605,-0.759042,-0.210644,-0.227097,-0.244006,-0.258705,-0.232454,-0.256202,-0.23411,-0.259013,-0.237667,-0.730955,0.321498,-0.678519,-0.6809,-1.059867,0.806358,0.556501,-0.279203,0.181871,-1.214614,-0.706163,-0.76391,1.773358,-0.510172,-0.553939,1.364357,1.708662,-0.59582,1.651576,-0.623169,-0.464462,-0.610006,-0.51366,1.650164,-0.606963,-0.630055,-0.469734,-0.682679,-0.449736,1.812214,1.636666,-0.441534,1.976494,1.65069,-0.605081,-0.52191,1.375441,1.991719,1.430494,-0.609448,-0.387427
4,-1.73083,2.063348,-0.759042,-0.210644,-0.227097,-0.244006,-0.258705,-0.232454,-0.256202,-0.23411,-0.259013,-0.237667,-0.730955,0.321498,-0.678519,-0.6809,-1.059867,0.806358,0.556501,-0.279203,0.181871,-1.214614,-0.706163,-0.76391,1.773358,-0.510172,-0.553939,1.364357,1.708662,-0.59582,1.651576,-0.623169,-0.464462,-0.610006,-0.51366,1.650164,-0.606963,-0.630055,-0.469734,-0.682679,-0.449736,1.812214,1.636666,-0.441534,1.976494,1.65069,-0.605081,-0.52191,1.375441,1.991719,1.430494,-0.609448,-0.387427


In [18]:
X = df_scaled
y = df.outcome

In [19]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
#Train
randmodel = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=1)
#Evaluate
from sklearn.model_selection import cross_val_score
CVS = cross_val_score(randmodel, X, y, scoring='accuracy', cv=5)
print("="*60, "\n Cross Validation:\n", CVS, "\nMean accuracy of cross-validation:\n", CVS.mean())

 Cross Validation:
 [0.8122669  0.82304794 0.86721598 0.74951645 0.82405372] 
Mean accuracy of cross-validation:
 0.815220197485584


In [140]:
df_test = df_act_test.merge(df_ppl, left_on='people_id', right_on='people_id', suffixes=('_act', '_ppl'))
droped_df_test = df_test.drop('activity_id', axis=1)
imputed_df_test = pd.DataFrame(imputer.transform(droped_df_test), index=droped_df_test.index, columns =droped_df_test.columns)
ordinal_df_test = pd.DataFrame(encoder.transform(imputed_df_test), index=imputed_df_test.index, columns =imputed_df_test.columns)
df_scaled_test = pd.DataFrame(scale.transform(ordinal_df_test), index=ordinal_df_test.index, columns=ordinal_df_test.columns)
X_test = df_scaled_test
X_test.head()

In [111]:
#Train
randmodel.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [112]:
#Predict
randmodel_predict = randmodel.predict(X_test)
#save the predict output
output = pd.DataFrame({'activity_id': df_test.activity_id, 'outcome': randmodel_predict})
output.to_csv('RedHat-RanForest.csv', index=False)
print("Your submission was successfully saved!")