# Content

<div class="alert alert-block alert-warning" style="margin-top: 20px">
    <ol>
        <li><a href="#ingenieria">Data Engineering</a></li>          
        <li><a href="#architecture">Proposed architecture</a></li>
        <li><a href="#evaluaion">Model evaluation</a></li>
    </ol>
</div>
<br>
<hr> 

In [None]:
# Import the libraries

import pandas as pd
import numpy as np
import io 

In [None]:
# Load the people table and preview its first rows

people_csv = '../input/predicting-red-hat-business-value/people.csv.zip'
people = pd.read_csv(people_csv)  # comma is read_csv's default separator
people.head()

# We see a lot of categorical data
# The information is anonymized

In [None]:
# Load the activity training table and preview its first rows
activity_csv = '../input/predicting-red-hat-business-value/act_train.csv.zip'
activity = pd.read_csv(activity_csv)  # comma is read_csv's default separator
activity.head()

# <h1 id="ingenieria">Data Engineering</h1>

In [None]:
# Dimensions (rows, columns) of the people table
print(people.shape)

# Percentage of missing values in each column
people.isna().sum().mul(100).div(len(people))

In [None]:
# Repeat the same checks for the activity table:
# dimensions first, then the percentage of missing values per column

print(activity.shape)
activity.isna().sum().mul(100).div(len(activity))

# We are going to drop the columns that are about 90% null and fill the missing values of the remaining column (char_10)

In [None]:
# Drop the activity columns char_1 ... char_9 in place, then confirm the new shape
sparse_columns = [f"char_{i}" for i in range(1, 10)]
activity.drop(columns=sparse_columns, inplace=True)

print(activity.shape)

In [None]:
# Preview the activity table after char_1 ... char_9 were dropped
activity.head() 

In [None]:
# Replace the missing char_10 values with the column's most frequent value

most_common_char_10 = activity["char_10"].mode()[0]
activity["char_10"] = activity["char_10"].fillna(most_common_char_10)

In [None]:
# Re-check the percentage of missing values per column after the clean-up

activity.isna().sum().mul(100).div(len(activity))

In [None]:
# Give the activity date and char_10 columns activity-specific names

new_names = {"date": "data_activity", "char_10": "activity_type"}
activity = activity.rename(columns=new_names)
activity.head()

In [None]:
# Inner-join the activities with the people table on people_id

all_data = pd.merge(activity, people, how="inner", on="people_id")
all_data.shape

In [None]:
# Show the target column ("outcome") of the merged frame

all_data["outcome"]

In [None]:
# Distribution of the target: share of each outcome value as a percentage of all rows

all_data["outcome"].value_counts().mul(100).div(len(all_data))

In [None]:
# Count how many columns there are of each dtype.
# `types` is a one-column frame (column label 0) indexed by column name.

types = pd.DataFrame({0: all_data.dtypes})
print("Types of variables: ", types.groupby(0).size())

# We have to convert the boolean columns (True/False) to int (1/0)

In [None]:
# Turn every boolean column into 0/1 integers.
# An explicit cast is used instead of a whole-frame replace({False: 0, True: 1}):
# it only touches the boolean columns and does not depend on replace() silently
# downcasting the result dtype, which recent pandas versions deprecate.
bool_columns = [
    name for name, dtype in all_data.dtypes.items()
    if pd.api.types.is_bool_dtype(dtype)
]
all_data = all_data.astype({name: "int64" for name in bool_columns})

In [None]:
# Re-count the columns per dtype after the True/False replacement

types = pd.DataFrame({0: all_data.dtypes})
print("Types of variables replace: ", types.groupby(0).size())

In [None]:
# Second conversion: people_id is the prefix "ppl_" followed by a number,
# so dropping the first four characters leaves a value that can be cast to a number.
# NOTE(review): the intermediate float cast presumably exists for ids written in
# float notation — confirm against the raw data.

people_number = all_data["people_id"].str[4:]
all_data["people_id"] = people_number.astype(float).astype(int)

types = pd.DataFrame({0: all_data.dtypes})
print("Second replace: ",types.groupby(0).size())

In [None]:
# Look at the first rows of the columns that are converted to integers in the next cell
all_data[["activity_id", "activity_category", "group_1", "activity_type"]].head(3)

In [None]:
# Same treatment for the other prefixed columns: discard the leading characters
# and cast what is left to an integer (via float, as done for people_id).
# Each entry maps a column to the number of leading characters to discard.
# NOTE(review): verify that the discarded prefix carries no information of its own.

prefix_lengths = {
    "activity_id": 5,
    "activity_category": 5,
    "group_1": 6,
    "activity_type": 5,
}
for id_column, n_chars in prefix_lengths.items():
    all_data[id_column] = all_data[id_column].str[n_chars:].astype(float).astype(int)

types = pd.DataFrame({0: all_data.dtypes})
print("Thith",types.groupby(0).size())

In [None]:
# Preview the merged frame after the prefixed columns were converted to integers
all_data.head()

In [None]:
# Count the distinct values of every column recorded as object dtype in `types`

categorics = types.index[types[0] == 'O'].values
for col_name in categorics:
    n_distinct = all_data[col_name].nunique(dropna=False)
    print("The variable "+ col_name +"contine: ", str(n_distinct)+" distinct values")

In [None]:
# Preview the merged frame again before working on the date columns
all_data.head()

In [None]:
# Inspect the raw "date" column before it is parsed in the next cell
all_data.date

In [None]:
# Create seasonal (calendar) variables from the "date" column

# Convert the object column to datetime
all_data["date"] = pd.to_datetime(all_data["date"])

# Create the new variables.
# - Series.dt.week was removed in pandas 2.0; isocalendar().week is its replacement
#   and yields the same ISO week number.
# - Every part is cast to int64 explicitly because the feature-selection cell later
#   keeps only int64 columns, while newer pandas returns int32 (and UInt32 for the
#   ISO week) from the .dt accessors.
# - The casts raise if "date" contains missing values, instead of silently producing
#   float columns that the int64 filter would drop.
date_parts = all_data["date"].dt
all_data["day"] = date_parts.day.astype("int64")
all_data["day_of_week"] = date_parts.weekday.astype("int64")
all_data["week"] = date_parts.isocalendar().week.astype("int64")
all_data["month"] = date_parts.month.astype("int64")
all_data["trimester"] = date_parts.quarter.astype("int64")
all_data["year"] = date_parts.year.astype("int64")

In [None]:
# Preview the frame with the new calendar columns derived from "date"
all_data.head()

In [None]:
# Repeat the same calendar decomposition for the "data_activity" column

all_data["data_activity"] = pd.to_datetime(all_data["data_activity"])

# Series.dt.week was removed in pandas 2.0, so the ISO week comes from isocalendar().
# Each part is cast to int64 because the feature-selection cell later keeps only
# int64 columns, and newer pandas returns narrower integer dtypes from .dt.
# The casts raise if the column contains missing values.
activity_date_parts = all_data["data_activity"].dt
all_data["activity_day"] = activity_date_parts.day.astype("int64")
all_data["activity_day_of_week"] = activity_date_parts.weekday.astype("int64")
all_data["activity_week"] = activity_date_parts.isocalendar().week.astype("int64")
all_data["activity_month"] = activity_date_parts.month.astype("int64")
all_data["activity_trimester"] = activity_date_parts.quarter.astype("int64")
all_data["activity_year"] = activity_date_parts.year.astype("int64")

In [None]:
# Remove the two original datetime columns in place, then re-count the dtypes

all_data.drop(columns=["date", "data_activity"], inplace=True)

types = pd.DataFrame({0: all_data.dtypes})
print("Types of variables later of 4to remplace",types.groupby(0).size())

In [None]:
# Preview the frame after the original date columns were removed
all_data.head()

In [None]:
# List the dtype of every column of the merged frame
all_data.dtypes

In [None]:
# Count again the distinct values of every column recorded as object dtype in `types`

categorics = types.index[types[0] == 'O'].values
for col_name in categorics:
    n_distinct = all_data[col_name].nunique(dropna=False)
    print("The variable "+ col_name +"contine: ", str(n_distinct)+" distinct values")

In [None]:
# We gonna use one hot encoder for the rest of variables 

from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Return a DataFrame holding the one-hot encoding of one column of `df`
def crea_OneHotEncoding(df, column):
  """One-hot encode a single column of a DataFrame.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame that contains `column`; it is not modified.
  column : str
      Name of the column to encode.

  Returns
  -------
  pandas.DataFrame
      One float column per distinct value of ``df[column]``, named
      ``<column>_<value>`` and ordered by sorted value, holding 1.0 where the
      row has that value and 0.0 elsewhere. The rows keep the index of `df`,
      so the result lines up with `df` in ``pd.concat(..., axis=1)``.
  """
  # pd.get_dummies replaces the former LabelEncoder + OneHotEncoder(sparse=False)
  # pair: it yields the same column names and values, does not depend on the
  # `sparse` keyword (renamed to `sparse_output` in newer scikit-learn releases),
  # and preserves the row index instead of resetting it to 0..n-1.
  return pd.get_dummies(df[column], prefix=column, prefix_sep="_", dtype=float)

In [None]:
# Split the merged frame into the target and the feature matrix.
objetive = all_data["outcome"]

# Classify the feature columns by their *current* dtype instead of the `types`
# snapshot taken in an earlier cell. Iterating the dtypes in frame order also
# keeps the feature columns in a stable order (a set difference has none), and
# accepting any numeric dtype avoids losing int32/float columns.
feature_dtypes = all_data.dtypes.drop("outcome")
is_number = feature_dtypes.map(
    lambda dtype: pd.api.types.is_numeric_dtype(dtype)
    and not pd.api.types.is_bool_dtype(dtype)
)
is_text = feature_dtypes.map(pd.api.types.is_object_dtype)
numeric_columns = feature_dtypes.index[is_number].tolist()
categories = feature_dtypes.index[is_text].values

# One-hot encode every object column, collecting the pieces so that the
# (large) frames are concatenated once instead of on every iteration.
encoded_frames = []
for column in categories:
  df = crea_OneHotEncoding(all_data,column)
  encoded_frames.append(df)
  print("Column ",column, " tranform!")

all_data_finish = pd.concat([all_data[numeric_columns]] + encoded_frames, axis=1)

print("Finish size:",all_data_finish.shape)

In [None]:
# Inspect the dtypes of the first 40 columns of the feature matrix
all_data_finish.dtypes.head(40)

In [None]:
# NOTE(review): disabled leftover experiment (manual float32 cast of one column);
# this cell executes nothing and can be deleted.
#all_data_finish['char_13'] = np.asarray(all_data_finish['char_13']).astype(np.float32)

In [None]:
# Removed: a disabled experiment that cast `all_data_finish` columns to float32
# one statement per column. It was kept as a bare triple-quoted string, which the
# notebook echoed back as the cell's output, and it listed 'char_14' twice.
# If a float32 cast is ever needed, one call covers every column:
#     all_data_finish = all_data_finish.astype(np.float32)

In [None]:
# Removed: disabled per-column float32 casts for 'char_29', 'char_11' and 'group_1'.
# They were kept as a bare triple-quoted string, which the notebook echoed back as
# the cell's output without executing anything.

In [None]:
# Inspect again the dtypes of the first 40 columns of the feature matrix
all_data_finish.dtypes.head(40)

In [None]:
# Preview the first values of the target series
objetive.head()

In [None]:
# Dtypes of the first columns of the feature matrix
all_data_finish.dtypes.head()

In [None]:
from sklearn.model_selection import train_test_split

# The same seed is used for both splits so the partition is reproducible
split_seed = 2020

# Hold out 20% of the rows as the test set
x_train, x_test, y_train, y_test = train_test_split(
    all_data_finish, objetive, test_size=0.2, random_state=split_seed
)

# Take 10% of the remaining training rows as the validation set
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=split_seed
)

In [None]:
# Report the shape of every partition, features first and then targets
partitions = (
    ("x_train", x_train),
    ("x_test", x_test),
    ("x_val", x_val),
    ("y_train", y_train),
    ("y_test", y_test),
    ("y_val", y_val),
)
for label, part in partitions:
    print(f"Shape of {label}:", part.shape)

# <h1 id="architecture">Proposed Architecture</h1>

In [None]:
# Binary classifier: binary cross-entropy as the loss function, sigmoid as the
# activation function of the output layer, and accuracy as the evaluation metric.

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import plot_model

# Create the neural network: two hidden ReLU layers of 256 units each and a
# single sigmoid output unit. The input width is the number of feature columns.
# NOTE(review): none of the visible cells scales the inputs, so id-like columns
# such as people_id and activity_id reach the network at raw magnitude —
# consider standardizing the features before training.
model = Sequential()
model.add(Dense(256,input_dim = x_train.shape[1],activation="relu"))
model.add(Dense(256,activation="relu"))
model.add(Dense(1,activation = "sigmoid"))
model.compile(optimizer = "Adam",loss="binary_crossentropy",metrics=["accuracy"])

# summary() prints the table itself and returns None, so wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

In [None]:
# Draw the architecture (with layer shapes) and save it to model.png
plot_model(model, to_file='model.png',show_shapes=True)

In [None]:
# Train for 5 epochs in mini-batches of 128 rows, evaluating on the validation set
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=128,
)

In [None]:
# Deeper neural network: three hidden ReLU layers (512, 512 and 256 units)
# followed by a single sigmoid output unit. This rebinds `model`, replacing
# the network trained in the previous cells.
model = Sequential([
    Dense(512, input_dim=x_train.shape[1], activation="relu"),
    Dense(512, activation="relu"),
    Dense(256, activation="relu"),
    Dense(1, activation="sigmoid"),
])
model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train for 3 epochs in mini-batches of 64 rows, evaluating on the validation set
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=3,
    batch_size=64,
)