In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the four raw CSV exports into DataFrames, in the order the names are listed.
# NOTE(review): the truncated stderr text kept under this cell looks like the tail of a
# warning raised while loading — confirm whether any column needs an explicit dtype.
churn, interactions, comments, votes = (
    pd.read_csv(csv_name)
    for csv_name in (
        'churn.csv',
        'commentInteractions.csv',
        'comments_clean_anonimized.csv',
        'votes.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Distinct company aliases found in churn, wrapped in a Series.
# The position of an alias in this Series is what the next cell uses as its short id.
companies = pd.Series(pd.unique(churn['companyAlias']))

In [4]:
# companyAlias is a long string, so swap it for the alias's position in `companies`
# (built from churn); aliases that are not in `companies` become -1.
# The alias -> position lookup is built once here instead of rebuilding a list and doing
# a linear .index() search for every row of every frame.
# NaN aliases are deliberately left out of the lookup so they map to -1
# (assumed to match the earlier `x in companies.values` test — TODO confirm).
alias_to_index = {
    alias: position
    for position, alias in enumerate(companies.values.tolist())
    if pd.notna(alias)
}
for frame in (churn, interactions, comments, votes):
    frame['companyAlias'] = frame['companyAlias'].map(
        lambda alias: alias_to_index.get(alias, -1)
    )

In [5]:
# Preview the last rows of churn after companyAlias was replaced by its integer index.
churn.tail()

Unnamed: 0,employee,companyAlias,numVotes,lastParticipationDate,stillExists
4842,857,35,1,Fri Mar 17 15:43:58 CET 2017,True
4843,858,35,1,Fri Mar 17 17:00:00 CET 2017,True
4844,859,35,1,Fri Mar 17 17:01:54 CET 2017,True
4845,17,36,7,Tue Jun 28 02:00:00 CEST 2016,True
4846,19,36,1,Mon Jul 04 19:02:03 CEST 2016,True


In [6]:
# Employee numbers are only unique within a company, so build a global id of the
# form "<companyAlias>_<employee>" in every frame.
# Gotcha: this overwrites `employee` in place, so running the cell a second time
# prepends the company prefix again.
for frame in (votes, churn, interactions, comments):
    company_part = frame['companyAlias'].astype(str)
    employee_part = frame['employee'].astype(str)
    frame['employee'] = company_part + "_" + employee_part

In [7]:
# The raw date strings carry a 'CET' / 'CEST' marker that the parse format below does
# not cover, so the marker is stripped first and the remainder is parsed.
# The zone text is simply discarded, so no timezone is attached to the parsed values.
for frame, date_col in (
    (churn, 'lastParticipationDate'),
    (comments, 'commentDate'),
    (votes, 'voteDate'),
):
    without_zone = frame[date_col].str.replace('CET','').str.replace('CEST','')
    frame[date_col] = pd.to_datetime(without_zone, format="%a %b %d %H:%M:%S %Y")

In [8]:
# Preview churn after the employee ids were prefixed and lastParticipationDate was parsed.
churn.head()

Unnamed: 0,employee,companyAlias,numVotes,lastParticipationDate,stillExists
0,0_512,0,4,2017-02-23 12:48:04,True
1,0_-2,0,0,2017-01-18 14:00:55,False
2,0_2,0,72,2017-03-17 01:00:00,True
3,0_487,0,14,2016-11-19 15:02:14,False
4,0_3,0,22,2017-02-16 01:00:00,True


In [9]:
# Preview interactions: liked / disliked flags per employee and commentId.
interactions.head()

Unnamed: 0,employee,companyAlias,liked,disliked,commentId
0,0_307,0,True,False,58d018d7e010990004e38070
1,0_36,0,True,False,58d018d7e010990004e38070
2,0_276,0,True,False,58d018d7e010990004e38070
3,0_24,0,True,False,58d018d7e010990004e38070
4,0_382,0,True,False,58d0179ae010990004e3806d


In [10]:
# Keep only rows whose companyAlias was found in `companies` (-1 marks "not found"),
# then turn the True/False liked / disliked flags into 1/0 so they can be summed.
# astype() returns a new frame, so nothing is assigned into a filtered slice
# (the previous .loc assignment on the slice is what triggers SettingWithCopyWarning).
interactions = (
    interactions[interactions.companyAlias != -1]
    .astype({'liked': int, 'disliked': int})
)

In [11]:
# How social are employees? Total likes and dislikes each employee has given,
# one row per employee.
emp_interactions_1 = (
    interactions
    .groupby('employee')
    .agg({'liked': 'sum', 'disliked': 'sum'})
    .reset_index()
    .rename(columns={'liked': 'total_liked', 'disliked': 'total_disliked'})
)
emp_interactions_1.head()

Unnamed: 0,employee,total_liked,total_disliked
0,0_-2,3,0
1,0_-4,4,2
2,0_-6,2,0
3,0_10,29,6
4,0_100,288,2


In [12]:
# Preview comments before the txt column is converted to a length.
comments.head()

Unnamed: 0,employee,companyAlias,commentId,txt,likes,dislikes,commentDate
0,0_307,0,58d018d7e010990004e38070,**********************************************...,4.0,0.0,2017-03-20 19:00:17
1,0_382,0,58d0179ae010990004e3806d,*****************************,1.0,2.0,2017-03-20 18:55:16
2,0_172,0,58cff8cde010990004e37f6a,***************************,3.0,0.0,2017-03-20 16:44:02
3,0_135,0,58cfefeee010990004e37f60,***************************,1.0,1.0,2017-03-20 16:06:08
4,0_225,0,58cfd9b4e010990004e37f52,*********************************,3.0,2.0,2017-03-20 14:30:50


In [13]:
# The txt content itself is hardly useful, but the comment length might be a feature.
# Many txt values are NaN; those comments still have likes/dislikes (they could have been
# an emoji or gif), so a missing txt is treated as an empty string, i.e. length 0.
# Rows without a commentDate are dropped first. The explicit .copy() makes the result an
# independent frame, so the column assignment below is not made on a slice.
comments = comments.dropna(subset=['commentDate']).copy()
# Built-in vectorised string length instead of a per-row Python lambda.
comments['txt'] = comments['txt'].fillna('').str.len()

In [14]:
# Preview comments again: txt now holds the comment length instead of the text.
comments.head()

Unnamed: 0,employee,companyAlias,commentId,txt,likes,dislikes,commentDate
0,0_307,0,58d018d7e010990004e38070,62,4.0,0.0,2017-03-20 19:00:17
1,0_382,0,58d0179ae010990004e3806d,29,1.0,2.0,2017-03-20 18:55:16
2,0_172,0,58cff8cde010990004e37f6a,27,3.0,0.0,2017-03-20 16:44:02
3,0_135,0,58cfefeee010990004e37f60,27,1.0,1.0,2017-03-20 16:06:08
4,0_225,0,58cfd9b4e010990004e37f52,33,3.0,2.0,2017-03-20 14:30:50


In [15]:
# How social are employees? Per employee: number of comments written, mean comment
# length, and the likes / dislikes those comments received.
comment_aggregations = {'commentId': 'count', 'txt': 'mean', 'likes': 'sum', 'dislikes': 'sum'}
comment_feature_names = {
    'commentId': 'total_comments',
    'txt': 'avg_comment_len',
    'likes': 'likes_received',
    'dislikes': 'dislikes_received',
}
emp_interactions_2 = (
    comments
    .groupby('employee')
    .agg(comment_aggregations)
    .reset_index()
    .rename(columns=comment_feature_names)
)
emp_interactions_2.head()

Unnamed: 0,employee,total_comments,avg_comment_len,likes_received,dislikes_received
0,0_-3,1,92.0,6.0,0.0
1,0_-4,22,54.181818,116.0,46.0
2,0_-5,13,202.692308,66.0,72.0
3,0_-7,3,117.333333,10.0,1.0
4,0_10,6,86.333333,16.0,20.0


In [16]:
# Combine both per-employee summaries to get all social activity of each employee.
# An outer join is used so that nobody is lost: a left join from emp_interactions_1
# silently dropped employees who wrote comments but never liked/disliked anything
# (e.g. 0_-3 appears in emp_interactions_2 but not in emp_interactions_1).
emp_social_data = pd.merge(emp_interactions_1, emp_interactions_2, on=['employee'], how='outer')
# A missing value means the employee had no activity of that kind, hence impute zero.
emp_social_data = emp_social_data.fillna(0)
# The outer join can introduce NaN into the like/dislike totals, which turns them into
# floats; restore the integer dtype they have in emp_interactions_1.
emp_social_data[['total_liked', 'total_disliked']] = (
    emp_social_data[['total_liked', 'total_disliked']].astype(int)
)

In [17]:
# Preview votes: employee, companyAlias, voteDate and vote columns.
votes.head()

Unnamed: 0,employee,companyAlias,voteDate,vote
0,0_31,0,2016-02-01 01:00:00,4
1,0_33,0,2016-02-01 01:00:00,4
2,0_79,0,2016-02-01 01:00:00,4
3,0_94,0,2016-02-01 01:00:00,4
4,0_16,0,2016-02-01 01:00:00,2


In [19]:
# How happy are employees? Per employee: mean vote and number of votes cast.
emp_votes = (
    votes
    .groupby('employee')
    .agg({'vote': 'mean', 'voteDate': 'count'})
    .rename(columns={'vote': 'avg_vote', 'voteDate': 'num_votes'})
    .reset_index()
)
# Snap the mean vote to a whole number so it can be treated as a category later.
emp_votes['avg_vote'] = emp_votes['avg_vote'].map(lambda mean_vote: int(round(mean_vote)))

In [20]:
# Preview the per-employee vote summary (rounded avg_vote and num_votes).
emp_votes.head()

Unnamed: 0,employee,avg_vote,num_votes
0,0_10,3,56
1,0_100,4,187
2,0_101,3,312
3,0_102,4,245
4,0_103,4,201


In [21]:
# Split churn by the stillExists flag: employees who remain vs. employees who left.
still_exists_flag = churn['stillExists']
emp_stillExists = churn[still_exists_flag]
emp_churn = churn[still_exists_flag.eq(False)]

In [22]:
# Attach the vote summary to every churned employee (left join keeps all of them).
# Employees without any row in emp_votes get 0 for both avg_vote and num_votes,
# so avg_vote == 0 stands for "never voted".
emp_churn_votes = (
    emp_churn
    .merge(emp_votes, on=['employee'], how='left')
    .fillna({'avg_vote': 0, 'num_votes': 0})
)

In [23]:
# Preview churned employees joined with their vote summary; 0 in avg_vote / num_votes
# marks an employee with no rows in emp_votes.
emp_churn_votes.head()

Unnamed: 0,employee,companyAlias,numVotes,lastParticipationDate,stillExists,avg_vote,num_votes
0,0_-2,0,0,2017-01-18 14:00:55,False,0.0,0.0
1,0_487,0,14,2016-11-19 15:02:14,False,3.0,14.0
2,0_-4,0,0,2016-11-07 17:41:56,False,0.0,0.0
3,0_475,0,15,2016-11-06 19:38:30,False,3.0,15.0
4,0_-6,0,0,2016-10-25 17:17:21,False,0.0,0.0


In [24]:
# Share of churned employees at each rounded average vote (0 = never voted).
churned_total = len(emp_churn_votes)
emp_churn_votes['avg_vote'].value_counts().div(churned_total)

0.0    0.365417
3.0    0.352641
2.0    0.135434
4.0    0.120102
1.0    0.026405
Name: avg_vote, dtype: float64

Of all employees who churned, about 12% had a rounded average vote of 4 (happy), and about 37% never voted (avg_vote of 0).