# People Analytics - Data Analysis

**Importing libraries and modules**

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from imblearn.over_sampling import SMOTE
from scipy.stats import loguniform

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression

# Silence every warning so library deprecation notices do not clutter cell outputs
import warnings
warnings.simplefilter('ignore')

# Show up to 100 columns when displaying a DataFrame, instead of truncating them
pd.options.display.max_columns = 100

## Loading dataset to Jupyter Notebook

In [3]:
# `people` is the training data analysed below; `people_f` is a second dataset that is
# later split into X_raw / y_raw.
# NOTE(review): the second file name starts with a dot ('.people_clean.csv') — confirm
# this hidden-file name is intentional and not a typo.
people, people_f = (
    pd.read_csv(csv_path)
    for csv_path in ('../raw_data/people_train.csv', '../raw_data/.people_clean.csv')
)

In [4]:
# Preview the first rows to sanity-check the loaded columns and their scaled/encoded values
people.head()

Unnamed: 0,employee_id,age,attrition,daily_rate,distance_from_home,education,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_satisfaction,monthly_income,monthly_rate,num_companies_worked,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager,business_travel_Non-Travel,business_travel_Travel_Frequently,business_travel_Travel_Rarely,department_Human Resources,department_Research & Development,department_Sales,education_field_Human Resources,education_field_Life Sciences,education_field_Marketing,education_field_Medical,education_field_Other,education_field_Technical Degree,job_role_Healthcare Representative,job_role_Human Resources,job_role_Laboratory Technician,job_role_Manager,job_role_Manufacturing Director,job_role_Research Director,job_role_Research Scientist,job_role_Sales Executive,job_role_Sales Representative,marital_status_Divorced,marital_status_Married,marital_status_Single
0,456,-0.5,0,-0.340456,-0.166667,3,4,1,-0.972222,2,2,3,0.240007,-0.694698,-0.333333,0,-0.166667,3,3,3,0.111111,-1.0,3,0.833333,1.2,0.0,-0.2,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,485,-0.25,1,-0.653846,-0.166667,3,4,1,-0.861111,3,1,3,-0.359003,-0.394561,-0.333333,1,-0.166667,3,2,0,-0.888889,-1.0,3,-0.666667,-0.6,-0.333333,-0.6,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
2,1049,0.75,0,-0.029915,1.416667,4,2,0,-0.805556,3,1,2,-0.482582,-0.461706,-0.333333,0,0.333333,3,1,0,-0.333333,0.0,3,0.166667,0.0,-0.333333,0.2,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,1102,-0.666667,0,0.122507,0.666667,2,1,0,-0.416667,3,1,3,-0.477081,0.681382,-0.333333,0,0.333333,3,4,1,-0.555556,2.0,2,-0.166667,-0.2,0.333333,-0.2,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
4,806,-0.5,0,0.159544,-0.5,3,4,0,-0.75,1,1,3,-0.178951,0.834154,0.333333,0,1.166667,4,3,2,-0.222222,-1.0,3,-0.5,-0.2,-0.333333,-0.2,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


## Dataprep

1. <s>Missing Data</s> (Already done in data cleaning)
2. <s>Scaling</s> (Already done in data cleaning)
3. <s>Outliers</s> (Already treated in data cleaning)
4. <s>Encoding</s> (Already treated in data cleaning)
5. Balancing
6. Feature engineering
7. Feature selection

### Target balancing

**If the dataset's target variable is unbalanced (disproportionate occurrences of 0's and 1's), we should consider balancing it with an appropriate technique.**

In [10]:
# Separate the target (y = 'attrition') from the predictors (X = every other column)
# NOTE(review): X still contains 'employee_id' — presumably a row identifier with no
# predictive meaning; confirm whether it should be dropped before resampling/modelling.
y = people['attrition']
X = people.drop(columns = 'attrition')

In [8]:
# Confirm that predictors and target have the same number of rows
X.shape, y.shape

((1029, 50), (1029,))

In [9]:
# Count how many rows fall in each target class, listed in ascending class order
class_counts = Counter(y)
print(sorted(class_counts.items()))

[(0, 862), (1, 167)]


As expected, the dataset is unbalanced. We should apply an oversampling technique.

**Performing the SMOTE oversampling technique**

In [13]:
# Oversample the minority class with SMOTE; the '_res' suffix marks the resampled data.
# random_state is fixed so the synthetic samples are reproducible.
# NOTE(review): the whole dataset is resampled here, before any train/test split is
# visible — confirm that evaluation data is held out before this step.
smote = SMOTE(random_state = 0)
X_res, y_res = smote.fit_resample(X, y)

In [14]:
# Shape of the resampled predictors and target (rows should have grown, columns unchanged)
X_res.shape, y_res.shape

((1724, 50), (1724,))

In [15]:
# Class distribution of the target after SMOTE, listed in ascending class order
class_counts_res = Counter(y_res)
print(sorted(class_counts_res.items()))

[(0, 862), (1, 862)]


Now we have the same number of `0`'s and `1`'s for `attrition` (referred to here as `y`). We can move to the next task.

In [None]:
# Pairwise correlation of every column in `people`, drawn as an annotated heatmap
# on a large square canvas so the cell annotations stay legible
plt.figure(figsize = (15, 15))
corr = people.corr()
sns.heatmap(data = corr, annot = True)

In [None]:
# Dimensions of the full dataset (rows, columns), for comparison with the matrix rank computed next
people.shape

In [None]:
# A rank lower than the number of columns indicates linearly dependent columns
rank = np.linalg.matrix_rank(people)
print('rank(A):', rank)

In [None]:
# Checking VIF (variance inflation factor) for every column of `people`.
# The feature labels are taken from the same frame the VIFs are computed on, so both
# columns always have the same length and this cell does not depend on `feats`,
# which is only defined in a later cell.
# NOTE(review): `people` still contains the target column 'attrition' here — confirm
# whether it should be excluded from the collinearity check.
df = pd.DataFrame({
    'vif_index': [vif(people.values, i) for i in range(people.shape[1])],
    'features': people.columns,
})

In [None]:
# Columns kept for the VIF check and the model below.
# Entries that are commented out are deliberately excluded from the selection;
# they are left in place so they can be re-enabled while iterating.
# NOTE(review): 'employee_id' is kept in the selection — presumably an identifier;
# confirm it is meant to be used as a feature.
feats = [
 'employee_id',
 'age',
#  'attrition',
 'daily_rate',
 'distance_from_home',
 'education',
 'environment_satisfaction',
#  'gender',
 'hourly_rate',
 'job_involvement',
#  'job_level',
#  'job_satisfaction',
#  'monthly_income',
 'monthly_rate',
 'num_companies_worked',
 'over_time',
 'percent_salary_hike',
#  'performance_rating',
 'relationship_satisfaction',
 'stock_option_level',
#  'total_working_years',
 'training_times_last_year',
 'work_life_balance',
 'years_at_company',
 'years_in_current_role',
 'years_since_last_promotion',
 'years_with_curr_manager',
 'business_travel_Non-Travel',
 'business_travel_Travel_Frequently',
 'business_travel_Travel_Rarely',
 'department_Human Resources',
 'department_Research & Development',
 'department_Sales',
 'education_field_Human Resources',
 'education_field_Life Sciences',
 'education_field_Marketing',
 'education_field_Medical',
 'education_field_Other',
 'education_field_Technical Degree',
 'job_role_Healthcare Representative',
 'job_role_Human Resources',
 'job_role_Laboratory Technician',
 'job_role_Manager',
 'job_role_Manufacturing Director',
 'job_role_Research Director',
 'job_role_Research Scientist',
 'job_role_Sales Executive',
 'job_role_Sales Representative',
 'marital_status_Divorced',
 'marital_status_Married',
 'marital_status_Single'
]

In [None]:
# VIF restricted to the columns listed in `feats`; one row per selected feature
people_feats = people[feats]
df = pd.DataFrame({
    'vif_index': [
        vif(people_feats.values, col_idx)
        for col_idx in range(people_feats.shape[1])
    ],
    'features': people_feats.columns,
})

In [None]:
# Display the VIF table computed for the selected features
df

In [None]:
from sklearn.feature_selection import chi2

In [None]:
# Log-uniform distribution over [0.01, 1]
# NOTE(review): not used in the cells shown here — presumably intended as a
# hyper-parameter sampling distribution for RandomizedSearchCV; confirm.
dist = loguniform(1e-2, 1)

In [None]:
from sklearn.svm import SVC

In [None]:
# Fit a support-vector classifier with its default settings on the SMOTE-resampled
# predictors, restricted to the columns listed in `feats`
svclass_model = SVC()
svclass_model.fit(X_res[feats], y_res)

In [None]:
# Inspect the resampled predictors restricted to the columns in `feats`
X_res[feats]

In [None]:
# Score on the same resampled data the model was fitted on: this is a training
# score, not an estimate of performance on unseen data
svclass_model.score(X_res[feats], y_res)

In [None]:
# Same predictor/target split as for `people`, applied to the `people_f` dataset
y_raw = people_f['attrition']
X_raw = people_f.drop(columns = 'attrition')