# Description
## Employee Attrition Rate

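##### Predict each employee's attrition rate (a value between 0 and 1) from the features in `Train.csv`, and write predictions for `Test.csv`.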

## Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import matplotlib.gridspec as gridspec
from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import matplotlib.style as style
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import missingno as msno

import os
print(os.listdir("../input"))

import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
import pandas as pd
import numpy as np
import scipy

import warnings
warnings.filterwarnings('ignore')

import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy import stats
import matplotlib.style as style

## Configuration Files

In [None]:
# Directory that holds the competition CSV files.
base_path = '../input/employee-atrition-rate/Dataset'
train = pd.read_csv(os.path.join(base_path, 'Train.csv'))
test = pd.read_csv(os.path.join(base_path, 'Test.csv'))
# Keep an independent snapshot of the raw test set: `test` is modified in place
# later on (Employee_ID is dropped), and a plain alias would change with it.
test_for_all = test.copy()
print(f"Total number of train {len(train)} and test is {len(test)}")

## Utility Functions

In [None]:
def missing_percentage(df):
    """Summarise the missing values of each column.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with a 'Total' column (number of
        nulls) and a 'Percent' column (share of rows, rounded to 2 decimals).
        Only columns with at least one null are listed, sorted by the number
        of nulls in descending order.
    """
    # Count the nulls once and reuse the result for both output columns.
    null_counts = df.isnull().sum().sort_values(ascending=False)
    total = null_counts[null_counts != 0]
    # Derive the percentage from the filtered counts so both columns share the
    # same index; a tiny share then shows as 0.0 instead of a NaN gap.
    percent = (total / len(df) * 100).round(2)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

def submit(model, filename, npy = False):
    """Write a submission CSV with the model's predictions for the test set.

    Employee IDs are read from Test.csv under the module-level ``base_path``
    and predictions are made on the module-level ``X_sub``. Predictions are
    passed through ``np.expm1`` to undo the ``np.log1p`` applied to the
    training target.

    Args:
        model: Fitted estimator exposing ``predict``.
        filename: Path of the CSV file to write.
        npy: When true, ``X_sub`` is converted to a NumPy array before
            predicting (for models that were fitted on arrays).
    """
    submission = pd.read_csv(os.path.join(base_path, 'Test.csv'))
    features = np.array(X_sub) if npy else X_sub
    preds = np.expm1(model.predict(features))
    # Built inline so the builtin `dict` is not shadowed.
    sub = pd.DataFrame({
        "Employee_ID": submission['Employee_ID'].tolist(),
        "Attrition_rate": preds,
    })
    sub.to_csv(filename, index=False)

def overfit_reducer(df, threshold=99.94):
    """Return the columns that are dominated by a single value.

    A column is reported when its most frequent non-null value accounts for
    more than ``threshold`` percent of all rows.

    Args:
        df: DataFrame to inspect.
        threshold: Percentage of rows above which a column is reported.

    Returns:
        List of column labels, in the column order of ``df``.
    """
    n_rows = len(df)
    overfit = []
    for column in df.columns:
        counts = df[column].value_counts()
        # value_counts() ignores NaN, so an all-NaN column (or an empty frame)
        # has nothing to inspect; skip it instead of raising IndexError.
        if counts.empty:
            continue
        # value_counts() sorts descending, so the first entry is the mode.
        if counts.iloc[0] / n_rows * 100 > threshold:
            overfit.append(column)
    return overfit

def plotting_3_chart(df, feature):
    """Draw a histogram, a Q-Q plot and a box plot of one column.

    Args:
        df: DataFrame holding the data.
        feature: Label of the column to plot.
    """
    style.use('fivethirtyeight')

    # One 12x8 figure on a 3x3 grid: the two left columns hold the histogram
    # (row 0) and the Q-Q plot (row 1); the right column holds the box plot.
    fig = plt.figure(constrained_layout=True, figsize=(12, 8))
    layout = gridspec.GridSpec(ncols=3, nrows=3, figure=fig)

    hist_ax = fig.add_subplot(layout[0, :2])
    hist_ax.set_title('Histogram')
    values = df.loc[:, feature]
    sns.distplot(values, norm_hist=True, ax=hist_ax)

    qq_ax = fig.add_subplot(layout[1, :2])
    qq_ax.set_title('QQ_plot')
    stats.probplot(values, plot=qq_ax)

    box_ax = fig.add_subplot(layout[:, 2])
    box_ax.set_title('Box Plot')
    sns.boxplot(values, orient='v', ax=box_ax)

def customized_scatterplot(y, x, c):
    """Scatter ``y`` against ``x`` on a new 12x8 figure titled ``c``."""
    style.use('fivethirtyeight')
    plt.subplots(figsize=(12, 8))
    plt.title(c)
    sns.scatterplot(x=x, y=y)

def fixing_skewness(df, threshold=0.5):
    """Box-Cox transform the highly skewed non-object columns of ``df``.

    Every non-object column whose absolute skewness exceeds ``threshold`` is
    replaced by ``boxcox1p(col, lmbda)``, with ``lmbda`` estimated by
    ``boxcox_normmax(col + 1)``.

    Args:
        df: DataFrame to transform. It is modified in place.
        threshold: Absolute skewness above which a column is transformed.

    Returns:
        The same ``df`` object, so the result can also be used as a value
        (the original docstring promised a return value but returned None).
    """
    from scipy.stats import skew
    from scipy.special import boxcox1p
    from scipy.stats import boxcox_normmax

    numeric_feats = df.dtypes[df.dtypes != "object"].index
    skewness = df[numeric_feats].apply(skew)
    # A NaN skewness compares False here, so such columns are left untouched.
    skewed_features = skewness[skewness.abs() > threshold].index

    for feat in skewed_features:
        df[feat] = boxcox1p(df[feat], boxcox_normmax(df[feat] + 1))
    return df

## Data Exploration

In [None]:
train.head(5)

In [None]:
train.describe().T

In [None]:
# Split the column names by dtype: object columns versus everything else.
s = train.dtypes.eq('object')
n = train.dtypes.ne('object')
object_cols = s[s].index.tolist()
num_cols = n[n].index.tolist()
print(f"Number of object cols {len(object_cols)} and number of numerical cols {len(num_cols)}")
train.info()

In [None]:
msno.matrix(train)  ## Missing Values

In [None]:
msno.matrix(test)

In [None]:
"train: ", missing_percentage(train), "test: ",missing_percentage(test) 

## Observations
* There are multiple types of features
* There are missing values
* There are anonymised variables (the `VAR*` columns) whose meaning is not revealed
* The target variable lies between 0 and 1
* There are 7 numerical and 17 object columns. Most of the numerical columns are class-like (discrete) columns

## Target Variable Exploration

In [None]:
plotting_3_chart(train, 'Attrition_rate')

### Observations
* Our target is not at all normally distributed.
* Target data is right skewed.
* The target contains outliers; most of the data lies between 0.1 and 0.22.

In [None]:
print(f"Skewness  is {train['Attrition_rate'].skew()}")
print(f"Kurtosis  is {train['Attrition_rate'].kurt()}")

### Observation
* Our target is right (positively) skewed, with a skewness of 2.056875960544357.
    This means that the mode is less than the mean and the median, i.e. more employees have a below-average attrition rate than an above-average one.
### Kurtosis is a measure of the outliers present in the distribution
* Our data is leptokurtic since its kurtosis is greater than 3, which means the data is heavy-tailed, i.e. has a profusion of outliers. (Note that pandas' `kurt()` reports excess kurtosis, for which a normal distribution scores 0 rather than 3.)
Check out this article on [kurtosis](https://codeburst.io/2-important-statistics-terms-you-need-to-know-in-data-science-skewness-and-kurtosis-388fef94eeaa)

Checking the correlation between the target variable and the rest of the features

In [None]:
# Squared Pearson correlation of every numeric feature with the target.
# Only numeric columns are used: DataFrame.corr() raises on object columns in
# pandas >= 2.0. The target's own entry is dropped by label, not by position.
numeric_corr = train.select_dtypes(include='number').corr()
(numeric_corr ** 2)['Attrition_rate'].drop('Attrition_rate').sort_values(ascending=False)

#### Observation
* There is not much correlation between the target variable and any of the features

##### Let's check the most correlated features with scatter plots

In [None]:
# Features to plot against the target, one scatter plot per feature.
cols = [
    'VAR2', 'Work_Life_balance', 'Time_of_service', 'Post_Level', 'Age',
    'VAR7', 'Pay_Scale', 'growth_rate', 'Time_since_promotion',
    'VAR4', 'Travel_Rate',
]
for c in cols:
    customized_scatterplot(y=train.Attrition_rate, x=train[c], c=c)


##### Some observations
The graphs don't give us much insight; none of them shows a linear relationship.
* VAR2 > 1.5 has no data above an attrition rate of 0.95
* A Work Life balance value of 5 has very few data points above an attrition rate of 0.5.
* When time of service is greater than 40 there is no attrition rate above 0.8   --------------> Can be used to tweak the output
* When Pay Scale is 10, only a few data points have an attrition rate above 0.8

### Machine learning perspective
* I was not able to find any linear relationship here that can be utilised.
* In the categorical features the data is also almost uniformly distributed, so they are not of much use.

## Deleting outliers
There are not many outliers in this case, so we will revisit this after the first modelling round is complete.

In [None]:
# ## save a copy of this dataset so that any changes later on can be compared side by side.
# previous_train = train.copy()
# ## Deleting those two values with outliers. 
# train = train[(train.Work_Life_balance == 5) & (train.Attrition_rate >= 0.8)]
# train = train[(train.Time_of_service >= 40) & (train.Attrition_rate >= 0.6)]
# train = train[(train.Pay_Scale == 1) & (train.Pay_Scale == 10) & (train.Attrition_rate >= 0.8)]
# train = train[(train.Time_since_promotion==0) & (train.Attrition_rate >= 0.9)]
# train.reset_index(drop = True, inplace = True)

### Checking the assumptions of Multiple Linear Regression

Let's check linearity with the most correlated features
> **VAR2, Work_Life_balance, Time_of_service, Post_Level, Age**

In [None]:
# ## Plot sizing. 
# fig, (ax1, ax2) = plt.subplots(figsize = (12,8), ncols=2,sharey=False)
# ## Scatter plotting for SalePrice and GrLivArea. 
# sns.scatterplot( x = train.VAR2, y = train.Attrition_rate,  ax=ax1)
# ## Putting a regression line. 
# sns.regplot(x=train.VAR2, y=train.Attrition_rate, ax=ax1)

# ## Scatter plotting for SalePrice and MasVnrArea. 
# sns.scatterplot(x = train.Work_Life_balance,y = train.Attrition_rate, ax=ax2)
# ## regression line for MasVnrArea and SalePrice. 
# sns.regplot(x=train.Work_Life_balance, y=train.Attrition_rate, ax=ax2);

In [None]:
# plt.subplots(figsize = (12,8))
# sns.residplot(train.VAR2, train.Attrition_rate);

In [None]:
# ## Plot sizing. 
# fig, (ax1, ax2) = plt.subplots(figsize = (12,8), ncols=2,sharey=False)
# ## Scatter plotting for SalePrice and GrLivArea. 
# sns.scatterplot( x = train.Age, y = train.Attrition_rate,  ax=ax1)
# ## Putting a regression line. 
# sns.regplot(x=train.Age, y=train.Attrition_rate, ax=ax1)

# ## Scatter plotting for SalePrice and MasVnrArea. 
# sns.scatterplot(x = train.Time_of_service,y = train.Attrition_rate, ax=ax2)
# ## regression line for MasVnrArea and SalePrice. 
# sns.regplot(x=train.Time_of_service, y=train.Attrition_rate, ax=ax2);

In [None]:
# plt.subplots(figsize = (12,8))
# sns.residplot(train.Age, train.Attrition_rate);

### Observation
Linearity is very poor, so treating this as a linear problem leads to large errors.
Homoscedasticity (constant variance): we can say the variance is constant across our independent variables.
> When that is not the case (heteroscedasticity), one way to fix it is a transformation such as a log transformation or a Box-Cox transformation.

Multivariate normality (normality of errors): linear regression assumes that the errors (residuals) are normally distributed. A histogram, box plot, or Q-Q plot can check whether the target variable is normally distributed.

In [None]:
plotting_3_chart(train, 'Attrition_rate')

## Transforming data to be more normalized



In [None]:
# log1p transform of the target; predictions are mapped back with np.expm1
# when a submission is written.
# NOTE: the column is overwritten in place, so re-running this cell applies
# the transform a second time.
train["Attrition_rate"] = np.log1p(train["Attrition_rate"])

## Plotting the newly transformed response variable
plotting_3_chart(train, 'Attrition_rate')

* No or Little multicollinearity: **Multicollinearity** is when there is a strong correlation between independent variables

In [None]:
## Plot fig sizing. 
## Plot fig sizing.
style.use('ggplot')
sns.set_style('whitegrid')
plt.subplots(figsize = (30,20))

# Correlation is only defined for numeric columns; compute the matrix once
# and reuse it for both the mask and the heatmap.
corr_matrix = train.select_dtypes(include='number').corr()

# Mask the upper triangle (taken from the seaborn example gallery).
# The builtin `bool` is used because the `np.bool` alias no longer exists
# in current NumPy releases.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True

## Plotting heatmap.
sns.heatmap(corr_matrix,
            cmap=sns.diverging_palette(20, 220, n=200),
            mask = mask,
            annot=True,
            center = 0,
           );
## Give title.
plt.title("Heatmap of all the Features", fontsize = 30);

### Multicollinearity is present in only a few features, e.g. Age and Time of service.

### Inference
If I were using only multiple linear regression, I would be deleting these features from the dataset to fit better multiple linear regression algorithms. However, we will be using many algorithms as scikit learn modules makes it easy to implement them and get the best possible outcome. Therefore, we will keep all the features for now.

## Feature Engineering

In [None]:
# Drop the Employee_ID column from both frames in place.
for frame in (train, test):
    frame.drop(columns=['Employee_ID'], inplace=True)

# Target values, re-indexed from 0.
y = train['Attrition_rate'].reset_index(drop=True)

# Snapshot of train at this point.
previous_train = train.copy()

In [None]:
### Do some feature Engineering - I don't see much data for feature engineering. Let's use it later if needed

In [None]:
# Stack train on top of test with a fresh 0..n-1 index, then remove the
# target column so only features remain.
all_data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .drop(columns=['Attrition_rate'])
)

## Dealing with missing values

In [None]:
missing_percentage(all_data)

In [None]:
# Fill missing values with means computed over the combined train+test frame.
# Time_of_service: mean of the rows sharing the same Post_Level.
all_data['Time_of_service'] = all_data.groupby('Post_Level')['Time_of_service'].transform( lambda x: x.fillna(x.mean()))
# Age: mean of the rows sharing the same (already imputed) Time_of_service.
all_data['Age'] = all_data.groupby('Time_of_service')['Age'].transform( lambda x: x.fillna(x.mean()))
# Pay_Scale: mean of the rows sharing the same Post_Level.
all_data['Pay_Scale'] = all_data.groupby('Post_Level')['Pay_Scale'].transform( lambda x: x.fillna(x.mean()))
# NOTE(review): a group whose values are all missing has a NaN mean and stays
# unfilled, and rows whose grouping key is itself missing may come back as
# NaN - confirm with missing_percentage(all_data).
# The remaining columns use the overall column mean.
for i in ['VAR4', 'VAR2', 'Work_Life_balance']:
    all_data[i] = all_data[i].fillna(all_data[i].mean())

In [None]:
### If you want you can convert some numerical to string

In [None]:
missing_percentage(all_data)

## Fixing Skewness

In [None]:
# Skewness of every non-object column, largest first.
column_dtypes = all_data.dtypes
numeric_feats = column_dtypes[column_dtypes != "object"].index
skewed_feats = all_data[numeric_feats].apply(skew).sort_values(ascending=False)

skewed_feats

In [None]:
# Distributions of Age (red) and Time_of_service (blue) on the same axes.
for column, colour in (('Age', 'red'), ('Time_of_service', 'blue')):
    sns.distplot(all_data[column], color=colour)

In [None]:
# Transform the skewed columns of all_data in place, then redraw the same two
# distributions for comparison.
fixing_skewness(all_data)
for column, colour in (('Age', 'red'), ('Time_of_service', 'blue')):
    sns.distplot(all_data[column], color=colour)

### Create new features and Dropping features

In [None]:
# Bucket Time_of_service into three bands: 1 (< 5), 2 (< 20) and 3 (everything
# else, which includes NaN because comparisons with NaN are False).
# NOTE(review): this runs after fixing_skewness(all_data); if Time_of_service
# was Box-Cox transformed there, the 5 / 20 cut-offs are applied to the
# transformed values - confirm this is intended.
all_data['experience'] = all_data['Time_of_service'].apply(lambda x: 1 if x < 5 else (2 if x<20 else (3)))
# VAR3 and VAR6 are removed from the feature set.
all_data = all_data.drop(['VAR3', 'VAR6'], axis=1)

## Encoding object (categorical) columns as dummy variables

In [None]:
print(all_data.shape)
# One-hot encode the object/categorical columns of all_data; the index is reset to 0..n-1.
final_features = pd.get_dummies(all_data).reset_index(drop=True)
final_features.shape

In [None]:
# all_data was built as train rows followed by test rows, so the first len(y)
# rows are the training features and the remainder the submission features.
n_train = len(y)
X = final_features.iloc[:n_train]
X_sub = final_features.iloc[n_train:]

## Removing Overfitted Features

In [None]:
# Columns of X in which one value covers more than 99.94% of the rows.
overfitted_features = overfit_reducer(X)
overfitted_features

In [None]:
# Remove the flagged columns from both matrices so their columns stay aligned.
X, X_sub = (frame.drop(columns=overfitted_features) for frame in (X, X_sub))

In [None]:
X.shape,y.shape, X_sub.shape

## Fitting Model

## Simple Approach

In [None]:
## Train/test split
from sklearn.model_selection import train_test_split
## Hold out 25% of the labelled rows for validation; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = .25, random_state = 0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Call in the LinearRegression object.
# The `normalize` argument no longer exists in scikit-learn >= 1.2. For plain
# least squares with an intercept, rescaling the features should not change
# the predictions, so it is dropped rather than replaced.
lin_reg = LinearRegression(n_jobs=-1)
## Fit on the training split.
lin_reg.fit(X_train, y_train)
## Predict the held-out split.
y_pred = lin_reg.predict(X_test)
# RMSE is the square root of the MSE; the raw MSE used to be printed here
# under the RMSE label.
print ('RMSE with Linear Regression is %.6f' % np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
submit(lin_reg, 'linear_model.csv')

## Using Cross Validation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
# 10-fold shuffled cross-validation of a fresh linear regression on all
# labelled rows. The printed value is the mean of the negated MAE scores.
cv = KFold(n_splits=10, shuffle=True, random_state=2)
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, X, y, scoring='neg_mean_absolute_error', cv=cv)
print('%.8f' % scores.mean())

## Regularization Models

### Ridge 

In [None]:
# ## Importing Ridge. 
# from sklearn.linear_model import Ridge
# from sklearn.metrics import mean_absolute_error, mean_squared_error
# ## Assiging different sets of alpha values to explore which can be the best fit for the model. 
# alpha_ridge = [-3,-2,-1,1e-15, 1e-10, 1e-8,1e-5,1e-4, 1e-3,1e-2,0.5,1,1.5, 2,3,4, 5, 10, 20, 30, 40]
# temp_rss = {}
# temp_mse = {}
# loss_min = np.Inf
# for i in alpha_ridge:
#     ## Assigin each model. 
#     ridge = Ridge(alpha= i, normalize=True)
#     ## fit the model. 
#     ridge.fit(X_train, y_train)
#     ## Predicting the target value based on "Test_x"
#     y_pred = ridge.predict(X_test)

#     mse = mean_squared_error(y_test, y_pred)
#     rss = sum((y_pred-y_test)**2)
#     temp_mse[i] = mse
#     temp_rss[i] = rss
#     if mse<loss_min:
#         loss_min = mse
#         best_model = ridge
#         best_i = i
# print(loss_min)

In [None]:
# ## Importing Lasso. 
# from sklearn.linear_model import Lasso 
# from sklearn.metrics import mean_absolute_error, mean_squared_error
# ## Assiging different sets of alpha values to explore which can be the best fit for the model. 
# alpha_ridge = [-3,-2,-1,1e-15, 1e-10, 1e-8,1e-5,1e-4, 1e-3,1e-2,0.5,1,1.5, 2,3,4, 5, 10, 20, 30, 40]
# temp_rss = {}
# temp_mse = {}
# loss_min = np.Inf
# for i in alpha_ridge:
#     ## Assigin each model. 
#     lasso = Lasso(alpha= i, normalize=True)
#     ## fit the model. 
#     lasso.fit(X_train, y_train)
#     ## Predicting the target value based on "Test_x"
#     y_pred = lasso.predict(X_test)

#     mse = mean_squared_error(y_test, y_pred)
#     rss = sum((y_pred-y_test)**2)
#     temp_mse[i] = mse
#     temp_rss[i] = rss
#     if mse<loss_min:
#         loss_min = mse
#         best_model = lasso
#         best_i = i
# print(loss_min, i)

## Fitting Model (Advanced Approach)

In [None]:
# 10-fold shuffled CV splitter used by cv_rmse and by the *CV estimators below;
# the fixed random_state keeps the fold assignment reproducible.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y, y_pred):
    """Return the root of the mean squared error between ``y`` and ``y_pred``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, X=X):
    """Return the per-fold cross-validated RMSE of ``model``.

    Scores ``model`` on ``X`` against the module-level ``y`` using the
    module-level ``kfolds`` splitter. ``X`` defaults to the global ``X`` as it
    was when this function was defined.
    """
    # The scorer returns negated MSE, so flip the sign before taking the root.
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds)
    return np.sqrt(-neg_mse)

In [None]:
# Candidate hyper-parameter grids searched by the *CV estimators below:
# alphas_alt -> RidgeCV, alphas2 -> LassoCV, e_alphas / e_l1ratio -> ElasticNetCV.
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [None]:
# Each linear/kernel model is preceded by a RobustScaler in a pipeline.
# max_iter is given as an int: scikit-learn validates parameter types and
# rejects the float literal 1e7.
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=int(1e7),
                                              alphas=alphas2,
                                              random_state=42,
                                              cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=int(1e7), alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003,))

In [None]:
# Gradient boosting with Huber loss. NOTE: this model is defined but its CV
# scoring, full-data fit and blend term are all commented out further down.
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)     

In [None]:
# Earlier LightGBM configuration, kept for reference:
# lightgbm = LGBMRegressor(objective='regression', 
#                                        num_leaves=4,
#                                        learning_rate=0.01, 
#                                        n_estimators=5000,
#                                        max_bin=200, 
#                                        bagging_fraction=0.75,
#                                        bagging_freq=5, 
#                                        bagging_seed=7,
#                                        feature_fraction=0.2,
#                                        feature_fraction_seed=7,
#                                        verbose=-1,
#                                        )
# NOTE(review): n_estimators=1 fits a single tree, unlike the configuration
# commented out above - possibly a leftover from a quick experiment; confirm
# before relying on this model's 0.1 weight in the blend.
lightgbm = LGBMRegressor(n_estimators=1,  num_leaves=100, n_jobs=-1, random_state=0)


In [None]:
# 'reg:squarederror' is the current name of the squared-error objective; the
# old alias 'reg:linear' is deprecated in XGBoost.
xgboost = XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)

In [None]:
# Stacked ensemble: five base regressors feed a meta-regressor that uses the
# same xgboost configuration. use_features_in_secondary=True passes the
# original features to the meta-regressor alongside the base predictions.
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

In [None]:
# Cross-validated RMSE (mean and std over the folds) of each candidate model,
# each followed by a timestamp. gbr is currently excluded.
cv_candidates = (
    ("Ridge: {:.6f} ({:.6f})\n", ridge),
    ("LASSO: {:.6f} ({:.6f})\n", lasso),
    ("elastic net: {:.6f} ({:.6f})\n", elasticnet),
    ("SVR: {:.6f} ({:.6f})\n", svr),
    ("lightgbm: {:.6f} ({:.6f})\n", lightgbm),
    # ("gbr: {:.6f} ({:.6f})\n", gbr),
    ("xgboost: {:.6f} ({:.6f})\n", xgboost),
)
for template, candidate in cv_candidates:
    score = cv_rmse(candidate)
    print(template.format(score.mean(), score.std()), datetime.now(), )

In [None]:
# Fit every model on all labelled rows and write one submission file per model.
# The fitted objects are kept in module-level names because
# blend_models_predict reads them.
print('START Fit and creating each submissions')

# The stacked model is fitted on NumPy arrays, so its submission uses npy=True.
print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))
submit(stack_gen_model, 'stack_model.csv', npy=True)


print('elasticnet')
elastic_model_full_data = elasticnet.fit(X, y)
submit(elastic_model_full_data, 'elastic_model_full_data.csv')


print('Lasso')
lasso_model_full_data = lasso.fit(X, y)
submit(lasso_model_full_data, 'lasso_model_full_data.csv')


print('Ridge') 
ridge_model_full_data = ridge.fit(X, y)
submit(ridge_model_full_data, 'ridge_model_full_data.csv')


print('Svr')
svr_model_full_data = svr.fit(X, y)
submit(svr_model_full_data, 'svr_model_full_data.csv')


# Gradient boosting is disabled here, matching its disabled blend term.
# print('GradientBoosting')
# gbr_model_full_data = gbr.fit(X, y)

print('xgboost')
xgb_model_full_data = xgboost.fit(X, y)
submit(xgb_model_full_data, 'xgb_model_full_data.csv')


print('lightgbm')
lgb_model_full_data = lightgbm.fit(X, y)
submit(lgb_model_full_data, 'lgb_model_full_data.csv')



In [None]:
def blend_models_predict(X):
    """Return the weighted blend of the fitted models' predictions for ``X``.

    Uses the module-level fitted models. The weights of the active terms sum
    to 1.0; the gradient-boosting term is disabled.
    """
    blended = 0.1 * elastic_model_full_data.predict(X)
    blended = blended + 0.05 * lasso_model_full_data.predict(X)
    blended = blended + 0.2 * ridge_model_full_data.predict(X)
    blended = blended + 0.1 * svr_model_full_data.predict(X)
    # blended = blended + 0.1 * gbr_model_full_data.predict(X)
    blended = blended + 0.15 * xgb_model_full_data.predict(X)
    blended = blended + 0.1 * lgb_model_full_data.predict(X)
    # The stacked model was fitted on NumPy arrays, so it is fed an array.
    blended = blended + 0.3 * stack_gen_model.predict(np.array(X))
    return blended

In [None]:
# Error of the blend on the same rows the models were fitted on (target is on
# the log1p scale), i.e. an in-sample figure.
print('RMSLE score on train data:')
print(rmsle(y, blend_models_predict(X)))

In [None]:
## Submission
submission = pd.read_csv(os.path.join(base_path, 'Test.csv'))
preds = np.expm1(blend_models_predict(X_sub))
empId = submission['Employee_ID'].tolist()
dict = {"Employee_ID": empId, "Attrition_rate": preds}
sub = pd.DataFrame(dict)
sub.to_csv('submission_blended.csv', index=False)