In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
# Apply seaborn's default theme (with colour shorthand codes remapped).
sns.set(color_codes=True)
# Make 12x7 the default size for figures created later in the notebook.
# The previous plt.figure(figsize=(12,7)) call only opened a single empty
# figure in this cell and had no effect on subsequent plots.
plt.rcParams['figure.figsize'] = (12, 7)

In [None]:
# Read the three competition files: training data, test data and the
# sample submission.
train, test, sample = map(
    pd.read_csv,
    (
        '../input/hackerearth-employee-burnout-challenge/train.csv',
        '../input/hackerearth-employee-burnout-challenge/test.csv',
        '../input/hackerearth-employee-burnout-challenge/sample_submission.csv',
    ),
)

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train['Date of Joining'] = pd.to_datetime(train['Date of Joining'],errors='coerce')

In [None]:
train.describe().T

In [None]:
for i in train.columns:
    print("Unique Values in Column {} are {}".format(i,len(train[i].unique())))

In [None]:
train.drop('Employee ID' ,axis=1,inplace=True)

In [None]:
# One-hot encode the listed columns, dropping the first level of each to
# avoid a redundant dummy column.
# NOTE(review): 'Designation' and 'Resource Allocation' look numeric; any
# missing values in them become all-zero dummies rather than NaN, so the
# later dropna() will not remove those rows - confirm this is intended.
categorical_cols = [
    'Gender',
    'Company Type',
    'WFH Setup Available',
    'Designation',
    'Resource Allocation',
]
train = pd.get_dummies(train, columns=categorical_cols, drop_first=True)

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.isnull().sum()

In [None]:
train.dropna(axis = 0,inplace=True)

In [None]:
train.isnull().sum()

In [None]:
# Correlation of every feature with the target, strongest first.
# Only numeric/boolean columns are correlated: 'Date of Joining' is still a
# datetime column at this point, and pandas >= 2.0 raises on it instead of
# silently skipping non-numeric columns.
corr_matrix = train.select_dtypes(include=['number', 'bool']).corr()
corr_matrix['Burn Rate'].sort_values(ascending = False)

In [None]:
sns.distplot(train['Burn Rate'])

In [None]:
train.drop_duplicates(inplace=True)

In [None]:
train.shape

In [None]:
train.info()

In [None]:
import datetime as dt
# Break the joining date into calendar components usable as numeric features.
train['Date_of_Joining_year'] = train['Date of Joining'].dt.year
train['Date_of_Joining_month'] = train['Date of Joining'].dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week gives the same ISO week number. It is returned as a
# nullable UInt32, so cast to float64 to keep a plain NumPy-backed column
# (missing dates become NaN).
train['Date_of_Joining_week'] = train['Date of Joining'].dt.isocalendar().week.astype('float64')
train['Date_of_Joining_day'] = train['Date of Joining'].dt.day
train['Date_of_Joining_dayofweek'] = train['Date of Joining'].dt.dayofweek

In [None]:
train.drop('Date of Joining',axis=1,inplace=True)

In [None]:
train.shape

In [None]:
X = train.drop('Burn Rate',axis=1)
Y = train['Burn Rate']

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(np.array(X),np.array(Y),test_size=0.2,random_state=42)

In [None]:
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression,Lasso,ElasticNet,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from mlxtend.regressor import StackingCVRegressor

In [None]:
lr = LinearRegression()
lr.fit(X_train,Y_train)
pred = lr.predict(X_test)
s1 = r2_score(Y_test,pred)
s1

In [None]:
ls = Lasso(alpha=1)
ls.fit(X_train,Y_train)
pred_ls = ls.predict(X_test)
s2 = r2_score(Y_test,pred_ls)
s2

In [None]:
rd = Ridge(alpha=1)
rd.fit(X_train,Y_train)
pred_rd = rd.predict(X_test)
s3 = r2_score(Y_test,pred_rd)
s3

In [None]:
en = ElasticNet(alpha=1)
en.fit(X_train,Y_train)
pred_en = en.predict(X_test)
s4 = r2_score(Y_test,pred_en)
s4

In [None]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train,Y_train)
pred_dt = (dtr.predict(X_test))
s5 = r2_score(Y_test,pred_dt)
s5

In [None]:
# CatBoost regressor: optimises MAE, reports RMSE during training, and is
# configured to train on a GPU.
cat = CatBoostRegressor(
    iterations=500,
    loss_function='MAE',
    eval_metric='RMSE',
    task_type='GPU',
)
cat.fit(X_train, Y_train, verbose=True)
pred_cat = cat.predict(X_test)
s6 = r2_score(y_true=Y_test, y_pred=pred_cat)
s6

In [None]:
# Random forest with 800 trees, scored with R^2 on the held-out split.
rf = RandomForestRegressor(n_estimators=800).fit(X_train, Y_train)
pred_rf = rf.predict(X_test)
s7 = r2_score(y_true=Y_test, y_pred=pred_rf)
s7

In [None]:
# Gradient boosting settings. The loss is left at the estimator's default
# (least squares): the old 'ls' spelling is rejected by scikit-learn >= 1.2,
# while the new 'squared_error' spelling is unknown to releases before 1.0,
# so omitting the key is the only form that works everywhere.
params = {'n_estimators': 1000,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01}
gd = GradientBoostingRegressor(**params)
gd.fit(X_train, Y_train)
pred_gd = gd.predict(X_test)
# R^2 on the held-out split.
s8 = r2_score(Y_test, pred_gd)
s8

In [None]:
# Held-out loss after each boosting stage, for comparison with the training
# loss recorded by the estimator in `train_score_`.
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
for i, y_pred in enumerate(gd.staged_predict(X_test)):
    # Mean squared error computed directly. The estimator's `loss_` attribute
    # used previously is deprecated and removed in recent scikit-learn
    # releases; for the least-squares loss it evaluated this same quantity.
    test_score[i] = np.mean((Y_test - y_pred) ** 2)

fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, gd.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
fig.tight_layout()
plt.show()

In [None]:
# XGBoost: many trees with a small learning rate, scored with R^2 on the
# held-out split.
xg = XGBRegressor(n_estimators=3000, learning_rate=0.01).fit(X_train, Y_train)
pred_xg = xg.predict(X_test)
s9 = r2_score(y_true=Y_test, y_pred=pred_xg)
s9

In [None]:
from lightgbm import early_stopping

# LightGBM settings. Previously this dict was built but never handed to the
# model, so LGBMRegressor silently ran with library defaults. Two entries were
# removed while wiring it up:
#   * 'task' is a command-line option, not an estimator parameter;
#   * 'num_iterations' is an alias of 'n_estimators', and the two carried
#     conflicting values (1000 vs 3000). 'n_estimators' is kept so that the
#     1000-round early-stopping patience below can actually trigger.
hyper_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'learning_rate': 0.001,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 8,
    "num_leaves": 128,
    "max_bin": 512,
    "n_estimators": 3000
}
lgb = LGBMRegressor(**hyper_params)
# Early stopping is requested through a callback: the `early_stopping_rounds`
# argument of fit() no longer exists in LightGBM 4.
# NOTE(review): the held-out split doubles as the early-stopping set, so the
# R^2 reported below is optimistic - consider a separate validation split.
lgb.fit(X_train, Y_train,
        eval_set=[(X_test, Y_test)],
        callbacks=[early_stopping(stopping_rounds=1000)])
pred_lgb = lgb.predict(X_test)
s10 = r2_score(Y_test, pred_lgb)
s10

In [None]:
# Randomized hyper-parameter search for the random forest.

# Number of trees in the forest: 100, 200, ..., 1200.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split. 1.0 means "all features",
# which is what 'auto' meant for regressors; the 'auto' spelling itself is
# rejected by scikit-learn >= 1.3.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 5, 10, ..., 30.
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 5, 10]

# Search space sampled by RandomizedSearchCV.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# 10 sampled configurations x 5-fold CV, ranked by (negative) MSE. `rf` only
# serves as the template estimator whose other settings are reused.
rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,
                               scoring='neg_mean_squared_error',
                              n_iter=10,cv=5,verbose=2,random_state=42,n_jobs=1)

rf_random.fit(X_train,Y_train)

In [None]:

rf_random.best_params_

In [None]:
predictions = rf_random.predict(X_test)
s11 = r2_score(Y_test,predictions)
s11

In [None]:
test = pd.read_csv('../input/hackerearth-employee-burnout-challenge/test.csv')
test.head()

In [None]:
test['Date of Joining'] = pd.to_datetime(test['Date of Joining'],errors='coerce')

In [None]:
test.drop('Employee ID' ,axis=1,inplace=True)

In [None]:
test = pd.get_dummies(test,columns=['Gender', 'Company Type', 'WFH Setup Available',
       'Designation', 'Resource Allocation'],drop_first=True)

In [None]:
import datetime as dt
test['Date_of_Joining_year'] = test['Date of Joining'].dt.year
test['Date_of_Joining_month'] = test['Date of Joining'].dt.month
test['Date_of_Joining_week'] = test['Date of Joining'].dt.week
test['Date_of_Joining_day'] = test['Date of Joining'].dt.day
test['Date_of_Joining_dayofweek'] = test['Date of Joining'].dt.dayofweek

In [None]:
test.drop('Date of Joining',axis=1,inplace=True)

In [None]:
# The scaler and models were fitted on a bare array built from X, so test
# columns are matched purely by position. Train and test were one-hot encoded
# separately; align the test frame to X's columns first so a category level
# missing from one of them cannot shift or change the number of features
# (absent dummy columns are filled with 0, unknown extra columns are dropped).
test = test.reindex(columns=X.columns, fill_value=0)
# Apply the scaling learned on the training split; an array is passed because
# the scaler was fitted without feature names.
test = sc.transform(test.to_numpy(dtype=float))

In [None]:
avg = StackingCVRegressor(regressors=(lgb,rf,gd),meta_regressor=lgb,use_features_in_secondary=False)

In [None]:
avg.fit(X_train,Y_train)
test_stack = avg.predict(test)

In [None]:
# Predict the competition test set with each fitted model.
test_lg = lgb.predict(test)
test_random_rf = rf_random.predict(test)
test_rf = rf.predict(test)
test_xg = xg.predict(test)
test_gd = gd.predict(test)
# Weighted blend: LightGBM carries 60%, each of the other four models 10%.
final_test = (
    0.6 * test_lg
    + 0.1 * test_random_rf
    + 0.1 * test_rf
    + 0.1 * test_xg
    + 0.1 * test_gd
)

In [None]:
sample.head()

In [None]:
test = pd.read_csv('../input/hackerearth-employee-burnout-challenge/test.csv')

In [None]:
submit = pd.DataFrame()
submit['Employee ID'] = test['Employee ID']

In [None]:
submit['Burn Rate'] = final_test.round(2)

In [None]:
submit.head()

In [None]:
submit.to_csv('submit.csv',index=False)