# Data Preprocessing

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('train.csv')
print(df.shape)
df.head()

(19104, 13)


Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating
0,2016-01-01,1,28,Male,C23,Master,57387,2015-12-24,,1,1,2381060,2
1,2016-02-01,1,28,Male,C23,Master,57387,2015-12-24,,1,1,-665480,2
2,2016-03-01,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2
3,2017-11-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1
4,2017-12-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1


In [4]:
len(df['Emp_ID'].unique())

2381

There are only 2381 employees, but we have total 19104 records. That means we have multiple records for the same emplyee.
we will keep the latest record only which is required to determine whether an employee is currently working or not.

In [5]:
df.drop_duplicates(subset=['Emp_ID'], keep='last', inplace=True)
print(df.shape)
df.head()

(2381, 13)


Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating
2,2016-03-01,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2
4,2017-12-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1
9,2017-04-01,4,43,Male,C13,Master,65603,2016-12-07,2017-04-27,2,2,0,1
12,2016-03-01,5,29,Male,C9,College,46368,2016-01-09,2016-03-07,1,1,0,1
17,2017-12-01,6,31,Female,C11,Bachelor,78728,2017-07-31,,3,3,0,2


In [6]:
df.reset_index(drop=True, inplace=True)
print(df.shape)
df

(2381, 13)


Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating
0,2016-03-01,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2
1,2017-12-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1
2,2017-04-01,4,43,Male,C13,Master,65603,2016-12-07,2017-04-27,2,2,0,1
3,2016-03-01,5,29,Male,C9,College,46368,2016-01-09,2016-03-07,1,1,0,1
4,2017-12-01,6,31,Female,C11,Bachelor,78728,2017-07-31,,3,3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2376,2017-12-01,2784,34,Male,C24,College,82815,2012-10-15,,2,3,505480,4
2377,2017-10-01,2785,34,Female,C9,College,12105,2017-08-28,2017-10-28,1,1,0,1
2378,2016-09-01,2786,45,Male,C19,College,35370,2015-07-31,2016-09-22,2,2,0,1
2379,2016-06-01,2787,28,Female,C20,Master,69498,2015-07-21,2016-06-20,1,1,0,1


In [7]:
df.isnull().sum()

MMM-YY                    0
Emp_ID                    0
Age                       0
Gender                    0
City                      0
Education_Level           0
Salary                    0
Dateofjoining             0
LastWorkingDate         765
Joining Designation       0
Designation               0
Total Business Value      0
Quarterly Rating          0
dtype: int64

Getting the target variable from LastWorkingDate column.

In [8]:
df['LastWorkingDate'].fillna(0, inplace=True)

In [9]:
df['target'] = df['LastWorkingDate'].apply(lambda x: 0 if x==0 else 1)
df.head()

Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating,target
0,2016-03-01,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2,1
1,2017-12-01,2,31,Male,C7,Master,67016,2017-11-06,0,2,2,0,1,0
2,2017-04-01,4,43,Male,C13,Master,65603,2016-12-07,2017-04-27,2,2,0,1,1
3,2016-03-01,5,29,Male,C9,College,46368,2016-01-09,2016-03-07,1,1,0,1,1
4,2017-12-01,6,31,Female,C11,Bachelor,78728,2017-07-31,0,3,3,0,2,0


Removing unnecessary columns

In [10]:
df = df.drop(labels=['MMM-YY', 'Dateofjoining', 'LastWorkingDate'], axis=1)
df.head()

Unnamed: 0,Emp_ID,Age,Gender,City,Education_Level,Salary,Joining Designation,Designation,Total Business Value,Quarterly Rating,target
0,1,28,Male,C23,Master,57387,1,1,0,2,1
1,2,31,Male,C7,Master,67016,2,2,0,1,0
2,4,43,Male,C13,Master,65603,2,2,0,1,1
3,5,29,Male,C9,College,46368,1,1,0,1,1
4,6,31,Female,C11,Bachelor,78728,3,3,0,2,0


In [11]:
df.target.value_counts()

1    1616
0     765
Name: target, dtype: int64

Handling categorical values.

In [12]:
df['Gender'] = df['Gender'].apply(lambda x: 1 if x=='Male' else 0)
df['Education_Level'] = df['Education_Level'].apply(lambda x: 1 if x=='College' else 2 if x=='Bachelor' else 3)

In [13]:
df_dummies = pd.get_dummies(data=df.City)
df_dummies

Unnamed: 0,C1,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C2,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C3,C4,C5,C6,C7,C8,C9
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2376,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2377,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2378,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2379,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
df_with_dummies = pd.concat([df.drop(labels=['City'], axis=1), df_dummies], axis=1)
df_with_dummies

Unnamed: 0,Emp_ID,Age,Gender,Education_Level,Salary,Joining Designation,Designation,Total Business Value,Quarterly Rating,target,C1,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C2,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C3,C4,C5,C6,C7,C8,C9
0,1,28,1,3,57387,1,1,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,31,1,3,67016,2,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,4,43,1,3,65603,2,2,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5,29,1,1,46368,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,6,31,0,2,78728,3,3,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2376,2784,34,1,1,82815,2,3,505480,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2377,2785,34,0,1,12105,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2378,2786,45,1,1,35370,2,2,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2379,2787,28,0,3,69498,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Standardization

In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df_with_dummies[['Age', 'Salary', 'Education_Level', 'Joining Designation', 'Designation', 'Total Business Value', 'Quarterly Rating']] = sc.fit_transform(df_with_dummies[['Age', 'Salary', 'Education_Level', 'Joining Designation', 'Designation', 'Total Business Value', 'Quarterly Rating']])
df_with_dummies

Unnamed: 0,Emp_ID,Age,Gender,Education_Level,Salary,Joining Designation,Designation,Total Business Value,Quarterly Rating,target,C1,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C2,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C3,C4,C5,C6,C7,C8,C9
0,1,-0.946682,1,1.216049,-0.068616,-0.975022,-1.164953,-0.235155,0.706497,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,-0.445188,1,1.216049,0.270700,0.213676,-0.102619,-0.235155,-0.528576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,4,1.560790,1,1.216049,0.220907,0.213676,-0.102619,-0.235155,-0.528576,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5,-0.779518,1,-1.234575,-0.456914,-0.975022,-1.164953,-0.235155,-0.528576,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,6,-0.445188,0,-0.009263,0.683418,1.402374,0.959714,-0.235155,0.706497,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2376,2784,0.056307,1,-1.234575,0.827440,0.213676,0.959714,0.210421,3.176643,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2377,2785,0.056307,0,-1.234575,-1.664305,-0.975022,-1.164953,-0.235155,-0.528576,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2378,2786,1.895120,1,-1.234575,-0.844471,0.213676,-0.102619,-0.235155,-0.528576,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2379,2787,-0.946682,0,1.216049,0.358163,-0.975022,-1.164953,-0.235155,-0.528576,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Since we have to predict on the train data itself we will not split it into train-test data.

In [16]:
X = df_with_dummies.drop(labels=['Emp_ID', 'target'], axis=1)
y = df_with_dummies['target']
X.shape, y.shape

((2381, 37), (2381,))

#Training different models

##model-1 XGBClassifier

In [47]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

model_xg = XGBClassifier(n_estimators=1000, learning_rate=0.05)
model_xg.fit(X, y, early_stopping_rounds=5, 
             eval_set=[(X_test, y_test)], verbose=False)
y_pred_xg = model_xg.predict(X)

print(confusion_matrix(y_pred_xg, y))
print(accuracy_score(y_pred_xg, y))
print(f1_score(y_pred_xg, y))

[[ 467   98]
 [ 298 1518]]
0.8336833263334733
0.8846153846153846


##model-2 XGBClassifier with RandomizedSearchCV

In [28]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier

model_XGB_cv = XGBClassifier() 
params={
 "n_estimators"     : [250, 500, 1000, 750, 800, 1500, 2000],
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    
}
random_search=RandomizedSearchCV(model_XGB_cv,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)
random_search.fit(X, y)
random_search.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


XGBClassifier(colsample_bytree=0.7, gamma=0.0, learning_rate=0.05, max_depth=4,
              n_estimators=250)

In [48]:
model_XGB_CV = XGBClassifier(colsample_bytree=0.7, gamma=0.0, learning_rate=0.05, max_depth=4,
              n_estimators=250)

model_XGB_CV.fit(X, y, early_stopping_rounds=5, 
             eval_set=[(X_test, y_test)], verbose=False)
y_pred_xg = model_XGB_CV.predict(X)

print(confusion_matrix(y_pred_xg, y))
print(accuracy_score(y_pred_xg, y))
print(f1_score(y_pred_xg, y))

[[ 479   94]
 [ 286 1522]]
0.8404031919361613
0.889018691588785


## model-3 Kernel SVC

In [32]:
from sklearn.svm import SVC
model_svc_2 = SVC(kernel='rbf')
model_svc_2.fit(X, y)

y_pred_svc_2 = model_svc_2.predict(X)

print(confusion_matrix(y_pred_svc_2, y))
print(accuracy_score(y_pred_svc_2, y))
print(f1_score(y_pred_svc_2, y))

[[ 429   72]
 [ 336 1544]]
0.8286434271314573
0.8832951945080091


## model-4 SVC with GridSearchCV

In [30]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
model_svc = SVC()
parameters = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma':['scale', 'auto'],
    'C':[0.2, 0.4, 0.6, 0.8, 1.0]
}
cv = GridSearchCV(estimator=model_svc, param_grid=parameters)
cv.fit(X, y)
print(cv.best_params_)

{'C': 0.6, 'gamma': 'scale', 'kernel': 'linear'}


In [31]:
from sklearn.svm import SVC
model_svc = SVC(kernel='linear', gamma='scale', C=0.6)
model_svc.fit(X, y)

y_pred_svc = model_svc.predict(X)

print(confusion_matrix(y_pred_svc, y))
print(accuracy_score(y_pred_svc, y))
print(f1_score(y_pred_svc, y))

[[ 417   81]
 [ 348 1535]]
0.8198236035279295
0.8773935410117177


## model-5 KNN with GridSearchCV

In [33]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
model_knn = KNeighborsClassifier()
parameters = {
    "n_neighbors" : [5, 10, 15, 20, 25, 30, 35, 40]
}
cv = GridSearchCV(estimator=model_knn, param_grid=parameters)
cv.fit(X, y)
print(cv.best_params_)

{'n_neighbors': 30}


In [34]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
model_knn = KNeighborsClassifier(n_neighbors=30)
model_knn.fit(X, y)

y_pred_knn = model_knn.predict(X)

print(confusion_matrix(y_pred_knn, y))
print(accuracy_score(y_pred_knn, y))
print(f1_score(y_pred_knn, y))

[[ 385   80]
 [ 380 1536]]
0.8068038639227215
0.869762174405436


## model-6 RandomForest with GridSearchCV

In [35]:
from sklearn.ensemble import RandomForestClassifier
Classifier = RandomForestClassifier()
parameters = {
    "n_estimators":[5,10,50,100,250],
    "max_depth":[2,4,8,16,32,None], 
    "criterion": ['gini', 'entropy'],
}
cv = GridSearchCV(estimator=Classifier, param_grid=parameters)
cv.fit(X, y)
print(cv.best_params_)

{'criterion': 'gini', 'max_depth': 4, 'n_estimators': 250}


In [42]:
model_forest = RandomForestClassifier(n_estimators=250, max_depth=4, criterion='gini')
model_forest.fit(X, y)

y_pred_rand = model_forest.predict(X)

print(confusion_matrix(y_pred_rand, y))
print(accuracy_score(y_pred_rand, y))
print(f1_score(y_pred_rand, y))

[[ 474   92]
 [ 291 1524]]
0.8391432171356573
0.8883707373943457


## model-7 GaussianNB

In [43]:
from sklearn.naive_bayes import GaussianNB
model_bayes = GaussianNB()
model_bayes.fit(X, y)

y_pred_gaus = model_bayes.predict(X)

print(confusion_matrix(y_pred_gaus, y))
print(accuracy_score(y_pred_gaus, y))
print(f1_score(y_pred_gaus, y))

[[ 469  306]
 [ 296 1310]]
0.747165056698866
0.8131595282433272


## Ensembel model

In [69]:
#function which will count the main prediction using all the models
def get_main(DF, models, model_names, X):
  for i, column in enumerate(DF.columns):
    DF[column] = models[i].predict(X)

  DF['main_prediction'] = ''

  for i in range(len(DF)):
    count_1 = 0
    count_0 = 0
    for col in range(len(DF.columns)-1):
      if int(DF.iloc[i, col]) == 1:
        count_1 += 1
      else:
        count_0 += 1
    result = 0 if count_0 >= count_1 else 1
    DF['main_prediction'][i] = result

  return DF


In [70]:
models = [model_xg, model_XGB_CV, model_knn, model_forest, model_bayes, model_svc, model_svc_2]
model_names = ['model_xg', 'model_XGB_CV', 'model_knn', 'model_forest', 'model_bayes', 'model_svc', 'model_svc_2']

df_to_get_main_1 = pd.DataFrame(columns = model_names)
df_main_1 = get_main(df_to_get_main_1, models, model_names, X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [71]:
df_main_1

Unnamed: 0,model_xg,model_XGB_CV,model_knn,model_forest,model_bayes,model_svc,model_svc_2,main_prediction
0,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,0,1
...,...,...,...,...,...,...,...,...
2376,0,0,0,0,0,0,0,0
2377,1,1,1,1,1,1,1,1
2378,1,1,1,1,0,1,1,1
2379,1,1,1,1,1,1,1,1


#Preparing the submission.csv

In [55]:
df_test = pd.read_csv('test.csv')
df_test

Unnamed: 0,Emp_ID
0,394
1,173
2,1090
3,840
4,308
...,...
736,2134
737,2255
738,448
739,1644


In [56]:
emp_list = list(df_test.Emp_ID.unique())

In [57]:
column_names = df_with_dummies.columns
df_test_to_new = pd.DataFrame(columns = column_names)

for i in emp_list:
    df_test_to_new = df_test_to_new.append(df_with_dummies[df_with_dummies['Emp_ID'] == i] , ignore_index = True)
df_test_to_new

Unnamed: 0,Emp_ID,Age,Gender,Education_Level,Salary,Joining Designation,Designation,Total Business Value,Quarterly Rating,target,C1,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C2,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C3,C4,C5,C6,C7,C8,C9
0,394,0.056307,0,1.216049,1.352747,0.213676,2.022048,2.146411,1.941570,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,173,0.892131,1,-1.234575,-0.111361,-0.975022,0.959714,0.387186,1.941570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,1090,0.892131,1,-1.234575,1.318494,0.213676,2.022048,1.103406,0.706497,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,840,1.059296,0,-1.234575,1.038803,-0.975022,2.022048,0.780163,0.706497,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,308,-0.278023,1,1.216049,4.548777,0.213676,3.084381,3.090628,0.706497,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736,2134,1.059296,1,-1.234575,1.997055,0.213676,3.084381,0.404604,-0.528576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
737,2255,1.059296,1,-1.234575,2.613137,1.402374,2.022048,1.408295,1.941570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
738,448,0.557801,1,-0.009263,0.213366,0.213676,2.022048,4.223836,1.941570,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
739,1644,2.396615,0,-0.009263,1.627293,0.213676,2.022048,0.569311,-0.528576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [58]:
##X = df_with_dummies.drop(labels=['Emp_ID', 'target'], axis=1)

X_new = df_test_to_new.drop(labels=['Emp_ID', 'target'], axis=1)
X_new.shape

(741, 37)

In [59]:
X_new = X_new.astype(np.float32)

In [60]:
models = [model_xg, model_XGB_CV, model_knn, model_forest, model_bayes, model_svc, model_svc_2]
model_names = ['model_xg', 'model_XGB_CV', 'model_knn', 'model_forest', 'model_bayes', 'model_svc', 'model_svc_2']

df_to_get_main_2 = pd.DataFrame(columns = model_names)
df_main_2 = get_main(df_to_get_main_2, models, model_names, X_new)
df_main_2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Unnamed: 0,model_xg,model_XGB_CV,model_knn,model_forest,model_bayes,model_svc,model_svc_2,main
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
736,0,0,1,0,0,0,0,0
737,0,0,0,0,0,0,0,0
738,0,0,0,0,0,0,0,0
739,0,0,1,0,0,0,0,0


In [61]:
y_pred = df_main_2.main.values
sub = pd.DataFrame(data = y_pred, 
                  columns = ['predicted'])
sub

Unnamed: 0,predicted
0,0
1,0
2,0
3,0
4,0
...,...
736,0
737,0
738,0
739,0


In [62]:
sub.predicted.value_counts()

0    436
1    305
Name: predicted, dtype: int64

In [63]:
df_final = pd.concat([df_test_to_new, sub], axis=1)
df_final

Unnamed: 0,Emp_ID,Age,Gender,Education_Level,Salary,Joining Designation,Designation,Total Business Value,Quarterly Rating,target,C1,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C2,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C3,C4,C5,C6,C7,C8,C9,predicted
0,394,0.056307,0,1.216049,1.352747,0.213676,2.022048,2.146411,1.941570,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,173,0.892131,1,-1.234575,-0.111361,-0.975022,0.959714,0.387186,1.941570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,1090,0.892131,1,-1.234575,1.318494,0.213676,2.022048,1.103406,0.706497,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,840,1.059296,0,-1.234575,1.038803,-0.975022,2.022048,0.780163,0.706497,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,308,-0.278023,1,1.216049,4.548777,0.213676,3.084381,3.090628,0.706497,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736,2134,1.059296,1,-1.234575,1.997055,0.213676,3.084381,0.404604,-0.528576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
737,2255,1.059296,1,-1.234575,2.613137,1.402374,2.022048,1.408295,1.941570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
738,448,0.557801,1,-0.009263,0.213366,0.213676,2.022048,4.223836,1.941570,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
739,1644,2.396615,0,-0.009263,1.627293,0.213676,2.022048,0.569311,-0.528576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [64]:
df_final = df_final[['Emp_ID', 'predicted']]
df_final

Unnamed: 0,Emp_ID,predicted
0,394,0
1,173,0
2,1090,0
3,840,0
4,308,0
...,...,...
736,2134,0
737,2255,0
738,448,0
739,1644,0


In [65]:
df_final.to_csv('sub_8.csv')