In [1]:
import numpy as np
import pandas as pd 

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import os

In [2]:
train = pd.read_csv(os.path.join('train.csv'))
test = pd.read_csv(os.path.join('test.csv'))
ss = pd.read_csv(os.path.join('Sample_submission.csv'))

In [3]:
obj_cols = [i for i in train.columns if train[i].dtype == np.object]
np.size(obj_cols)

7

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for i in train.columns:
    if train[i].dtype == np.object:
        train[i] = le.fit_transform(train[i])
        test[i] = le.transform(test[i])


In [5]:
X_train = train.drop(['Id', 'Attrition', 'Behaviour'], axis=1)
y_train = train.Attrition
X_test = test.drop(['Id', 'Behaviour'], axis=1)

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [7]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
from sklearn.model_selection import cross_val_score

In [9]:
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC

In [10]:
logistic = linear_model.LogisticRegression()
ridge = linear_model.Ridge()
lasso = linear_model.Lasso()
lasso_lars = linear_model.LassoLars()
elastic = linear_model.ElasticNet()
bayesian_ridge = linear_model.BayesianRidge()
sgd = linear_model.SGDClassifier()
decisontree = DTC()
randomforest = RFC()
sv = SVC()

In [11]:
models = [ridge, lasso, lasso_lars, elastic, bayesian_ridge, logistic, sgd, decisontree, randomforest, sv]

In [12]:
from sklearn.metrics import roc_auc_score as roc_auc

In [13]:
def get_cv_scores(model):
    scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
    print('CV_mean: ', np.mean(scores))
    print('std:', np.std(scores))
    print('\n')
    
for model in models:
    print(model)
    try: 
        get_cv_scores(model)
    except ValueError:
        pass

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True,
          fit_path=True, max_iter=500, normalize=True, positive=False,
          precompute='auto', verbose=False)
ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, v

In [14]:
from sklearn.model_selection import GridSearchCV as GCV
from sklearn.model_selection import RandomizedSearchCV as RCV

In [24]:
penalty = ['l1', 'l2']
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
solver = ['lbfgs', 'liblinear', 'saga']
class_weight = [{1:0.5, 0:0.5}, {1:0.6, 0:0.4}, {1:0.4, 0:0.6}, {1:0.7, 0:0.3}, {1:0.3, 0:0.7}]
random_state=[None, 1]

params_grid = dict(penalty=penalty,
                   C=C,
                   class_weight=class_weight,
                   solver=solver,
                   random_state=random_state)

grid = GCV(estimator=logistic, param_grid=params_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1 )

grid_result = grid.fit(X_train, y_train)
print('Best Score: ', grid_result.best_score_)
print('Best Parameters: ', grid_result.best_params_)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:    0.9s


Best Score:  0.8489452487737642
Best Parameters:  {'C': 0.1, 'class_weight': {1: 0.3, 0: 0.7}, 'penalty': 'l2', 'random_state': None, 'solver': 'saga'}


[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:   10.2s finished


In [37]:
logistic = linear_model.LogisticRegression(C=1, class_weight={1:0.4, 0:0.6}, penalty='l1', random_state=1, solver='saga')
logistic.fit(X_train, y_train)
y_predict=logistic.predict_proba(X_test)
y_predict

array([[0.87718253, 0.12281747],
       [0.94478339, 0.05521661],
       [0.58916714, 0.41083286],
       [0.60075249, 0.39924751],
       [0.96916741, 0.03083259],
       [0.93800466, 0.06199534],
       [0.62145701, 0.37854299],
       [0.88677403, 0.11322597],
       [0.61863481, 0.38136519],
       [0.95591263, 0.04408737],
       [0.99027868, 0.00972132],
       [0.60937388, 0.39062612],
       [0.96026398, 0.03973602],
       [0.88530862, 0.11469138],
       [0.94856766, 0.05143234],
       [0.73885956, 0.26114044],
       [0.62972823, 0.37027177],
       [0.48729206, 0.51270794],
       [0.91113482, 0.08886518],
       [0.97645714, 0.02354286],
       [0.96863893, 0.03136107],
       [0.08720153, 0.91279847],
       [0.93025653, 0.06974347],
       [0.93129193, 0.06870807],
       [0.86531343, 0.13468657],
       [0.94835821, 0.05164179],
       [0.92191015, 0.07808985],
       [0.94116089, 0.05883911],
       [0.22904442, 0.77095558],
       [0.7418335 , 0.2581665 ],
       [0.

In [38]:
ss=ss.drop('Attrition',axis=1)
ss['Attrition']=y_predict[:,1]
ss.to_csv('submission_1.csv',index=False)

In [25]:
"""random = RCV(estimator=logistic, param_distributions=params_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1)
random_result = random.fit(X_train, y_train)
print('Best Score: ', random_result.best_score_)
print('Best Parameters: ', random_result.best_params_)"""

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best Score:  0.8486958737816159
Best Parameters:  {'solver': 'saga', 'random_state': 1, 'penalty': 'l1', 'class_weight': {1: 0.4, 0: 0.6}, 'C': 1}


[Parallel(n_jobs=-1)]: Done  43 out of  50 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.2s finished


In [27]:
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
penalty = ['l1', 'l2', 'elasticnet']
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
learning_rate = ['constant', 'optimal', 'invscaling', 'adaptive']
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
eta0 = [1, 10, 100]
random_state=[None, 1]

param_distributions = dict(loss=loss,
                           penalty=penalty,
                           alpha=alpha,
                           learning_rate=learning_rate,
                           class_weight=class_weight,
                           eta0=eta0,
                           random_state=random_state)

random = RCV(estimator=sgd, param_distributions=param_distributions, scoring='roc_auc', verbose=1, n_jobs=-1, n_iter=1000)
random_result = random.fit(X_train, y_train)

print('Best Score: ', random_result.best_score_)
print('Best Params: ', random_result.best_params_)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 1260 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 3260 tasks      | elapsed:   29.1s


Best Score:  0.8491113492093403
Best Params:  {'random_state': None, 'penalty': 'elasticnet', 'loss': 'log', 'learning_rate': 'adaptive', 'eta0': 10, 'class_weight': {1: 0.4, 0: 0.6}, 'alpha': 0.001}


[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   42.7s finished


In [39]:
sgd = linear_model.SGDClassifier(random_state=None, penalty='elasticnet', loss='log', learning_rate='adaptive', eta0=10, class_weight={1: 0.4, 0: 0.6}, alpha=0.001)

In [40]:
sgd.fit(X_train, y_train)
y_predict = sgd.predict_proba(X_test)

In [41]:
ss=ss.drop('Attrition',axis=1)
ss['Attrition']=y_predict[:,1]
ss.to_csv('submission_2.csv',index=False)

In [18]:
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
kernel = ['linear']
gamma = ['scale', 'auto']
class_weight = [{1:0.5, 0:0.5}, {1:0.6, 0:0.4}, {1:0.4, 0:0.6}, {1:0.7, 0:0.3}, {1:0.3, 0:0.7}]
random_state=[None, 1]

params_grid = dict(C=C,
                   class_weight=class_weight,
                   kernel=kernel,
                   gamma=gamma,
                   random_state=random_state)
grid = GCV(estimator=sv, param_grid=params_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1 )

grid_result = grid.fit(X_train, y_train)
print('Best Score: ', grid_result.best_score_)
print('Best Parameters: ', grid_result.best_params_)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 288 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 658 tasks      | elapsed:  2.7min


KeyboardInterrupt: 

In [15]:
sv = SVC(C=100, class_weight={1: 0.5, 0: 0.5}, gamma='scale', kernel='rbf', probability=False)
sv.fit(X_train, y_train)
y_predict = sv.predict(X_test)


In [16]:
ss=ss.drop('Attrition',axis=1)
ss['Attrition']=y_predict
ss.to_csv('submission_1.csv',index=False)

In [17]:
y_predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,