In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
np.random.seed(1)
import lightgbm as lgb

In [None]:
def binning_custom(value):
    """Translate a numeric age into a decade bucket label.

    Returns 'bin_1' for [20, 30), 'bin_2' for [30, 40), 'bin_3' for [40, 50)
    and 'bin_4' for [50, 60). Any value that falls in none of these ranges
    (below 20, 60 or above, NaN) yields None.
    """
    # Half-open [lower, upper) ranges, tested in ascending order.
    decade_buckets = (
        (20, 30, 'bin_1'),
        (30, 40, 'bin_2'),
        (40, 50, 'bin_3'),
        (50, 60, 'bin_4'),
    )
    for lower, upper, label in decade_buckets:
        if value >= lower and value < upper:
            return label
    # No range matched: make the fall-through result explicit.
    return None

In [None]:
# Read the raw training data and swap the numeric age for its bucket label,
# then preview the first rows.
traning_df = pd.read_csv('../input/WNS_Train.csv').assign(
    age=lambda frame: frame['age'].apply(binning_custom)
)
traning_df.head()

**Creation of test Data**

In [None]:
# Read the raw test data and swap the numeric age for its bucket label,
# exactly as is done for the training data.
test_df = pd.read_csv('../input/WNS_test.csv').assign(
    age=lambda frame: frame['age'].apply(binning_custom)
)
def create_test_dataframe(train_df):
    """Return a one-hot encoded copy of a raw employee frame.

    Missing ``education`` values become the category 'NO_EDU_DET' and missing
    ``previous_year_rating`` values become 0. Indicator columns are then
    appended for department, education, gender, recruitment_channel,
    previous_year_rating (prefixed with ``rating_``) and age, in that order.
    Finally the source categorical columns plus ``region`` and
    ``employee_id`` are dropped.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw frame containing the columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the indicator columns.
    """
    # Work on a copy: the fillna assignments below would otherwise write
    # into the caller's frame.
    train_df = train_df.copy()
    train_df['education'] = train_df['education'].fillna('NO_EDU_DET')
    train_df['previous_year_rating'] = train_df['previous_year_rating'].fillna(0)

    # (source column, indicator prefix) in the order the indicators are
    # appended; a prefix of None keeps the bare category value as the name.
    encodings = (
        ('department', None),
        ('education', None),
        ('gender', None),
        ('recruitment_channel', None),
        ('previous_year_rating', 'rating'),
        ('age', None),
    )
    dummy_frames = [
        pd.get_dummies(train_df[column], prefix=prefix)
        for column, prefix in encodings
    ]
    train_df = pd.concat([train_df] + dummy_frames, axis=1)

    return train_df.drop(
        ['department', 'region', 'education', 'gender', 'recruitment_channel',
         'employee_id', 'age', 'previous_year_rating'],
        axis=1,
    )

# Replace the raw test frame with its one-hot encoded version.
test_df = create_test_dataframe(test_df)

**Creation of train Data**

In [None]:
def create_dataframe(train_df, start_index, end_index):
    """Build a shuffled, one-hot encoded training subsample.

    Missing ``education`` values become the category 'NO_EDU_DET' and missing
    ``previous_year_rating`` values become 0. Indicator columns are appended
    for department, education, gender, recruitment_channel,
    previous_year_rating (prefixed with ``rating_``) and age, and the source
    categorical columns plus ``region`` and ``employee_id`` are dropped.

    The subsample keeps every row with ``is_promoted == 1`` together with the
    positional slice ``[start_index:end_index]`` of the rows with
    ``is_promoted == 0``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw training frame, including ``is_promoted``. It is not modified.
    start_index, end_index : int
        Bounds of the positional slice taken from the non-promoted rows.

    Returns
    -------
    pandas.DataFrame
        The subsample with its rows in random order. The order is drawn from
        NumPy's global random state, so it depends on the current seed.
    """
    # Work on a copy: this function is called repeatedly on the same raw
    # frame, and the fillna assignments below would otherwise rewrite it.
    train_df = train_df.copy()
    train_df['education'] = train_df['education'].fillna('NO_EDU_DET')
    train_df['previous_year_rating'] = train_df['previous_year_rating'].fillna(0)

    # (source column, indicator prefix) in the order the indicators are
    # appended; a prefix of None keeps the bare category value as the name.
    encodings = (
        ('department', None),
        ('education', None),
        ('gender', None),
        ('recruitment_channel', None),
        ('previous_year_rating', 'rating'),
        ('age', None),
    )
    dummy_frames = [
        pd.get_dummies(train_df[column], prefix=prefix)
        for column, prefix in encodings
    ]
    train_df = pd.concat([train_df] + dummy_frames, axis=1)
    train_df = train_df.drop(
        ['department', 'region', 'education', 'gender', 'recruitment_channel',
         'employee_id', 'age', 'previous_year_rating'],
        axis=1,
    )

    promoted_df = train_df[train_df.is_promoted == 1]
    not_promoted_df = train_df[train_df.is_promoted == 0]
    subset_df = pd.concat([promoted_df, not_promoted_df[start_index:end_index]])
    # Renumber 0..n-1 first so the permutation below is over plain positions.
    subset_df = subset_df.reset_index(drop=True)
    return subset_df.reindex(np.random.permutation(subset_df.index))

In [None]:
# Model inputs, grouped by the raw column each one comes from. The order of
# this list is the column order of the training matrix.
selected_features = (
    # numeric columns (after renaming)
    ['no_of_trainings', 'length_of_service', 'KPI', 'awards_won',
     'avg_training_score']
    # department indicators
    + ['Analytics', 'Finance', 'HR', 'Legal', 'Operations', 'Procurement',
       'R_N_D', 'Sales_Marketing', 'Technology']
    # education indicators
    + ['Bachelor', 'Below_Secondary', 'master_above', 'NO_EDU_DET']
    # gender indicators
    + ['f', 'm']
    # recruitment channel indicators
    + ['other', 'referred', 'sourcing']
    # previous-year rating indicators
    + ['rating_0.0', 'rating_1.0', 'rating_2.0', 'rating_3.0', 'rating_4.0',
       'rating_5.0']
    # age bucket indicators
    + ['bin_1', 'bin_2', 'bin_3', 'bin_4']
)
# Kept as a one-element list, so selecting with it yields a DataFrame.
selected_target = ['is_promoted']

In [None]:
from sklearn.metrics import f1_score
def lgb_f1_score(y_hat, data):
    """Custom evaluation function reporting F1 on rounded predictions.

    Parameters
    ----------
    y_hat : array-like
        Model outputs; they are rounded to the nearest integer before scoring.
    data : object exposing ``get_label()``
        Supplies the true labels.

    Returns
    -------
    tuple
        ``('f1', score, True)``. The trailing ``True`` follows LightGBM's
        custom-metric convention of flagging a higher-is-better metric.
    """
    labels = data.get_label()
    # f1_score needs hard class labels, not continuous scores.
    hard_predictions = np.round(y_hat)
    score = f1_score(labels, hard_predictions)
    return 'f1', score, True

In [None]:
def run_lgb1(X_train, X_test, y_train, y_test, test_df):
    """Train a LightGBM binary classifier and score ``test_df`` with it.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit the booster.
    X_test, y_test
        Features and labels of the held-out set monitored during training.
    test_df
        Data handed to ``model.predict`` once training has finished.

    Returns
    -------
    tuple
        ``(pred_test_y, model, evals_result)``: the predictions for
        ``test_df`` taken at ``model.best_iteration``, the trained booster,
        and the dict that was passed to ``lgb.train`` as ``evals_result``.
    """
    # NOTE(review): several keys look like overlapping ways of setting the
    # same thing ("n_jobs"/"nthread", "min_child_weight"/"min_data_in_leaf",
    # "n_estimators" vs. the round count given to lgb.train below) -- confirm
    # which ones LightGBM actually honours. Key order is left untouched.
    params = {
        "objective": "binary",
        "n_estimators": 10000,
        "reg_alpha": 0.1,
        "reg_lambda": 0.1,
        "n_jobs": -1,
        "colsample_bytree": .7,
        "min_child_weight": 1,
        "subsample": 0.8,
        "min_data_in_leaf": 100,
        "nthread": 4,
        # NOTE(review): presumably the reported "f1" comes from the feval
        # below rather than a built-in metric -- verify.
        "metric": "f1",
        "num_leaves": 600,
        "learning_rate": 0.01,
        "verbosity": -1,
        "seed": 120,
        "max_bin": 60,
        'max_depth': 15,
        'min_gain_to_split': .0222415,
        'scale_pos_weight': 2,
    }

    train_set = lgb.Dataset(X_train, label=y_train)
    holdout_set = lgb.Dataset(X_test, label=y_test)
    evals_result = {}
    model = lgb.train(
        params,
        train_set,
        num_boost_round=10000,
        valid_sets=[train_set, holdout_set],
        feval=lgb_f1_score,
        evals_result=evals_result,
        early_stopping_rounds=100,
        verbose_eval=100,
    )
    pred_test_y = model.predict(test_df, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

 **First Sub Sample**

In [None]:
# First subsample: all promoted rows plus non-promoted rows [0:25000], with
# the awkward raw column names replaced by identifier-friendly ones.
main_df = create_dataframe(traning_df, 0, 25000).rename(
    columns={
        "Bachelor's": 'Bachelor',
        "Below Secondary": 'Below_Secondary',
        "Master's & above": 'master_above',
        "KPIs_met >80%": "KPI",
        "R&D": "R_N_D",
        "Sales & Marketing": "Sales_Marketing",
        "awards_won?": "awards_won",
    }
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the subsample for validation; the fixed random_state keeps
# the split repeatable.
(training_examples, validation_examples,
 training_targets, validation_targets) = train_test_split(
    main_df[selected_features],
    main_df[selected_target],
    test_size=0.2,
    random_state=1,
)

In [None]:
# Fit on the first subsample and keep its predictions for the test set.
pred_test1, model, evals_result = run_lgb1(
    X_train=training_examples,
    X_test=validation_examples,
    y_train=training_targets['is_promoted'],
    y_test=validation_targets['is_promoted'],
    test_df=test_df,
)
print("LightGBM Training Completed...")

**Second Sub Sample**

In [None]:
# Second subsample: all promoted rows plus non-promoted rows [25000:50000],
# renamed the same way as the first subsample.
main_df = create_dataframe(traning_df, 25000, 50000).rename(
    columns={
        "Bachelor's": 'Bachelor',
        "Below Secondary": 'Below_Secondary',
        "Master's & above": 'master_above',
        "KPIs_met >80%": "KPI",
        "R&D": "R_N_D",
        "Sales & Marketing": "Sales_Marketing",
        "awards_won?": "awards_won",
    }
)

from sklearn.model_selection import train_test_split
# Same 80/20 split settings as for the first subsample.
(training_examples, validation_examples,
 training_targets, validation_targets) = train_test_split(
    main_df[selected_features],
    main_df[selected_target],
    test_size=0.2,
    random_state=1,
)

# Fit on the second subsample; `model` and `evals_result` now refer to this run.
pred_test2, model, evals_result = run_lgb1(
    X_train=training_examples,
    X_test=validation_examples,
    y_train=training_targets['is_promoted'],
    y_test=validation_targets['is_promoted'],
    test_df=test_df,
)
print("LightGBM Training Completed...")


In [None]:
# Blend the two runs, giving the second subsample's predictions the larger weight.
final_pred = 0.4 * pred_test1 + 0.6 * pred_test2

In [None]:
# Re-read the raw test file for the employee ids, which were dropped during encoding.
employee_df = pd.read_csv('../input/WNS_test.csv')
# Turn the blended scores into 0/1 labels (1 when the score is at least 0.59)
# and lay the frame out as employee_id, is_promoted.
validation_df = (
    pd.DataFrame(final_pred, columns=["is_promoted"])
    .assign(
        is_promoted=lambda frame: frame["is_promoted"].apply(
            lambda x: 1 if x >= 0.59 else 0
        ),
        employee_id=employee_df['employee_id'],
    )
    .loc[:, ['employee_id', 'is_promoted']]
)

In [None]:
# Write the predictions to CSV without the index column.
validation_df.to_csv('tw0_subsample_wig_avg_6.csv',index=False)

In [None]:
# Number of employees predicted as promoted.
validation_df['is_promoted'].eq(1).sum()

In [None]:
# Load a reference prediction file to compare against
# (presumably an earlier best-scoring submission -- confirm).
best_score = pd.read_csv('../input/Best_score.csv')

In [None]:
# Number of rows marked as promoted in the reference file.
best_score['is_promoted'].eq(1).sum()

In [None]:
# Count the rows whose second-column value differs between the reference file
# and the new predictions, comparing row by row by position.
counter = sum(
    1
    for index in range(len(best_score))
    if best_score.iloc[index, 1] != validation_df.iloc[index, 1]
)
print(counter)