In [None]:
import pandas as pd
import numpy as np

In [None]:
def create_dataframe(train_df):
    """Encode the raw training frame and return it in shuffled row order.

    Steps performed:
      * missing `education` values become the category 'NO_EDU_DET';
      * missing `previous_year_rating` values become 0;
      * `department`, `education`, `gender` and `recruitment_channel` are
        one-hot encoded with `pd.get_dummies` and appended as new columns;
      * the original categorical columns plus `region` and `employee_id`
        are dropped;
      * only rows whose `is_promoted` is 0 or 1 are kept (class 0 rows first,
        then class 1 rows), the index is reset, and the rows are put in a
        random order with `np.random.permutation` (unseeded here).

    The input frame is left untouched; all work happens on a copy.

    Args:
        train_df: DataFrame containing at least the columns named above and
            `is_promoted`.

    Returns:
        A new DataFrame with the encoded columns and a permuted index.
    """
    # Copy first: the column assignments below would otherwise write the
    # filled values back into the caller's DataFrame.
    train_df = train_df.copy()
    train_df['education'] = train_df['education'].fillna('NO_EDU_DET')
    train_df['previous_year_rating'] = train_df['previous_year_rating'].fillna(0)

    # One indicator block per categorical column, appended in this order.
    categorical_columns = ['department', 'education', 'gender', 'recruitment_channel']
    dummy_frames = [pd.get_dummies(train_df[column]) for column in categorical_columns]
    train_df = pd.concat([train_df] + dummy_frames, axis=1)
    train_df = train_df.drop(
        ['department', 'region', 'education', 'gender', 'recruitment_channel', 'employee_id'],
        axis=1)

    # Class 0 rows first, then class 1 rows (same ordering as before the shuffle).
    not_promoted_df = train_df[train_df.is_promoted == 0]
    promoted_df = train_df[train_df.is_promoted == 1]
    subset_df = pd.concat([not_promoted_df, promoted_df]).reset_index(drop=True)

    # Shuffle rows by reindexing with a random permutation of the fresh index.
    return subset_df.reindex(np.random.permutation(subset_df.index))

In [None]:
# Load the raw training data from the CSV under ../input.
traning_df = pd.read_csv('../input/WNS_Train.csv')

In [None]:
# Build the encoded, shuffled modelling frame from the raw training data.
main_df = create_dataframe(traning_df)

In [None]:
# Give the awkward column names (quotes, spaces, '&', '?', '>') plain
# identifier-style names; the frame is modified in place.
main_df.rename(
    columns={
        "Bachelor's": 'Bachelor',
        "Below Secondary": 'Below_Secondary',
        "Master's & above": 'master_above',
        "KPIs_met >80%": "KPI",
        "R&D": "R_N_D",
        "Sales & Marketing": "Sales_Marketing",
        "awards_won?": "awards_won",
    },
    inplace=True,
)

In [None]:
# Show the current column names of main_df (inspection only).
main_df.columns

In [None]:
# Model inputs, grouped by origin; the concatenated order is what downstream
# cells see.
selected_features = (
    # numeric / flag columns
    ['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
     'KPI', 'awards_won', 'avg_training_score']
    # department indicators
    + ['Analytics', 'Finance', 'HR', 'Legal', 'Operations', 'Procurement',
       'R_N_D', 'Sales_Marketing', 'Technology']
    # education indicators
    + ['Bachelor', 'Below_Secondary', 'master_above', 'NO_EDU_DET']
    # gender indicators
    + ['f', 'm']
    # recruitment-channel indicators
    + ['other', 'referred', 'sourcing']
)
# Label column, kept as a one-element list so indexing yields a DataFrame.
selected_target = ['is_promoted']

In [None]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only surface TensorFlow messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)
# Display settings: at most 50 rows, floats shown with one decimal place.
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', '{:.1f}'.format)

In [None]:
def construct_feature_columns(input_features):
    """Build one numeric TensorFlow feature column per input feature name.

    Args:
        input_features: Iterable of feature names (iterating a DataFrame
            yields its column labels).

    Returns:
        A set of `tf.feature_column.numeric_column` objects.
    """
    return {tf.feature_column.numeric_column(feature_name)
            for feature_name in input_features}

In [None]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Feed (features, labels) tensors from pandas data to an Estimator.

    Args:
        features: Mapping of feature name -> values (e.g. a DataFrame).
        targets: Labels, sliced along the first axis together with `features`.
        batch_size: Number of examples per batch.
        shuffle: If True, shuffle examples with a buffer of 10000.
        num_epochs: Number of passes over the data; None repeats indefinitely.

    Returns:
        A `(features, labels)` tuple for the next batch.
    """
    # Convert pandas data into a dict of np arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset of individual examples.
    ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

    # Shuffle before batching and repeating: shuffling afterwards would only
    # reorder whole batches, and with repeat() the buffer would mix epochs.
    if shuffle:
        ds = ds.shuffle(10000)

    # Configure batching/repeating.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [None]:
def train_linear_classifier_model(
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets,
    my_optimizer,
    hidden_units):
    """Train a DNN classifier in ten periods and plot log loss per period.

    Despite its name, this function builds a `tf.estimator.DNNClassifier`.
    It reads the module-level `selected_target` and calls `my_input_fn` and
    `construct_feature_columns`.

    Args:
        learning_rate: Not read by this function; the effective learning rate
            is whatever `my_optimizer` was constructed with. Kept so existing
            calls keep working.
        steps: Total number of training steps, split across ten periods.
        batch_size: Batch size used by the training input function.
        training_examples: Training features; its columns define the feature
            columns of the model.
        training_targets: Training labels, indexed with `selected_target`.
        validation_examples: Validation features.
        validation_targets: Validation labels, indexed with `selected_target`.
        my_optimizer: Optimizer instance; wrapped with gradient clipping by
            norm (5.0) before use.
        hidden_units: Hidden layer sizes passed to the DNNClassifier.

    Returns:
        The trained `DNNClassifier`.
    """
    periods = 10
    # Use a whole number of steps per period (true division yields a float);
    # never drop below one step so training is always valid.
    steps_per_period = max(1, steps // periods)

    # Create the classifier with a gradient-clipped optimizer.
    clipped_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    dnn_classifier = tf.estimator.DNNClassifier(
        feature_columns=construct_feature_columns(training_examples),
        optimizer=clipped_optimizer,
        hidden_units=hidden_units)

    # Create input functions: one shuffled/repeating for training, and two
    # single-pass unshuffled ones so predictions line up with the targets.
    training_input_fn = lambda: my_input_fn(training_examples,
                                            training_targets[selected_target],
                                            batch_size=batch_size)
    predict_training_input_fn = lambda: my_input_fn(training_examples,
                                                    training_targets[selected_target],
                                                    num_epochs=1,
                                                    shuffle=False)
    predict_validation_input_fn = lambda: my_input_fn(validation_examples,
                                                      validation_targets[selected_target],
                                                      num_epochs=1,
                                                      shuffle=False)

    # Train the model, but do so inside a loop so that we can periodically
    # assess loss metrics.
    print ("Training model...")
    training_log_losses = []
    validation_log_losses = []
    for period in range(0, periods):
        dnn_classifier.train(input_fn=training_input_fn,
                             steps=steps_per_period)

        # Take a break and compute class probabilities on both splits.
        training_probabilities = dnn_classifier.predict(input_fn=predict_training_input_fn)
        training_probabilities = np.array([item['probabilities'] for item in training_probabilities])

        validation_probabilities = dnn_classifier.predict(input_fn=predict_validation_input_fn)
        validation_probabilities = np.array([item['probabilities'] for item in validation_probabilities])

        training_log_loss = metrics.log_loss(training_targets, training_probabilities)
        validation_log_loss = metrics.log_loss(validation_targets, validation_probabilities)
        # Print the current training loss.
        print(period, training_log_loss)
        # Add the loss metrics from this period to our lists.
        training_log_losses.append(training_log_loss)
        validation_log_losses.append(validation_log_loss)

    print ("Model training finished.")

    # Output a graph of loss metrics over periods.
    plt.ylabel("LogLoss")
    plt.xlabel("Periods")
    plt.title("LogLoss vs. Periods")
    plt.tight_layout()
    plt.plot(training_log_losses, label="training")
    plt.plot(validation_log_losses, label="validation")
    plt.legend()

    return dnn_classifier

In [None]:
# Split the modelling frame by label value: 0 = not promoted, 1 = promoted.
not_promoted_df = main_df.loc[main_df['is_promoted'].eq(0)]
promoted_df = main_df.loc[main_df['is_promoted'].eq(1)]

In [None]:
# Display the promoted rows (inspection only).
promoted_df

In [None]:
# Build a class-balanced frame: take as many not-promoted rows as there are
# promoted rows (derived from the data rather than a hard-coded count), then
# shuffle by reindexing with a random permutation of the index (unseeded).
final_balanced_df = pd.concat([not_promoted_df[:len(promoted_df)], promoted_df])
final_balanced_df = final_balanced_df.reindex(np.random.permutation(final_balanced_df.index))
final_balanced_df

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the balanced frame for validation; random_state is fixed so
# the split itself is repeatable for a given input frame.
# NOTE(review): the split happens after class balancing, so the validation
# set is balanced as well and its metrics may not reflect the original class
# ratio - confirm this is intended.
training_examples, validation_examples, training_targets, validation_targets = train_test_split(final_balanced_df[selected_features], final_balanced_df[selected_target],
                                                    test_size = 0.2, random_state = 1)

In [None]:
# Train the classifier (two hidden layers of 50 units) for 200 steps.
# NOTE: `learning_rate` is accepted by train_linear_classifier_model but never
# read there; the effective rate is the 0.01 given to AdamOptimizer below.
linear_classifier = train_linear_classifier_model(
    learning_rate=0.02,
    steps=200,
    batch_size=50,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets,
    my_optimizer = tf.train.AdamOptimizer(learning_rate=0.01),
    hidden_units=[50,50])

In [None]:
def predict_validation_input_fn():
    """Single-pass, unshuffled input over the validation split."""
    return my_input_fn(validation_examples,
                       validation_targets[selected_target],
                       num_epochs=1,
                       shuffle=False)


# Report AUC and accuracy on the validation split.
evaluation_metrics = linear_classifier.evaluate(input_fn=predict_validation_input_fn)
print(evaluation_metrics['auc'])
print(evaluation_metrics['accuracy'])

# Keep only the probability of class 1 for each validation example.
validation_probabilities = np.array([
    prediction['probabilities'][1]
    for prediction in linear_classifier.predict(input_fn=predict_validation_input_fn)
])

In [None]:
# ROC curve of the class-1 probabilities against the validation labels,
# drawn next to the diagonal that a random classifier would produce.
false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(
    validation_targets, validation_probabilities)
plt.plot(false_positive_rate, true_positive_rate, label="our model")
plt.plot([0, 1], [0, 1], label="random classifier")
# Assign to `_` so the Legend object is not echoed as cell output.
_ = plt.legend(loc=2)

In [None]:
# Round the class-1 probabilities to the nearest integer to get 0/1 labels.
predicted_prob = validation_probabilities.round()

In [None]:
from sklearn.metrics import f1_score
# F1 score of the rounded predictions against the validation labels.
print(f1_score(validation_targets,predicted_prob))

In [None]:
# Load the test data that will be scored for the submission.
test_df = pd.read_csv('../input/WNS_test.csv')

In [None]:
def create_test_dataframe(train_df):
    """Encode a raw (unlabelled) frame the same way the training data is encoded.

    Steps performed:
      * missing `education` values become the category 'NO_EDU_DET';
      * missing `previous_year_rating` values become 0;
      * `department`, `education`, `gender` and `recruitment_channel` are
        one-hot encoded with `pd.get_dummies` and appended as new columns;
      * the original categorical columns plus `region` and `employee_id`
        are dropped.

    Row order and index are preserved. The input frame is left untouched;
    all work happens on a copy.

    Args:
        train_df: DataFrame to encode (the parameter name is historical; this
            function does not require an `is_promoted` column).

    Returns:
        A new DataFrame with the encoded columns.
    """
    # Copy first: the column assignments below would otherwise write the
    # filled values back into the caller's DataFrame.
    encoded_df = train_df.copy()
    encoded_df['education'] = encoded_df['education'].fillna('NO_EDU_DET')
    encoded_df['previous_year_rating'] = encoded_df['previous_year_rating'].fillna(0)

    # One indicator block per categorical column, appended in this order.
    categorical_columns = ['department', 'education', 'gender', 'recruitment_channel']
    dummy_frames = [pd.get_dummies(encoded_df[column]) for column in categorical_columns]
    encoded_df = pd.concat([encoded_df] + dummy_frames, axis=1)

    return encoded_df.drop(
        ['department', 'region', 'education', 'gender', 'recruitment_channel', 'employee_id'],
        axis=1)

In [None]:
# Encode the test data (fills and one-hot columns) for prediction.
pred_features = create_test_dataframe(test_df)

In [None]:
# Peek at the first rows of the encoded test features.
pred_features.head()

In [None]:
def predict_validation_input_fn():
    """Single-pass, unshuffled input over the encoded test features.

    `pred_features` is looked up when this function is called, not when it is
    defined, so changes made to that frame before prediction are picked up.
    `test_df['employee_id']` is passed in the targets position of
    `my_input_fn`.
    """
    return my_input_fn(pred_features,
                       test_df['employee_id'],
                       num_epochs=1,
                       shuffle=False)

In [None]:
# Give the test features the same plain column names used for training;
# the frame is modified in place.
pred_features.rename(
    columns={
        "Bachelor's": 'Bachelor',
        "Below Secondary": 'Below_Secondary',
        "Master's & above": 'master_above',
        "KPIs_met >80%": "KPI",
        "R&D": "R_N_D",
        "Sales & Marketing": "Sales_Marketing",
        "awards_won?": "awards_won",
    },
    inplace=True,
)

In [None]:
# Class-1 probability for every row yielded by the current
# predict_validation_input_fn; this reuses (overwrites) the
# validation_probabilities name.
validation_probabilities = np.array([
    prediction['probabilities'][1]
    for prediction in linear_classifier.predict(input_fn=predict_validation_input_fn)
])

In [None]:
# Wrap the predicted probabilities in a single-column frame.
validation_df = pd.DataFrame({"is_promoted": validation_probabilities})

In [None]:
# Threshold at 0.5: strictly greater becomes 1, everything else 0.
validation_df["is_promoted"] = (validation_df["is_promoted"] > 0.5).astype(int)

In [None]:
# Attach the ids from the test data; the assignment aligns on index labels.
validation_df['employee_id'] = test_df['employee_id']

In [None]:
# Put the id column first, followed by the predicted label.
validation_df = validation_df.loc[:, ['employee_id', 'is_promoted']]

In [None]:
# Write the predictions to submission1.csv without the index column.
validation_df.to_csv('submission1.csv',index=False)