[View in Colaboratory](https://colab.research.google.com/github/shravankumar9892/war_of_wizards/blob/master/wns_interview.ipynb)

# Authentication + fetching data

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.python import keras # Use keras.layers. to make the neural network
import shutil

# Abort early unless TensorFlow reports the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [0]:
# Authenticate
# Order matters: the Colab user must be authenticated before the application
# default credentials are requested and attached to the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the PyDrive handle used by the following cell to fetch the CSVs.
drive = GoogleDrive(gauth)

In [0]:
# (Drive file id, local filename) for each CSV that has to be pulled from Drive.
drive_files = (
    ('1sJ6TOntkRu1WXGm15I1AUqItAmKoA49i', 'train_LZdllcl.csv'),
    ('1kBx6SWGFHZ3QmwkxnDjSLr9PWI7bDccL', 'test_2umaH9m.csv'),
)
for file_id, local_name in drive_files:
    drive.CreateFile({'id': file_id}).GetContentFile(local_name)

# Load both downloaded files into DataFrames.
df_train = pd.read_csv('train_LZdllcl.csv')
df_test = pd.read_csv('test_2umaH9m.csv')

# Preprocessing + splitting

In [0]:
# Split the raw frames into the label and the model inputs.
# The same column list is used for train and test so the two frames cannot
# drift apart, and `.copy()` gives independent frames so that later cells can
# modify them without pandas' SettingWithCopyWarning.
FEATURE_COLUMNS = [
    'department', 'education', 'gender', 'recruitment_channel',
    'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
    'KPIs_met_80', 'awards_won', 'avg_training_score',
]

y_train = df_train['is_promoted']
X_train = df_train[FEATURE_COLUMNS].copy()
X_test = df_test[FEATURE_COLUMNS].copy()

In [0]:
# Preprocessing the data
import sklearn
from sklearn.preprocessing import LabelEncoder

# Filling in NaN values.
# A missing previous-year rating is encoded as 0. `fillna` returns a new frame
# here (no `inplace`), so the original slices are never mutated.
X_train = X_train.fillna({'previous_year_rating': 0})
X_test = X_test.fillna({'previous_year_rating': 0})

# Remaining numeric gaps are filled with the per-column minimum.
# `numeric_only=True` keeps the string columns out of `min()`; their NaNs are
# handled below, where `astype('str')` turns them into the category 'nan'.
X_train = X_train.fillna(X_train.min(numeric_only=True))
X_test = X_test.fillna(X_test.min(numeric_only=True))

# Integer-encode the categorical columns.
# One encoder per column is fitted on the train and test values together, so
# a given category always maps to the same integer in both frames. Fitting a
# separate encoder on the test frame can silently assign different codes
# whenever the two frames do not contain exactly the same set of categories.
CATEGORICAL_COLUMNS = ['department', 'gender', 'recruitment_channel', 'education']
for column in CATEGORICAL_COLUMNS:
    train_values = X_train[column].astype('str')
    test_values = X_test[column].astype('str')
    le = LabelEncoder().fit(pd.concat([train_values, test_values]))
    X_train[column] = le.transform(train_values)
    X_test[column] = le.transform(test_values)


**Feature Engineering**

In [0]:
# Square the average training score.
# The transform must be applied to the test frame as well: otherwise the model
# is trained on squared scores but asked to predict from raw scores.
# NOTE: this cell is not idempotent - re-running it squares the column again.
X_train = X_train.assign(avg_training_score=X_train['avg_training_score'] ** 2)
X_test = X_test.assign(avg_training_score=X_test['avg_training_score'] ** 2)

In [0]:
# Hold out 30% of the training rows as a validation set.
# NOTE: X_train / y_train are rebound to the remaining 70%, so re-running this
# cell on its own shrinks the training set again.
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=3,
    shuffle=True,
)

# TensorBoard

In [0]:
# Fetch and unpack the ngrok binary used to expose TensorBoard.
# -nc: skip the download when the archive is already present (a re-run would
#      otherwise save a duplicate `.zip.1`).
# -o : overwrite the extracted binary without prompting, so a re-run of this
#      cell does not stop on unzip's interactive "replace?" question.
!wget -nc https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -o ngrok-stable-linux-amd64.zip

In [0]:
# Start TensorBoard in the background, reading event files from LOG_DIR and
# listening on port 6006.
LOG_DIR = './outdir'
tensorboard_command = 'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'.format(LOG_DIR)
get_ipython().system_raw(tensorboard_command)

In [0]:
# Run ngrok in the background, tunnelling HTTP traffic to local port 6006.
# Expects the `ngrok` binary to be present in the current working directory.
get_ipython().system_raw('./ngrok http 6006 &')

In [0]:
# Query the local API on port 4040 for its tunnel list and print the public
# URL of the first tunnel. If no tunnel is listed yet, the indexing in the
# Python one-liner fails; in that case run this cell again.
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

In [0]:
from keras.callbacks import TensorBoard

# Keras TensorBoard callback writing to the same directory TensorBoard reads.
# NOTE(review): this is a Keras callback object; confirm that whatever consumes
# it actually accepts Keras callbacks.
tensorboard_options = dict(
    log_dir='./outdir',
    histogram_freq=1,
    batch_size=128,
    write_graph=True,
    write_grads=True,
    write_images=True,
)
tbCallBack = TensorBoard(**tensorboard_options)

# TensorFlow 

In [0]:
# Second approach for input pipelines

def make_input_fn(X, y, num_epochs):
  """Build a shuffled, batched input function for training or evaluation.

  Args:
    X: features, forwarded to `pandas_input_fn` as `x`.
    y: labels, forwarded to `pandas_input_fn` as `y`.
    num_epochs: number of passes over the data.

  Returns:
    The result of `tf.estimator.inputs.pandas_input_fn` configured with a
    batch size of 128, shuffling enabled, a queue capacity of 1000 and a
    single reader thread.
  """
  input_options = dict(
      batch_size=128,
      num_epochs=num_epochs,
      shuffle=True,
      queue_capacity=1000,
      num_threads=1,
  )
  return tf.estimator.inputs.pandas_input_fn(x=X, y=y, **input_options)

def make_prediction_input_fn(X, num_epochs):
  """Build a batched, label-free input function for prediction.

  Shuffling is disabled on purpose: predictions are consumed positionally,
  so they must come back in the same row order as `X`. With shuffling on,
  each prediction would be paired with an arbitrary row.

  Args:
    X: features, forwarded to `pandas_input_fn` as `x`.
    num_epochs: number of passes over the data (use 1 to get exactly one
      prediction per row).

  Returns:
    The result of `tf.estimator.inputs.pandas_input_fn` configured with a
    batch size of 128, no labels and no shuffling.
  """
  return tf.estimator.inputs.pandas_input_fn(
    x = X,
    y = None,
    batch_size = 128,
    num_epochs = num_epochs,
    shuffle = False,  # keep row order aligned with X
    queue_capacity = 1000,
    num_threads = 1
  )

def make_feature_cols():
    """Return the feature columns fed to the estimator.

    'department' and 'recruitment_channel' are integer-coded categoricals and
    are one-hot encoded through indicator columns; the remaining seven inputs
    are plain numeric columns. 'education' and 'gender' are not part of the
    returned list.

    Returns:
        A list of nine `tf.feature_column` objects.
    """
    fc = tf.feature_column
    return [
        fc.indicator_column(fc.categorical_column_with_vocabulary_list(
            key = 'department', vocabulary_list=[0,1,2,3,4,5,6,7,8], default_value = 0)),
        fc.indicator_column(fc.categorical_column_with_vocabulary_list(
            key = 'recruitment_channel', vocabulary_list=[0,1,2])),
        fc.numeric_column("no_of_trainings"),
        fc.numeric_column("age"),
        fc.numeric_column("previous_year_rating"),
        fc.numeric_column("length_of_service"),
        fc.numeric_column("KPIs_met_80"),
        fc.numeric_column("awards_won"),
        fc.numeric_column("avg_training_score"),
    ]

In [0]:
# Show INFO-level TensorFlow log messages.
tf.logging.set_verbosity(tf.logging.INFO)

# Model/checkpoint directory; wiped so every run starts fresh.
output_dir = './outdir'
shutil.rmtree(output_dir, ignore_errors=True)

# Session configuration asking TensorFlow to grow GPU memory on demand.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

def metric_auc(labels, predictions):
    """Extra evaluation metric: area under the precision-recall curve.

    Args:
        labels: ground-truth labels passed in by the estimator.
        predictions: prediction dict; only its 'logistic' entry is read.

    Returns:
        A dict with the single key 'auc_precision_recall' mapped to the
        result of `tf.metrics.auc` (PR curve, 200 thresholds, careful
        interpolation).
    """
    pr_auc = tf.metrics.auc(
        labels=labels,
        predictions=predictions['logistic'],
        num_thresholds=200,
        curve='PR',
        summation_method='careful_interpolation',
    )
    return {'auc_precision_recall': pr_auc}

  
optimizer = tf.train.AdamOptimizer()

# Hand the session configuration (GPU memory growth) to the estimator;
# without a RunConfig the `config` object built above is never used.
run_config = tf.estimator.RunConfig(session_config = config)

model = tf.estimator.DNNClassifier(hidden_units = [8, 6, 4, 3],
      feature_columns = make_feature_cols(), optimizer = optimizer,
      model_dir = output_dir, config = run_config)
# Attach the custom PR-AUC metric so it is reported by model.evaluate().
model = tf.contrib.estimator.add_metrics(model, metric_auc)

num_epochs = int(input("Enter number of epochs:"))
with tf.device('/gpu:0'):
  # Estimator.train() has no `callbacks` parameter (it takes `hooks`), so a
  # Keras callback cannot be passed here. The estimator already writes its
  # summaries to `model_dir`, which is where TensorBoard is pointed.
  model.train(input_fn = make_input_fn(X = X_train, y = y_train, num_epochs = num_epochs))

In [0]:
def print_rmse(model, name, X, y):
  """Evaluate `model` on (X, y) for one epoch and print two summary numbers.

  Prints the square root of the reported 'average_loss' (labelled "RMSE") and
  the F1 score derived from the reported 'precision' and 'recall'.
  NOTE(review): for a classifier 'average_loss' is presumably a cross-entropy
  rather than a squared error, so the "RMSE" label is nominal - confirm.

  Args:
    model: object exposing `evaluate(input_fn=...)` that returns a dict with
      the keys 'average_loss', 'precision' and 'recall'.
    name: dataset label used in the printed messages.
    X: features passed to `make_input_fn`.
    y: labels passed to `make_input_fn`.
  """
  metrics = model.evaluate(input_fn = make_input_fn(X, y, 1))
  print('RMSE on {} dataset = {}'.format(name, np.sqrt(metrics['average_loss'])))

  precision = metrics['precision']
  recall = metrics['recall']
  denominator = precision + recall
  # With no true positives both precision and recall are 0; report F1 as 0
  # instead of dividing by zero.
  f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0
  print('f1_score on {} dataset = {}'.format(name, f1))
# Report the metrics on the held-out validation split.
print_rmse(model=model, name='validation', X=X_valid, y=y_valid)

**Prediction time:**

In [0]:
# `model.predict` is consumed as an iterable of per-row dicts; keep the first
# logit of every row, in the order the rows are yielded.
predictions = model.predict(input_fn = make_prediction_input_fn(X_test, 1))
pred = [row["logits"][0] for row in predictions]

In [0]:
# Creating Submission file
from google.colab import files

employee_id = np.asarray(df_test.employee_id)

# Turn the collected per-row scores in `pred` into 0/1 labels. A logit above 0
# corresponds to a predicted probability above 0.5.
# NOTE(review): the original referenced an undefined name (`ans1`); the 0.5
# decision threshold used here is an assumption - adjust if another cut-off
# was intended.
is_promoted = (np.asarray(pred) > 0).astype(int)

df = pd.DataFrame({'employee_id': employee_id, 'is_promoted': is_promoted})

df.to_csv(path_or_buf = 'predictions_hack_education_included.csv', index = False, sep = ',')
files.download('predictions_hack_education_included.csv')