[View in Colaboratory](https://colab.research.google.com/github/shravankumar9892/war_of_wizards/blob/master/wns_interview.ipynb)

# Authentication + fetching data

In [0]:
# Do not give up, if it was easy everybody will do it. Hard makes it great!

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.python import keras # Use keras.layers. to make the neural network
import shutil

In [0]:
# Authenticate
# Sign the Colab user in, then give PyDrive the application-default
# credentials so that `drive` can be used to fetch files from Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Download the train and test CSVs from Google Drive (addressed by file id)
# into the working directory of the runtime.
train_downloaded = drive.CreateFile({'id':'1pDUDjhpnynkRjxRE22nsKdiECRT7MD1K'})
train_downloaded.GetContentFile('train_LZdllcl.csv')
test_downloaded = drive.CreateFile({'id':'1zmn8HGvvIg3uUVv0Q7R3vsK-kLPk9OnZ'})
test_downloaded.GetContentFile('test_2umaH9m.csv')

# Load both files as DataFrames; the first CSV line is used as the header.
df_train = pd.read_csv('train_LZdllcl.csv')
df_test = pd.read_csv('test_2umaH9m.csv')

# Preprocessing + splitting

In [0]:
# Category vocabularies observed in the test data (used later by the
# TensorFlow feature columns).
departments = list(np.unique(np.asarray(df_test.department))) #  9 categories
#education_with_nan = np.unique(df_test.education)  # NaN present
recruitment_channel = list(np.unique(df_test.recruitment_channel)) # 3 categories

# NaN values in test data
# The original check here was a discarded boolean expression that mixed
# len(df_train) with a df_test column. Count the missing values directly
# instead (the original expected 1812 and 1034 -- TODO confirm).
test_nan_counts = df_test[['previous_year_rating', 'education']].isna().sum()

# Single source of truth for the model inputs, shared by train and test.
FEATURE_COLUMNS = ['department', 'education', 'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'KPIs_met >80%', 'awards_won?', 'avg_training_score']

y_train = df_train['is_promoted']
# .copy() gives independent frames, so the column assignments performed by the
# preprocessing cells neither touch df_train/df_test nor trigger
# SettingWithCopyWarning.
X_train = df_train[FEATURE_COLUMNS].copy()
X_test = df_test[FEATURE_COLUMNS].copy()

In [0]:
# Preprocessing the data
import sklearn
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns.
# One encoder per column is fitted on the union of the train and test values,
# so a given category always maps to the same integer in both frames.
# (Fitting separately on each frame, as before, only gives matching codes when
# both frames happen to contain exactly the same set of categories.)
# astype('str') is kept from the original; presumably it turns missing
# `education` values into their own string category -- TODO confirm with the
# pandas version in use.
for column in ['department', 'recruitment_channel', 'education']:
  train_values = X_train[column].astype('str')
  test_values = X_test[column].astype('str')

  le = LabelEncoder()
  le.fit(pd.concat([train_values, test_values]))

  X_train[column] = le.transform(train_values)
  X_test[column] = le.transform(test_values)


In [0]:
# As of now dropping off education
#X_train.drop(columns = 'education', inplace = True)
#X_test.drop(columns = 'education', inplace = True)

# Filling in NaN values
# Forward-fill first, then back-fill so that a NaN in the very first row
# (which has no previous value to copy) is imputed as well.
# `.ffill()` replaces the deprecated `fillna(method = 'ffill')`, and rebinding
# the names avoids an in-place write on what may be a slice of another frame.
X_train = X_train.ffill().bfill()
X_test = X_test.ffill().bfill()

In [0]:
# Create validation data

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for validation; the fixed random_state makes the
# shuffle reproducible.
# NOTE(review): X_train / y_train are rebound to the remaining 70% here, so
# re-running this cell alone splits the already-reduced training set again.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.3, random_state = 3, shuffle = True)

# sklearn trees

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import f1_score, make_scorer
# Wrap F1 as a scorer object so it can be passed as `scoring` below.
score = make_scorer(f1_score, greater_is_better = True)
# Random-forest alternative, currently disabled (so `estimator` is NOT defined
# by this cell).
#estimator = RandomForestClassifier(n_estimators = 200, max_depth = 40, oob_score = True)
#estimator.fit(X_train, y_train)

# L2-regularised logistic regression with built-in cross-validation, scored
# with the F1 scorer defined above.
lr = LogisticRegressionCV(penalty = 'l2', random_state = 3, scoring = score)
lr.fit(X_train, y_train)

In [0]:
# y_hat contains the predictions over the validation data
y_hat = lr.predict(X_valid)
# Score the validation predictions against the validation labels, in the
# (y_true, y_pred) order that f1_score expects. The previous version compared
# them with the training labels, which have a different length.
f1_score(y_valid, y_hat)

In [0]:
# Number of validation predictions (a stray token that raised NameError was
# removed from this cell).
y_hat.shape

In [0]:
# F1 score (in percent) over the training data
from sklearn.metrics import f1_score
# f1_score takes (y_true, y_pred): compare the training labels with the
# model's predictions for the training features, not with the features.
f1_score(y_train, lr.predict(X_train))*100

In [0]:
# Over validation data
# NOTE(review): `lr` was constructed with a custom `scoring` object, so this
# is presumably that metric (F1) in percent rather than accuracy -- confirm.
lr.score(X_valid, y_valid)*100

In [0]:
# Prediction

# The first test row (index label 0) is excluded from prediction and given a
# hard-coded label below -- presumably because its NaN could not be
# forward-filled; TODO confirm.
X_test = X_test.loc[1:, :]
# Use the fitted logistic-regression model `lr`; the `estimator` name used
# previously is only defined in commented-out code and raised NameError.
pred = lr.predict(X_test)

# NaN
pred = np.insert(pred, 0, 1) # Assuming promotion is possible for 1st row

In [0]:
# Creating Submission file

# Pair every test employee id with its predicted label; `pred` must therefore
# have exactly one entry per row of df_test.
employee_id = np.asarray(df_test.employee_id)
df = pd.DataFrame({'employee_id':employee_id, 'is_promoted':pred})

# Write the CSV without the index column and trigger a browser download from
# the Colab runtime.
df.to_csv(path_or_buf = 'predictions_hack_education_included.csv', index = False, sep = ',')
from google.colab import files
files.download('predictions_hack_education_included.csv')

# NEURAL NETWORKS

In [0]:
# Creating input pipelines

def decode_line(row, has_label = True):
  """Parse one CSV text line into the feature dict used by the estimators.

  Args:
    row: a single line of the CSV file.
    has_label: when True (default, original behaviour) the line is expected to
      carry a 14th column holding the label; when False only the first 13
      columns are expected and no label is returned.

  Returns:
    `(features, label)` when `has_label` is True, otherwise just `features`.
    `features` maps column names to the decoded values of columns 1 and 5-12;
    columns 0, 2, 3 and 4 are parsed but not used.
  """
  # One default per CSV column; the defaults also fix each column's dtype.
  record_defaults = [[65438], ['Sales & Marketing'], ['region_7'], ['Master\'s & above'], ['f'], ['sourcing'], [1], [35], [5.0], [8], [1], [0], [49]]
  if has_label:
    record_defaults.append([0])

  cols = tf.decode_csv(row, record_defaults = record_defaults)
  features = {
      'department':cols[1],
      'recruitment_channel':cols[5],
      'no_of_trainings':cols[6],
      'age':cols[7],
      'previous_year_rating':cols[8],
      'length_of_service':cols[9],
      'KPIs_met >80%':cols[10],
      'awards_won?':cols[11],
      'avg_training_score':cols[12]
  }
  if not has_label:
    return features
  label = cols[13]
  return features, label

def _csv_lines(filename):
  """Return the lines of `filename` as a dataset, without the header row."""
  # The files are read with a header by pd.read_csv elsewhere in the notebook,
  # so the first line holds column names and cannot be decoded as data.
  return tf.data.TextLineDataset(filename).skip(1)

# Training data
def input_train_fn():
  """Input pipeline over the training CSV: shuffled, 15 epochs, batches of 128."""
  return tf.data.Dataset.list_files("train_LZdllcl.csv")\
                        .flat_map(_csv_lines)\
                        .map(decode_line)\
                        .shuffle(1000)\
                        .repeat(15)\
                        .batch(128)\
                        .make_one_shot_iterator()\
                        .get_next()

# Test data
def input_test_fn():
  """Input pipeline over the test CSV: a single ordered pass in batches of 128.

  The test file is assumed to have no label column (labels are only ever read
  from the training frame in this notebook) -- TODO confirm. The data is
  neither shuffled nor repeated so that each row yields exactly one
  prediction, in file order.
  """
  return tf.data.Dataset.list_files("test_2umaH9m.csv")\
                        .flat_map(_csv_lines)\
                        .map(lambda row: decode_line(row, has_label = False))\
                        .batch(128)\
                        .make_one_shot_iterator()\
                        .get_next()

**Model**

In [0]:
# Adam optimiser with an explicit learning rate; the remaining arguments are
# spelled out for visibility.
optimizer = tf.train.AdamOptimizer(
    learning_rate=0.01,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-08,
    use_locking=False,
    name='Adam'
)
# Feature columns for the CSV input pipeline. Both categorical vocabularies
# come from the lists computed from the data (`departments`,
# `recruitment_channel`) rather than a hand-typed list, which previously
# contained an empty string in place of the third recruitment channel.
featcols = [
    tf.feature_column.categorical_column_with_vocabulary_list(key = 'department', vocabulary_list=departments),
    tf.feature_column.categorical_column_with_vocabulary_list(key = 'recruitment_channel', vocabulary_list=recruitment_channel),
    tf.feature_column.numeric_column("no_of_trainings"),
    tf.feature_column.numeric_column("age"),
    tf.feature_column.numeric_column("previous_year_rating"),
    tf.feature_column.numeric_column("length_of_service"),
    tf.feature_column.numeric_column("KPIs_met >80%"),
    tf.feature_column.numeric_column("awards_won?"),
    tf.feature_column.numeric_column("avg_training_score")
]
# NOTE(review): the target is a 0/1 label, so a classifier may be a better fit
# than a regressor here -- left unchanged pending confirmation.
model = tf.estimator.LinearRegressor(featcols, './outdir', optimizer = optimizer)

In [0]:
# Remove the two columns that the DNN feature columns do not consume.
# They are dropped from every frame that is fed to the estimator so that
# training, validation and prediction inputs share one schema.
# errors = 'ignore' plus rebinding keeps the cell idempotent: the previous
# in-place drop raised KeyError when the cell was run a second time.
UNUSED_NN_COLUMNS = ['KPIs_met >80%', 'awards_won?']
X_train = X_train.drop(columns = UNUSED_NN_COLUMNS, errors = 'ignore')
X_valid = X_valid.drop(columns = UNUSED_NN_COLUMNS, errors = 'ignore')
X_test = X_test.drop(columns = UNUSED_NN_COLUMNS, errors = 'ignore')

# Dataset from Pandas DataFrame

In [0]:
# Second approach for input pipelines

def make_input_fn(X, y, num_epochs):
  """Build a shuffled, batched input function from pandas features and labels.

  Args:
    X: features, forwarded as `x` to `pandas_input_fn`.
    y: labels, forwarded as `y`.
    num_epochs: number of passes over the data.

  Returns:
    The input function produced by `tf.estimator.inputs.pandas_input_fn`.
  """
  reader_settings = {
      'batch_size': 128,
      'num_epochs': num_epochs,
      'shuffle': True,
      'queue_capacity': 1000,
      'num_threads': 1,
  }
  build_input_fn = tf.estimator.inputs.pandas_input_fn
  return build_input_fn(x = X, y = y, **reader_settings)

def make_prediction_input_fn(X, num_epochs):
  """Build a label-free, ordered input function for prediction.

  Args:
    X: features, forwarded as `x` to `pandas_input_fn`.
    num_epochs: number of passes over the data (use 1 to get exactly one
      prediction per row).

  Returns:
    The input function produced by `tf.estimator.inputs.pandas_input_fn`.
  """
  return tf.estimator.inputs.pandas_input_fn(
    x = X,
    y = None,
    batch_size = 128,
    num_epochs = num_epochs,
    # Must stay False: with shuffling the predictions come back in a random
    # order and can no longer be matched to the rows of X.
    shuffle = False,
    queue_capacity = 1000,
    num_threads = 1
  )

def make_feature_cols():
    """Return the feature columns for the pandas-based input pipeline.

    `department` and `recruitment_channel` are expected to be integer-encoded
    already and are wrapped as indicator columns over the vocabularies 0-8 and
    0-2; the remaining columns are plain numeric columns.
    """
    return [tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(key = 'department', vocabulary_list=[0,1,2,3,4,5,6,7,8], default_value = 0)),
            tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(key = 'recruitment_channel', vocabulary_list=[0,1,2])),
            tf.feature_column.numeric_column("no_of_trainings"),
            tf.feature_column.numeric_column("age"),
            tf.feature_column.numeric_column("previous_year_rating"),
            tf.feature_column.numeric_column("length_of_service"),
            #tf.feature_column.numeric_column("KPIs_met >80%"),
            #tf.feature_column.numeric_column("awards_won?"),
            tf.feature_column.numeric_column("avg_training_score")
           ]

In [0]:
tf.logging.set_verbosity(tf.logging.INFO)
output_dir = './outdir'
# Number of passes over the training data. Kept as a constant instead of an
# interactive input() prompt so the notebook survives "Restart & Run All" and
# the run is reproducible from the code alone. Tune as needed.
NUM_EPOCHS = 15

optimizer=tf.train.AdamOptimizer()

shutil.rmtree(output_dir, ignore_errors = True) # start fresh each time
#model = tf.estimator.LinearClassifier(
#      feature_columns = make_feature_cols(), model_dir = output_dir, optimizer = optimizer, n_classes = 2)
# Small fully-connected classifier: two hidden layers of 6 units each.
model = tf.estimator.DNNClassifier(hidden_units = [6, 6],
      feature_columns = make_feature_cols(), optimizer = optimizer, model_dir = output_dir)
model.train(input_fn = make_input_fn(X = X_train, y = y_train, num_epochs = NUM_EPOCHS))

In [0]:
def print_rmse(model, name, X, y):
  """Evaluate `model` on (X, y) for one epoch and print RMSE and F1.

  Args:
    model: an estimator exposing `evaluate(input_fn=...)` that returns a dict
      with the keys 'average_loss', 'precision' and 'recall'.
    name: label of the dataset, used only in the printed text.
    X: features passed to `make_input_fn`.
    y: labels passed to `make_input_fn`.
  """
  metrics = model.evaluate(input_fn = make_input_fn(X, y, 1))
  print('RMSE on {} dataset = {}'.format(name, np.sqrt(metrics['average_loss'])))

  precision = metrics['precision']
  recall = metrics['recall']
  denominator = precision + recall
  # F1 is the harmonic mean 2PR / (P + R); the factor 2 was missing before.
  # Report 0 when both precision and recall are 0 instead of dividing by zero.
  f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0
  print('f1_score on {} dataset = {}'.format(name, f1))
# Report the metrics of the trained model on the held-out validation split.
print_rmse(model, 'validation', X_valid, y_valid)

**Prediction Time :**

In [0]:
# Run the trained model over the test features for a single epoch.
# NOTE(review): `predictions` is presumably a lazy generator that can be
# consumed only once -- confirm before iterating over it more than once.
predictions = model.predict(input_fn = make_prediction_input_fn(X_test, 1))
predictions

In [0]:
# Materialise the predictions into a DataFrame, one row per prediction.
# Each prediction is expected to be a dict, so building the frame from the
# list of records yields one column per prediction key.
# pd.DataFrame() has no `axis` keyword; passing it raised TypeError.
ans = pd.DataFrame(list(predictions))

In [0]:
# Display the prediction frame built in the previous cell.
ans