In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler

In [None]:
# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau

In [None]:
# Load the 'Headcount' sheet and keep only the rows whose Status is 'Billable'
# (billable data shows the actual demand of past years).
headcount = pd.read_excel('../input/Demandv1.1.xlsx', sheet_name='Headcount')
headcount = headcount.loc[headcount['Status'].eq('Billable')]

In [None]:
# Convert each employee's SkillList string into one-hot technology columns.
# NOTE(review): `split` imported from `re` is not used in this cell; the split
# below is pandas' `.str.split`.
from re import split
# The pattern splits on a comma (and any whitespace after it) unless that comma
# is followed by non-parenthesis characters and then ')', i.e. commas inside
# "(...)" do not split. expand=True yields one column per piece and stack()
# turns them into a single Series indexed by (Employee Code, piece position).
cleaned = headcount.set_index('Employee Code').SkillList.str.split(r',\s*(?![^()]*\))', expand=True).stack()
# One indicator column per distinct skill string, summed over index level 0
# (Employee Code) so that each employee ends up with a single row.
one_hot_coded_df = pd.get_dummies(cleaned).groupby(level=0).sum()
one_hot_coded_df.head()

In [None]:
# making index as Employee code for main data frame
head_df = pd.read_excel('../input/Demandv1.1.xlsx',sheet_name='Headcount',index_col='Employee Code')

In [None]:
# merging the two data frames on their Employee Code index
merged_df = pd.merge(head_df,one_hot_coded_df,left_index=True,right_index=True)

In [None]:
# removing index and unwanted columns
removed_index = merged_df.reset_index(drop=True)

In [None]:
# Drop the listed columns and index the remaining rows by joining date.
final_df = removed_index.drop(['Region','  Last Name','Status','Market Unit','SkillList'],axis=1).set_index('Local Date of Joining')
# Year and month are read from the index.
# NOTE(review): assumes 'Local Date of Joining' was parsed as datetimes -- confirm.
final_df['year'] = final_df.index.year
final_df['month']=final_df.index.month
# Discard the joining-date index, then sum per (year, month, Location, Designation).
grouped_df = final_df.reset_index(drop=True).groupby(['year','month','Location','Designation']).sum()

In [None]:
def predict_loc_tech(location,technology,desgination,source_df=None):
    """
    Build the monthly series of one technology column, optionally filtered.

    :param location: Value of the 'Location' index level to keep. None (or any
        other falsy value) means no location filter.
    :param technology: Name of the column to return.
    :param desgination: Value of the 'Designation' index level to keep. None
        (or any other falsy value) means no designation filter.
    :param source_df: Frame indexed by (year, month, Location, Designation).
        Defaults to the notebook-level ``grouped_df``. It is never modified.
    :return: Single-column DataFrame (``technology``) indexed by month-end
        dates, holding the monthly sum for rows with year >= 2010.
    """
    if source_df is None:
        source_df = grouped_df

    # Combine whichever filters were supplied into a single row mask, so every
    # combination of given / missing filters takes the same code path.
    keep = np.ones(len(source_df), dtype=bool)
    if location:
        keep &= source_df.index.get_level_values('Location') == location
    if desgination:
        keep &= source_df.index.get_level_values('Designation') == desgination

    # reset_index() and assign() both return new frames, so neither the source
    # frame nor a slice of it is written to.
    location_df = source_df[keep].reset_index()
    location_df = location_df[location_df.year >= 2010].assign(day=1)
    location_df = location_df.assign(
        Date=pd.to_datetime(location_df[['year','month','day']]))

    # Select the requested column before resampling so that only it is summed.
    # MonthEnd() is the offset behind the 'M' alias, giving month-end bins.
    loc_tech_df = (location_df.set_index('Date')[[technology]]
                   .resample(pd.offsets.MonthEnd())
                   .sum())
    return loc_tech_df

In [None]:
# Monthly 'Java' series with no location or designation filter (both None).
# NOTE(review): the trailing number on the next line is an original note whose
# meaning is not evident from this notebook.
find_df = predict_loc_tech(None,'Java',None) #4.891495
# Apply log(1 + x) to every value; np.expm1 is the inverse transform.
find_df = find_df.apply(np.log1p)

In [None]:
find_df.sort_index(ascending=True,inplace=True)
find_df.head()

In [None]:
plt.figure(figsize=(15, 5));
plt.plot(find_df, color='red', label='Java')
plt.show()

In [None]:
# Number of rows (monthly steps) by which the target is shifted.
shift_months = 12
# Name of the target column. NOTE: this is a single string, not a list of names.
target_names = 'Java'
# A negative shift moves later values up, so each row's target is the value
# shift_months rows later; the last shift_months targets are NaN.
df_targets = find_df[target_names].shift(-shift_months)

In [None]:
# Hold out the final shift_months rows: these are the rows whose shifted
# targets are NaN. Tied to shift_months rather than a second hard-coded 12.
test_df = find_df.tail(shift_months)

In [None]:
# Everything except the final shift_months rows. The cut must equal
# shift_months so the inputs line up row-for-row with the shifted targets.
traning_df = find_df[:-shift_months]

In [None]:
x_data = traning_df.values
print(x_data.shape)

In [None]:
# Drop the trailing NaN targets produced by the shift and make the result a
# column vector. reshape(-1, 1) infers the row count instead of hard-coding it,
# so the cell keeps working when the amount of data changes.
y_data = df_targets.values[:-shift_months].reshape(-1, 1)
print(y_data.shape)

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_val,y_train,y_val = train_test_split(x_data,y_data,test_size = 0.1,shuffle=False)

In [None]:
# Number of input features
num_x_signals = x_data.shape[1]
num_x_signals

In [None]:
# number of target
num_y_signals = y_data.shape[1]
num_y_signals

In [None]:
# Scaling is currently disabled: the "scaled" names below are plain aliases of
# the unscaled arrays. The MinMaxScaler lines are kept commented out for reference.
# x_scalerx_scaler = MinMaxScaler()
# x_train_scaled = x_scalerx_scaler.fit_transform(x_train)
x_train_scaled =x_train
# NOTE(review): if scaling is re-enabled, use .transform(x_val) here; calling
# fit_transform again would refit the scaler on the validation data.
# x_scaled_val = x_scalerx_scaler.fit_transform(x_val)
x_scaled_val = x_val

In [None]:
print(x_train_scaled.shape)
print(y_train.shape)

In [None]:
def batch_generator(batch_size, sequence_length):
    """
    Generator function for creating random batches of training-data.

    Reads the notebook-level ``x_train_scaled``, ``y_train``,
    ``num_x_signals`` and ``num_y_signals``.

    :param batch_size: Number of sequences in each batch.
    :param sequence_length: Number of consecutive time-steps per sequence.
    :return: Yields ``(x_batch, y_batch)`` forever, with shapes
        ``(batch_size, sequence_length, num_x_signals)`` and
        ``(batch_size, sequence_length, num_y_signals)``.
    :raises ValueError: When a batch is requested and ``sequence_length``
        exceeds the number of training rows.
    """

    # Infinite loop.
    while True:
        # Number of valid start positions. The "+ 1" makes the last full
        # window reachable, because np.random.randint excludes its upper bound.
        num_starts = len(x_train_scaled) - sequence_length + 1
        if num_starts < 1:
            raise ValueError(
                "sequence_length (%d) is longer than the training data (%d rows)"
                % (sequence_length, len(x_train_scaled)))

        # Allocate a new array for the batch of input-signals.
        x_shape = (batch_size, sequence_length, num_x_signals)
        x_batch = np.zeros(shape=x_shape, dtype=np.float16)

        # Allocate a new array for the batch of output-signals.
        y_shape = (batch_size, sequence_length, num_y_signals)
        y_batch = np.zeros(shape=y_shape, dtype=np.float16)

        # Fill the batch with random sequences of data.
        for i in range(batch_size):
            # Random start-index somewhere in the training-data.
            idx = np.random.randint(num_starts)

            # Copy the sequences of data starting at this index.
            x_batch[i] = x_train_scaled[idx:idx+sequence_length]
            y_batch[i] = y_train[idx:idx+sequence_length]

        yield (x_batch, y_batch)

In [None]:
# Batch size and sequence length passed to the batch generator.
batch_size = 15
sequence_length= 12*1 # 12 time-steps per sequence (12*1), not four years' worth

In [None]:
generator = batch_generator(batch_size=batch_size,
                            sequence_length=sequence_length)

In [None]:
x_batch, y_batch = next(generator)

In [None]:
validation_data = (np.expand_dims(x_scaled_val, axis=0),
                   np.expand_dims(y_val, axis=0))

In [None]:
# Single GRU layer that emits one output vector per time-step, followed by a
# dense layer that maps each step to the target signal(s).
model = Sequential()
model.add(GRU(units=256,
              return_sequences=True,
              input_shape=(None, num_x_signals,)))
# Linear output: the targets are unscaled log1p values (no scaler is applied
# to y), so they are not confined to (-1, 1) and a tanh output could never
# reach targets above 1.
model.add(Dense(num_y_signals, activation='linear'))

In [None]:
optimizer = RMSprop(lr=1e-3)

In [None]:
def loss_mse_warmup(y_true, y_pred):
    """
    Mean Squared Error between the desired and the predicted output.

    NOTE: despite the function name, no "warmup" part of the sequences is
    excluded here -- every time-step that is passed in contributes to the loss.

    :param y_true: The desired output.
    :param y_pred: The model's output.
    :return: The loss reduced to a single scalar tensor.
    """
    mse = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)

    # Whatever shape the MSE op produced, collapse it with a mean so the
    # loss handed to Keras is one scalar for the entire tensor.
    return tf.reduce_mean(mse)

In [None]:
model.compile(loss=loss_mse_warmup,
              optimizer=optimizer)

In [None]:
model.summary()

In [None]:
# Callbacks used during training; all of them monitor the validation loss.

# Write the model weights (weights only) to path_checkpoint, keeping only the
# best val_loss seen so far.
path_checkpoint = '23_checkpoint.keras'
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_weights_only=True,
                                      save_best_only=True)
# Stop training once val_loss has not improved for 5 epochs.
callback_early_stopping  = EarlyStopping(monitor='val_loss',
                                        patience=5, verbose=1)
# Multiply the learning rate by 0.1 after 2 epochs without improvement, never
# going below 1e-4. Starting from the optimizer's 1e-3, that allows a single
# reduction.
callback_reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.1,
                                       min_lr=1e-4,
                                       patience=2,
                                       verbose=1)
# Log to ./23_logs/ for TensorBoard; histograms and the graph are not written.
callback_tensorboard = TensorBoard(log_dir='./23_logs/',
                                   histogram_freq=0,
                                   write_graph=False)

In [None]:
callbacks = [callback_early_stopping,
             callback_checkpoint,
             callback_tensorboard,
             callback_reduce_lr]

In [None]:
%%time
model.fit_generator(generator=generator,
                    epochs=50,
                    steps_per_epoch=100,
                    validation_data=validation_data,
                    callbacks=callbacks)

In [None]:
result = model.evaluate(x=np.expand_dims(x_scaled_val, axis=0),
                        y=np.expand_dims(y_val, axis=0))

In [None]:
print("loss (test-set):", result)

In [None]:
def plot_comparison(start_idx, length=100, train=True, warmup=None):
    """
    Plot the predicted and true output-signals.

    Reads the notebook-level ``model``, ``target_names`` and the
    training / validation arrays.

    :param start_idx: Start-index for the time-series.
    :param length: Sequence-length to process and plot.
    :param train: Boolean whether to use training- or test-set.
    :param warmup: Number of leading steps to shade as the warmup-period.
        None (default) uses the notebook-level ``warmup_steps`` if it is
        defined, otherwise no shading is drawn.
    """

    if train:
        # Use training-data.
        x = x_train_scaled
        y_true = y_train
    else:
        # Use test-data.
        x = x_scaled_val
        y_true = y_val

    # End-index for the sequences.
    end_idx = start_idx + length

    # Select the sequences from the given start-index and
    # of the given length.
    x = x[start_idx:end_idx]
    y_true = y_true[start_idx:end_idx]

    # Input-signals for the model, with a leading batch axis.
    x = np.expand_dims(x, axis=0)
    print(x.shape)
    # Use the model to predict the output-signals.
    y_pred = model.predict(x)
    print(y_pred)

    # No inverse scaling is applied here: take the single sequence of the
    # batch as it is.
    y_pred_rescaled = y_pred[0]

    # target_names may be one column name (a str) or a sequence of names.
    # Iterating over len() of a str would loop once per character, so
    # normalise it to a list first.
    if isinstance(target_names, str):
        names = [target_names]
    else:
        names = list(target_names)

    # Fall back to the notebook-level warmup_steps only if it exists, so a
    # missing definition does not raise NameError while plotting.
    if warmup is None:
        warmup = globals().get('warmup_steps', 0)

    # For each output-signal.
    for signal, name in enumerate(names):
        # Get the output-signal predicted by the model.
        signal_pred = y_pred_rescaled[:, signal]
        print(signal_pred)

        # Get the true output-signal from the data-set.
        signal_true = y_true[:, signal]
        print(signal_true)

        # Make the plotting-canvas bigger.
        plt.figure(figsize=(15,5))

        # Plot and compare the two signals.
        plt.plot(signal_true, label='true')
        plt.plot(signal_pred, label='pred')

        # Plot grey box for warmup-period, when there is one.
        if warmup:
            plt.axvspan(0, warmup, facecolor='black', alpha=0.15)

        # Plot labels etc.
        plt.ylabel(name)
        plt.legend()
        plt.show()