In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# Any results you write to the current directory are saved as output.

In [None]:
#Importing libraries

import pandas as pd 
import numpy as np
import seaborn as sns

# Scikit-learn library: For Random Forests
import time
import random
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from urllib.request import urlopen 

# Matplotlib library to plot the charts
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
%matplotlib inline

# Notebook-wide display defaults: allow up to 500 DataFrame columns to be
# shown and use the ggplot style for matplotlib figures.
pd.options.display.max_columns = 500
plt.style.use('ggplot')

In [None]:
# Load the train and test CSV files. read_csv already returns a DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
train = pd.read_csv('../input/ee-769-assignment1/train.csv')
test = pd.read_csv('../input/ee-769-assignment1/test.csv')

In [None]:
# Separate the features (X) from the target (y). EmployeeCount is removed
# from both the training features and the test frame.
X = train.drop(columns=["Attrition", "EmployeeCount"])
y = train["Attrition"]
# `test` is overwritten with its own result, so tolerate the column already
# being gone: re-running this cell would otherwise raise a KeyError.
test = test.drop(columns="EmployeeCount", errors="ignore")

In [None]:
# Convert the categorical (object) columns to numeric indicator columns with
# pandas.get_dummies.
categorical_cols = ["Gender", "BusinessTravel", "Department", "EducationField",
                    "JobRole", "MaritalStatus", "OverTime"]
X = pd.get_dummies(X, columns=categorical_cols)
# Train and test are encoded independently, so a category that occurs in only
# one of them would leave the two frames with different columns. Re-align the
# test frame to the training columns (missing indicators become 0, unexpected
# ones are dropped) so prediction always receives the layout used for fitting.
X_test = pd.get_dummies(test, columns=categorical_cols).reindex(columns=X.columns, fill_value=0)

In [None]:
# Base random forest classifier. random_state is pinned to 42 so the
# estimator's own randomness is reproducible between runs.
fit_rf = RandomForestClassifier(random_state=42)

In [None]:
# Hold out 20% of the labelled rows as a validation set and keep the other 80%
# for training; random_state=42 makes the split repeatable.
# NOTE(review): X_validation / y_validation are not referenced again in the
# visible part of this notebook.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
np.random.seed(42)
start = time.time()

# Hyper-parameter grid that is searched exhaustively below.
# * n_estimators lists two discrete values (10 and 100), not a range.
# * 'auto' is deliberately absent from max_features: for classifiers it was
#   only an alias of 'sqrt' (so it doubled the work for identical models), and
#   recent scikit-learn releases no longer accept it.
param_dist = {'n_estimators': [10, 100],
              'max_depth': [2, 3, 4, 5],
              'bootstrap': [True, False],
              'max_features': ['sqrt', 'log2', None],
              'criterion': ['gini', 'entropy']}

# 10-fold cross-validated grid search, run as 3 parallel jobs.
cv_rf = GridSearchCV(fit_rf, cv=10,
                     param_grid=param_dist,
                     n_jobs=3)

cv_rf.fit(X_train, y_train)
print('Best Parameters using grid search: \n',
      cv_rf.best_params_)
end = time.time()
print('Time taken in grid search: {0: .2f}'.format(end - start))

In [None]:
# Set best parameters given by grid search.
# bootstrap must be the boolean False: the string 'False' is truthy, so
# bootstrapping would silently stay enabled (and scikit-learn versions that
# validate parameter types reject the string outright).
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# no longer accepted by recent scikit-learn releases.
fit_rf.set_params(n_estimators=100, bootstrap=False, criterion='gini',
                  max_features='sqrt',
                  max_depth=5)

In [None]:
# Fit the random forest on the training split (X_train / y_train).
fit_rf.fit(X_train, y_train)

In [None]:
def variable_importance(fit):
    """
    Purpose
    ----------
    Extracts the feature importances of a fitted model together with the
    column positions ranked from most to least important.

    Parameters
    ----------
    * fit:  Fitted model containing the attribute feature_importances_

    Returns
    ----------
    Dictionary with two entries:
    * 'importance': importance scores in the original column order
    * 'index': column positions sorted by descending importance
    None is returned (after printing a message) when `fit` has no `fit`
    attribute, or when it carries an `estimators_` attribute that is None
    or empty.

    Raises
    ----------
    Whatever accessing `fit.feature_importances_` raises when the model does
    not provide it (for example because it has not been trained).
    """
    if not hasattr(fit, 'fit'):
        print("'{0}' is not an instantiated model from scikit-learn".format(fit))
        return None

    # `estimators_` is optional: models without it are judged solely by
    # feature_importances_ below. When it is present, test its length rather
    # than its truth value, because a multi-element numpy array has no
    # unambiguous truth value and would raise ValueError.
    missing = object()
    estimators = getattr(fit, 'estimators_', missing)
    if estimators is not missing and (estimators is None or len(estimators) == 0):
        print("Model does not appear to be trained.")
        return None

    importances = fit.feature_importances_
    # argsort is ascending; reverse it to rank the largest importance first.
    indices = np.argsort(importances)[::-1]
    return {'importance': importances,
            'index': indices}

In [None]:
# Compute the importance ranking once, then unpack the raw scores and the
# sorted column positions into their own variables.
var_imp_rf = variable_importance(fit_rf)
importances_rf, indices_rf = var_imp_rf['importance'], var_imp_rf['index']

In [None]:
def print_var_importance(importance, indices, name_index):
    """
    Purpose
    ----------
    Prints feature names ordered from most to least important, together
    with each feature's importance score.

    Parameters
    ----------
    * importance: Array returned from feature_importances_, organized by
                column position
    * indices: Column positions ordered from largest to smallest
                importance
    * name_index: Names of the columns included in the model, in the same
                order as `importance`

    Returns
    ----------
    None. Prints feature importance in descending order.
    """
    print("Feature ranking:")

    # Look the name up in the `name_index` argument (not in a notebook
    # global) so the function works for any model / column list.
    for rank, col in enumerate(indices, start=1):
        print("{0}. The feature '{1}' has a Mean Decrease in Impurity of {2:.5f}"
              .format(rank, name_index[col], importance[col]))

In [None]:
# Text report of the forest's feature importances, most important first.
rf_feature_names = list(X_train.columns.values)
print_var_importance(importances_rf, indices_rf, rf_feature_names)

In [None]:
def variable_importance_plot(importance, indices, names_index,
                             title='Feature importances for Random Forest Model'):
    """
    Purpose
    ----------
    Draws a horizontal bar chart of feature importances, with the most
    important feature at the top.

    Parameters
    ----------
    * importance: Array returned from feature_importances_, organized by
                column position
    * indices: Column positions ordered from largest to smallest
                importance
    * names_index: Names of the columns included in the model, in the same
                order as `importance`
    * title: Chart title (optional)

    Returns:
    ----------
    None. Displays the chart and then closes the figure.
    """
    # barh draws position 0 at the bottom, so walk the ranking backwards
    # (least important first) to put the most important feature on top.
    # Bars and labels are both taken from the same ordering, so they cannot
    # drift out of alignment.
    order = np.asarray(indices)[::-1]
    bar_lengths = np.asarray(importance)[order]
    feature_space = [names_index[i] for i in order]
    positions = np.arange(len(feature_space))

    fig, ax = plt.subplots(figsize=(10, 10))

    ax.set_facecolor('#fafafa')
    ax.set_title(title)
    ax.barh(positions,
            bar_lengths,
            align="center",
            color='#875FDB')
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_space)

    # Size the axis to the actual number of features; a fixed upper limit of
    # 30 cropped the top (most important) bars for wider feature sets.
    ax.set_ylim(-1, len(feature_space))
    longest = bar_lengths.max() if bar_lengths.size else 0
    ax.set_xlim(0, longest + 0.01)
    ax.set_xlabel('Mean Decrease in Impurity')
    ax.set_ylabel('Feature')

    plt.show()
    plt.close(fig)

In [None]:
# Bar chart of the forest's feature importances.
rf_feature_names = list(X_train.columns.values)
variable_importance_plot(importances_rf, indices_rf, rf_feature_names)

In [None]:
# Predict labels for the encoded test features and pair each prediction with
# the ID of its row in `test`.
y_pred = fit_rf.predict(X_test)
res = test[["ID"]].assign(Attrition=y_pred)
y_pred

In [None]:
# Write the predictions to output.csv without the index column. Passing the
# path lets pandas open the file itself with the newline handling its CSV
# writer expects; a handle from a plain open(..., 'w') can produce blank lines
# between rows on platforms that translate newlines.
res.to_csv('output.csv', index=False)