In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the activity table straight from the zipped CSV and print its column summary.
TRAIN_CSV = '/kaggle/input/predicting-red-hat-business-value/act_train.csv.zip'
train_df = pd.read_csv(TRAIN_CSV)
train_df.info()

In [None]:
# Load the people table straight from the zipped CSV and print its column summary.
PEOPLE_CSV = '/kaggle/input/predicting-red-hat-business-value/people.csv.zip'
people_df = pd.read_csv(PEOPLE_CSV)
people_df.info()

In [None]:
# Count missing values per column of the activity table.
train_df.isna().sum()

In [None]:
# Count missing values per column of the people table.
people_df.isna().sum()

In [None]:
# Preview the first five activity rows.
train_df.head(n=5)

In [None]:
# Preview the first five people rows.
people_df.head(n=5)

In [None]:
# Remove the activity-level columns char_1 .. char_10 before joining with the people table.
activity_char_cols = [f'char_{i}' for i in range(1, 11)]
train_df = train_df.drop(columns=activity_char_cols)
train_df.info()

In [None]:
# Attach each person's attributes to every one of their activity rows.
#
# The join key is stated explicitly: pd.merge with no `on=` joins on *every*
# column name the two frames share. The activity frame has a `date` column
# (it is parsed in the next step); if people_df also carries a `date` column
# (NOTE(review): confirm with people_df.info()), the implicit join would keep
# only rows whose activity date equals the person's date and silently drop
# the rest.
#
# The previous `set_index('people_id')` calls were removed: set_index returns
# a new frame and the results were discarded, so they had no effect.
#
# suffixes=('', '_people') leaves activity-side column names (notably `date`)
# untouched and tags any clashing people-side column with `_people`.
# how='left' keeps every activity row; validate='many_to_one' raises if
# people_id is not unique in people_df, which would otherwise duplicate rows.
df = pd.merge(
    train_df,
    people_df,
    on='people_id',
    how='left',
    suffixes=('', '_people'),
    validate='many_to_one',
)
df.info()

In [None]:
    # Expand the `date` column into year / month / day / weekend-flag features,
    # then discard the raw column (weekday 5 and 6 are Saturday and Sunday).
    parsed_date = pd.to_datetime(df['date'])
    df = df.assign(
        year=parsed_date.dt.year,
        month=parsed_date.dt.month,
        day=parsed_date.dt.day,
        isweekend=(parsed_date.dt.weekday >= 5).astype(int),
    ).drop(columns='date')
    df.head()

In [None]:
# Preview the first five rows of the merged frame.
df.head(n=5)

In [None]:
import seaborn as sns

# Show the class balance of the target, first as counts and then as a bar chart.
outcome_counts = df['outcome'].value_counts()
print(outcome_counts)
sns.countplot(data=df, x="outcome")

In [None]:
# Partition the columns of `df` into numeric and categorical lists.
#
# A column counts as numeric when pandas reports a numeric or boolean dtype;
# everything else is treated as categorical. Comparing the dtype against the
# literal 'object' would put string columns stored under a non-object dtype
# (e.g. `string`, `category`) into the numeric list, so they would never be
# label-encoded.
categorical_features = []
numeric_features = []
features = df.columns.tolist()
for col in features:
    column = df[col]
    if pd.api.types.is_numeric_dtype(column) or pd.api.types.is_bool_dtype(column):
        numeric_features.append(col)
    else:
        categorical_features.append(col)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column with integer codes.
# Values are cast to str first; a fresh encoder is fitted per column.
for column_name in categorical_features:
    encoder = LabelEncoder()
    text_values = df[column_name].astype(str).tolist()
    df[column_name] = encoder.fit_transform(text_values)

In [None]:
# Preview the first five rows after label encoding.
df.head(n=5)

In [None]:
# Separate the predictors (every column except `outcome`) from the target.
X = df.drop(columns='outcome')
y = df['outcome']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
TEST_FRACTION = 0.2
SPLIT_SEED = 0
split_parts = train_test_split(X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train, X_test, y_train, y_test = split_parts

In [None]:
import xgboost as xgb
# Configure a small gradient-boosted classifier and fit it on the training split.
xgb_params = {
    'max_depth': 3,
    'n_estimators': 10,
    'learning_rate': 0.05,
    'use_label_encoder': False,
}
gbm = xgb.XGBClassifier(**xgb_params).fit(X_train, y_train)

In [None]:
# Report the classifier's mean accuracy on the training and held-out splits.
train_accuracy = gbm.score(X_train, y_train)
test_accuracy = gbm.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

In [None]:
# Order the fitted model's feature importances from smallest to largest.
features = list(X.columns)
importances = gbm.feature_importances_
indices = importances.argsort()

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Horizontal bar chart: one bar per feature, labelled with the column name.
fig = figure(figsize=(10, 10), dpi=80)
ax = fig.gca()
bar_positions = range(len(indices))

ax.set_title('Feature Importances')
ax.barh(bar_positions, importances[indices], color='b', align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels([features[i] for i in indices])
ax.set_xlabel('Relative Importance')
plt.show()