In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training and test splits of the HR analytics dataset.
df = pd.read_csv(
    "../input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
)
test_df = pd.read_csv(
    "../input/hr-analytics-job-change-of-data-scientists/aug_test.csv"
)

# Keep the test ids now: the enrollee_id column is dropped from test_df later,
# but the ids are needed again when the submission is assembled.
test_enrollee_ids = test_df["enrollee_id"]

In [None]:
# Display the training frame as loaded.
df

In [None]:
# Summary statistics of the training frame's numeric columns.
df.describe()

## Missing values

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# How often each enrolled_university label occurs (missing values are not counted).
df.enrolled_university.value_counts()

In [None]:
#df['enrolled_university'] = df.enrolled_university.fillna(df.enrolled_university.value_counts().index[0])

In [None]:
# Replace every missing value with the "-" placeholder in both frames, so the
# cleaning helpers below can treat "missing" as just another label.
for frame in (df, test_df):
    frame.fillna("-", inplace=True)

## Preprocessing the data

In [None]:
# Display the training frame after missing values were replaced with "-".
df

In [None]:
def get_experience(x):
    """Convert a raw ``experience`` label into a number of years.

    Parameters
    ----------
    x : str
        Raw label: ``'>20'``, ``'<1'``, the ``'-'`` missing-value
        placeholder, or a string holding a whole number of years.

    Returns
    -------
    int or float
        25 for ``'>20'``, 0 for ``'<1'``, ``nan`` for ``'-'`` (left for a
        later imputation step), otherwise ``int(x)``.

    Raises
    ------
    ValueError
        If ``x`` is not one of the special labels and ``int(x)`` cannot
        parse it.
    """
    if x == '>20':
        return 25
    if x == '<1':
        return 0
    if x == '-':
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        return np.nan
    return int(x)

In [None]:
# Turn the experience labels into numbers in both frames.
for frame in (df, test_df):
    frame['experience'] = frame['experience'].apply(get_experience)

In [None]:
# Impute missing experience with the median of the *training* frame, and use
# that same training median for the test frame.
experience_median = df['experience'].median()
df['experience'] = df['experience'].fillna(experience_median)
test_df['experience'] = test_df['experience'].fillna(experience_median)

In [None]:
# Frequency of each raw company_size label.
df.company_size.value_counts()

In [None]:
# Same frequencies as a bar chart.
df.company_size.value_counts().plot(kind='bar')

In [None]:
def get_company_size(x):
    """Map a ``company_size`` bucket label to a representative head count.

    Parameters
    ----------
    x : str
        Raw bucket label such as ``'<10'`` or ``'1000-4999'``.

    Returns
    -------
    int
        The head count assigned to the matching bucket. Any label that
        matches no bucket (the ``'-'`` placeholder included) yields 75.
    """
    # (label, head count) pairs, checked in order.
    bucket_sizes = (
        ("<10", 5),
        ('10/49', 20),  # this bucket's label is spelled with a slash
        ('50-99', 75),
        ('100-500', 300),
        ('500-999', 750),
        ('1000-4999', 3000),
        ('5000-9999', 7500),
        ('10000+', 13000),
    )
    for label, head_count in bucket_sizes:
        if x == label:
            return head_count
    # Unrecognised labels get the same value as the '50-99' bucket.
    return 75

In [None]:
# Replace the company_size bucket labels with numbers in both frames.
for frame in (df, test_df):
    frame['company_size'] = frame['company_size'].apply(get_company_size)

In [None]:
# Frequencies again, now of the numeric company sizes.
df.company_size.value_counts()

In [None]:
# Frequency of each raw last_new_job label.
df.last_new_job.value_counts()

In [None]:
def get_last_new_job(x):
    """Convert a raw ``last_new_job`` label into a number.

    Parameters
    ----------
    x : str
        Raw label: ``'never'``, ``'>4'``, the ``'-'`` missing-value
        placeholder, or a string holding a whole number.

    Returns
    -------
    int
        0 for ``'never'``, 6 for ``'>4'``, 1 for ``'-'``, otherwise
        ``int(x)``.
    """
    # Labels that are not plain integers, checked in order.
    special_labels = (("never", 0), (">4", 6), ("-", 1))
    for label, value in special_labels:
        if x == label:
            return value
    return int(x)

In [None]:
# Replace the last_new_job labels with numbers in both frames.
for frame in (df, test_df):
    frame['last_new_job'] = frame['last_new_job'].apply(get_last_new_job)

In [None]:
# Frequencies again, now of the numeric last_new_job values.
df.last_new_job.value_counts()

In [None]:
# First rows of the training frame after the numeric conversions above.
df.head()

## Encoding categorical data

In [None]:
# Column dtypes and non-null counts, to see which columns still hold text.
df.info()

In [None]:
# Columns that still hold text labels and are one-hot encoded below.
categorical = [
    "gender",
    "relevent_experience",
    "enrolled_university",
    "education_level",
    "major_discipline",
    "company_type",
]

# Each frame is encoded on its own.
df, test_df = [
    pd.get_dummies(frame, columns=categorical) for frame in (df, test_df)
]

In [None]:
# Remove the columns that are not used as model features, from both frames.
unused_cols = ['enrollee_id', 'city']
for frame in (df, test_df):
    frame.drop(columns=unused_cols, inplace=True)

In [None]:
# Indicator columns produced for the "-" (missing) placeholder label.
cols_to_drop = ['gender_-', 'enrolled_university_-', 'education_level_-', 'major_discipline_-', 'company_type_-']

# errors="ignore": a frame with no missing values in one of these features
# never got the matching "_-" column, and a strict drop would raise KeyError.
df = df.drop(columns=cols_to_drop, errors="ignore")
test_df = test_df.drop(columns=cols_to_drop, errors="ignore")

# The two frames were one-hot encoded independently, so a label present in
# only one of them produces a column the other lacks. Align the test frame to
# the training features (same columns, same order; absent indicators become 0)
# so a model fitted on the training data can score it.
feature_cols = [col for col in df.columns if col != "target"]
test_df = test_df.reindex(columns=feature_cols, fill_value=0)

## Splitting the dataset into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the label to predict.
X, y = df.drop(['target'], axis=1), df.target

# Hold out 33% of the rows. random_state is fixed so the split is identical on
# every run; without it each kernel restart shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## Scaling data

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

# Learn each feature's min/max from the training split only ...
rescaledX_train = scaler.fit_transform(X_train)

# ... and apply that fitted scaler to everything else. Calling fit_transform
# here would re-learn the range from the held-out data, so the same raw value
# would map to a different scaled value in each set.
rescaledX_test = scaler.transform(X_test)

rescaled_test = scaler.transform(test_df)

In [None]:
from sklearn.model_selection import GridSearchCV  # only referenced by the commented-out grid search cells

# Refit the scaler on all labelled rows; the model below is cross-validated
# and trained on this matrix.
rescaledX = scaler.fit_transform(X)

# Scale the submission features with this same fitted scaler, so the model
# receives them on the scale it was trained on.
rescaled_test = scaler.transform(test_df)

In [None]:
#best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
#print("Best: %f using %s" % (best_score, best_params))

## Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Gradient-boosting classifier with fixed hyper-parameters. random_state is
# pinned so repeated runs build the same model and report the same scores.
est = GradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=8,
    min_samples_leaf=100,
    random_state=42,
)

In [None]:
# Average the estimator's default score over 5 cross-validation folds.
cv_scores = cross_val_score(est, rescaledX, y, cv=5)
cv_scores.mean()

In [None]:
# Train the model on all labelled rows (the fully scaled feature matrix).
# scikit-learn's fit() returns the estimator itself, so est_result and est
# refer to the same fitted model.
est_result = est.fit(rescaledX, y)

In [None]:
# Label each of the fitted model's feature importances with its column name from X.
importances = pd.Series(est_result.feature_importances_, index=X.columns)

In [None]:
# Bar chart of the features whose importance exceeds 0.01, largest first.
importances[importances > 0.01].sort_values(ascending=False).plot(kind="bar")

In [None]:
# Predictions for the same rows the model was fitted on (a quick look only;
# these are not held-out predictions).
est.predict(rescaledX)

In [None]:
# Predicted labels for the scaled submission features.
predictions = est.predict(rescaled_test)

In [None]:
# How many submission rows were assigned to each predicted class.
pd.Series(predictions).value_counts()

In [None]:
# Pair every test enrollee id with its predicted label. The predictions reuse
# the ids' index so the two columns line up row for row, and are named
# "target" so the column header is meaningful instead of the default 0.
target_predictions = pd.Series(
    predictions, index=test_enrollee_ids.index, name="target"
)
result_df = pd.concat([test_enrollee_ids, target_predictions], axis=1)

In [None]:
# Write the submission file, leaving out the DataFrame index.
result_df.to_csv('submission.csv',index=False)

In [None]:
#parameters = {
#    'learning_rate': [0.1, 0.05, 0.02, 0.01],
#    'max_depth': [4, 6, 8],
#    'min_samples_leaf': [20, 50,100,150]
#}

In [None]:
#grid_gb = GridSearchCV(estimator=est, param_grid=parameters, cv=5)

#grid_model_result = grid_gb.fit(X, y)

In [None]:
#best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
#print("Best: %f using %s" % (best_score, best_params))