### In this notebook I will introduce a very simple Machine Learning Framework that handles the basic steps
### Here we have a very quick approach on EDA for data cleansing and reformatting
### Then we use plots to explore the optimal feature settings

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the training split inside the Kaggle input directory.
train_csv_path = '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
train = pd.read_csv(train_csv_path)
# Preview the first rows of the raw frame.
train.head()

In [None]:
# (n_rows, n_columns) of the raw training frame.
train.shape

In [None]:
# Use pandas_profiling for Exploratory Data Analysis
# View columns with missing data

### Use pandas_profiling for Exploratory Data Analysis
### View columns with missing data

In [None]:
# Build a profiling report of the raw training frame and render it as the cell output.
# NOTE(review): pandas_profiling appears to have been renamed ydata-profiling upstream —
# confirm which package the target environment provides.
from pandas_profiling import ProfileReport
profile = ProfileReport(train)
profile

In [None]:
# Print each column's dtype and non-null count for the raw frame.
train.info()

In [None]:
# Warnings reported by the profiling run:
# city has a high cardinality: 123 distinct values (Warning)
# company_size has 5938 (31.0%) missing values (Missing)
# company_type has 6140 (32.0%) missing values (Missing)
# education_level has 460 (2.4%) missing values (Missing)
# enrolled_university has 386 (2.0%) missing values (Missing)
# gender has 4508 (23.5%) missing values (Missing)
# last_new_job has 423 (2.2%) missing values (Missing)
# major_discipline has 2813 (14.7%) missing values (Missing)

In [None]:
# Fill missing entries.
# Every placeholder is a *string*: these columns are later processed with the
# pandas `.str` accessor, which returns NaN for non-string elements. Filling
# with the integer 0 would therefore silently turn the filled rows back into
# NaN (and get_dummies would then drop them).
fill_values = {
    'city': 'city_0',          # parses to 0 once the 'city_' prefix is stripped
    'company_size': 'NA',
    'company_type': 'NA',
    'education_level': 'NA',
    'enrolled_university': 'NA',
    'gender': 'NA',
    'last_new_job': 'NA',      # categorical column: gets its own explicit "NA" level
    'major_discipline': 'NA',
    'experience': '0',         # converted to the number 0 by pd.to_numeric later
}
train = train.fillna(value=fill_values)


In [None]:
# Names of the columns in `train` that still hold at least one null value.
has_null = train.isnull().any()
has_null[has_null].index.tolist()

In [None]:
# List every column name so that none of the categorical columns is left out
# of the get_dummies call.
train.columns

In [None]:
# Build the model-ready frame `train1` from a copy, leaving `train` untouched.
train1 = train.copy()

# city: "city_103" -> 103. astype(str) keeps a numeric placeholder (e.g. a 0
# used to fill missing values) from becoming NaN under the .str accessor.
# regex=False everywhere: all patterns below are literal text.
train1['city1'] = pd.to_numeric(
    train1['city'].astype(str).str.replace('city_', '', regex=False)
)
train1 = train1.drop(['enrollee_id', 'city'], axis=1)

# Spaces -> underscores so the generated dummy column names contain no blanks.
for col in ['education_level', 'major_discipline', 'company_type']:
    train1[col] = train1[col].str.replace(' ', '_', regex=False)

# experience: map the open-ended buckets to numbers, then convert the column.
train1.loc[train1.experience == '<1', 'experience'] = 0
train1.loc[train1.experience == '>20', 'experience'] = 21
train1["experience"] = pd.to_numeric(train1["experience"])

# company_size: replace symbols with letters. '+' is a regex metacharacter, so
# the literal match must be requested explicitly instead of relying on the
# pandas-version-dependent default of `regex`.
for symbol, text in [('<', 'LE'), ('/', 'to'), ('+', 'G'), ('-', 'to')]:
    train1['company_size'] = train1['company_size'].str.replace(symbol, text, regex=False)

# last_new_job: ">4" -> "GT4". astype(str) preserves non-string fill values as
# their own category instead of turning them into NaN.
train1['last_new_job'] = train1['last_new_job'].astype(str).str.replace('>', 'GT', regex=False)

# One-hot encode all categorical columns.
train1 = pd.get_dummies(data=train1, columns=['gender','relevent_experience','enrolled_university','education_level','major_discipline','company_type','company_size','last_new_job'])

train1['city_development_index'] = pd.to_numeric(train1['city_development_index'])
train1.head()

In [None]:
# Print dtypes and non-null counts of the encoded frame.
train1.info()

In [None]:
# Names of the columns in `train1` that still hold at least one null value.
[name for name, any_null in train1.isnull().any().items() if any_null]

In [None]:
# Number of rows per value of the target column (class balance).
train1['target'].value_counts()

### Start fitting into model

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the encoded frame for evaluation.
# random_state pins the shuffle so every metric and plot below is reproducible;
# stratify keeps the target class ratio the same in both parts.
# NOTE: this rebinds `train` (previously the raw frame) to the encoded training
# split, so the cleaning cells above cannot be re-run after this one without
# reloading the data.
train, test = train_test_split(
    train1, test_size=0.2, random_state=42, stratify=train1['target']
)
print(train.shape, test.shape)

In [None]:
# Training features: every column of the training split except the label.
X_train = train.drop(columns='target')
X_train.head()

In [None]:
# Training label, kept as a one-column DataFrame.
y_train = train.loc[:, ['target']]
y_train.head()

In [None]:
# Test features: every column of the held-out split except the label.
X_test = test.drop(columns='target')
X_test.head()

In [None]:
# Test label, kept as a one-column DataFrame.
y_test = test.loc[:, ['target']]
y_test.head()

In [None]:
def check_metrics(model, input_y):
    """Print classification metrics for a set of predicted probabilities.

    Parameters
    ----------
    model : array-like, one row per sample
        Despite its name this is not an estimator: it is a table of per-class
        scores such as the output of ``predict_proba``. Only the entry at
        index 1 of each row is used.
    input_y : array-like
        True labels; passed to the sklearn metric functions unchanged.

    Prints the confusion matrix, precision, recall and accuracy computed on
    the rounded scores, then R2 and ROC AUC computed on the raw scores.
    Returns None.
    """
    import numpy as np
    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        precision_score,
        r2_score,
        recall_score,
        roc_auc_score,
    )

    # One slice replaces the two element-by-element loops.
    scores = np.asarray(model)[:, 1]
    # Rounding turns the scores into hard 0/1 predictions.
    rounded = scores.round()

    print(confusion_matrix(input_y, rounded))
    print('Precision: '+str(precision_score(input_y, rounded)))
    print('Recall: '+str(recall_score(input_y, rounded)))
    print('Accuracy: '+str(accuracy_score(input_y, rounded)))
    print('R2: '+str(r2_score(input_y, scores)))
    print('AUC: '+str(roc_auc_score(input_y, scores)))

#### Logistic Regression as benchmark

In [None]:
from sklearn.linear_model import LogisticRegression
# np.ravel: sklearn expects a 1-D target; passing the one-column DataFrame
# directly raises a DataConversionWarning.
# max_iter is raised above the default of 100 because the features are not
# scaled, which makes the solver likely to stop before converging.
LR = LogisticRegression(max_iter=1000).fit(X_train, np.ravel(y_train))
LR.predict(X_test)

In [None]:
# Per-class probabilities for the test rows; check_metrics reads index 1 of each row.
LR_pp = LR.predict_proba(X_test)
LR_pp

In [None]:
# Print the benchmark metrics for the logistic-regression probabilities.
check_metrics(LR_pp, y_test)

### Random Forest
#### Find the optimal hyperparameters (max_depth, min_samples_leaf) by plotting each against AUC

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Sweep max_depth and record the ROC AUC on the held-out split for each value.
# NOTE(review): the hyperparameter is chosen on the same split that is later
# used for the final evaluation; a separate validation split would avoid this.
rf_auc_list=[]
for i in [5,10,15,20,25,30]:
    # n_jobs=-1 trains the 1000 trees in parallel; results are unchanged.
    rf = RandomForestClassifier(n_estimators=1000, random_state=42, criterion='gini', max_depth=i, max_features=20, n_jobs=-1)
    # sklearn expects a 1-D target.
    rf.fit(X_train, np.ravel(y_train))
    # AUC ranks samples by a continuous score, so use P(class 1), not hard labels.
    rf_p = rf.predict_proba(X_test)[:, 1]
    # Signature is roc_auc_score(y_true, y_score): the true labels go first.
    rf_auc_list.append(roc_auc_score(np.ravel(y_test), rf_p))
rf_auc_list

In [None]:
import matplotlib.pyplot as plt
# x-axis values: the max_depth settings that were evaluated.
max_depth_list=[5,10,15,20,25,30]
# Draw on the current axes through the explicit Axes interface.
ax = plt.gca()
ax.plot(max_depth_list, rf_auc_list, color='r', linestyle='dashed', marker='o')
ax.set_title('Optimal max_depth for RF')
ax.set_xlabel('max_depth')
ax.set_ylabel('AUC')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Sweep min_samples_leaf and record the ROC AUC on the held-out split for each value.
# NOTE(review): the hyperparameter is chosen on the same split that is later
# used for the final evaluation; a separate validation split would avoid this.
rf_auc_list=[]
for i in [5,10,15,20,25,30]:
    # n_jobs=-1 trains the 1000 trees in parallel; results are unchanged.
    rf = RandomForestClassifier(n_estimators=1000, random_state=42, criterion='gini', min_samples_leaf=i, n_jobs=-1)
    # sklearn expects a 1-D target.
    rf.fit(X_train, np.ravel(y_train))
    # AUC ranks samples by a continuous score, so use P(class 1), not hard labels.
    rf_p = rf.predict_proba(X_test)[:, 1]
    # Signature is roc_auc_score(y_true, y_score): the true labels go first.
    rf_auc_list.append(roc_auc_score(np.ravel(y_test), rf_p))
rf_auc_list

In [None]:
import matplotlib.pyplot as plt
# x-axis values: the min_samples_leaf settings that were evaluated.
min_samples_leaf = [5,10,15,20,25,30]
# Draw on the current axes through the explicit Axes interface.
ax = plt.gca()
ax.plot(min_samples_leaf, rf_auc_list, color='r', linestyle='dashed', marker='o')
ax.set_title('Optimal min_samples_leaf')
ax.set_xlabel('min_samples_leaf')
ax.set_ylabel('AUC')

#### Fit Random Forest with the optimal hyperparameter settings

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with max_depth=10 and min_samples_leaf=5.
# n_jobs=-1 trains the 1000 trees in parallel; results are unchanged.
rf = RandomForestClassifier(n_estimators=1000, random_state=42, criterion='gini', max_features=20, max_depth=10, min_samples_leaf=5, n_jobs=-1)
# np.ravel: sklearn expects a 1-D target; the one-column DataFrame raises a
# DataConversionWarning.
rf.fit(X_train, np.ravel(y_train))

In [None]:
# Per-class probabilities for the test rows; check_metrics reads index 1 of each row.
rf_p = rf.predict_proba(X_test)
rf_p

In [None]:
# Print the metrics for the random-forest probabilities computed in the previous cell.
check_metrics(rf_p, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Same forest as the previous fit, but with min_samples_leaf=10 for comparison.
# n_jobs=-1 trains the 1000 trees in parallel; results are unchanged.
rf = RandomForestClassifier(n_estimators=1000, random_state=42, criterion='gini', max_features=20, max_depth=10, min_samples_leaf=10, n_jobs=-1)
# np.ravel: sklearn expects a 1-D target; the one-column DataFrame raises a
# DataConversionWarning.
rf.fit(X_train, np.ravel(y_train))

In [None]:
# Per-class probabilities for the test rows; check_metrics reads index 1 of each row.
rf_p = rf.predict_proba(X_test)
rf_p

In [None]:
# Print the metrics for the random-forest probabilities computed in the previous cell.
check_metrics(rf_p, y_test)