I decided to conduct a fairly simple data analysis, and also built a simple model.

In [None]:
import pandas as pd
import numpy as np
# Locations of the train and test CSV files.
TRAIN_CSV = '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
TEST_CSV = '../input/hr-analytics-job-change-of-data-scientists/aug_test.csv'

data_train = pd.read_csv(TRAIN_CSV)
data_test = pd.read_csv(TEST_CSV)
# Preview the first rows of the training data.
data_train.head()

In [None]:
# Number of training rows that exactly repeat an earlier row.
duplicate_mask = data_train.duplicated()
duplicate_mask.sum()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
data_train.info()

In [None]:
# Count of missing values in each training column.
missing_per_column = data_train.isnull().sum()
missing_per_column

So we don't have any duplicates. We can also see 10 categorical features and a lot of missing values. Below is a graphical analysis.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
# Education-level counts, split by relevant experience. The series is passed
# as `x=` because recent seaborn releases accept only `data` positionally,
# so a positional series would be taken as the dataset, not the x variable.
sns.countplot(x=data_train['education_level'], hue=data_train['relevent_experience'])

In [None]:
# Bar chart of how often each gender value occurs in the training data.
fig, ax = plt.subplots()
sns.countplot(x=data_train['gender'], ax=ax)
plt.show()

In [None]:
# Target counts, with one bar per gender value inside each target class.
sns.countplot(data=data_train, x='target', hue='gender')

Let's look at the unique values of each categorical attribute

In [None]:
# Print the distinct values of every object-dtype (text) column.
for column_name, column_dtype in data_train.dtypes.items():
    if column_dtype != 'object':
        continue
    print(column_name, data_train[column_name].unique())

In [None]:
from sklearn.impute import SimpleImputer
import numpy as np
# Fill the gaps of every incomplete training column with that column's
# most frequent value.
columns_with_gaps = [col for col in data_train if data_train[col].isna().any()]
for col in columns_with_gaps:
    mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    data_train[col] = mode_imputer.fit_transform(data_train[[col]])

I decided to fill in the missing values with the most frequent value of each column. The categorical variables are then encoded with LabelEncoder.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace every object-dtype (text) training column with integer codes.
for column in data_train.columns:
    if data_train.dtypes[column] != object:
        continue
    encoder = LabelEncoder()
    as_text = data_train[column].astype(str)
    data_train[column] = encoder.fit_transform(as_text)

In [None]:
# Separate the label from the features; the id column is not used as a feature.
non_feature_columns = ['target', 'enrollee_id']
y = data_train['target']
X = data_train.drop(columns=non_feature_columns)
# Show how many rows fall in each class.
print(y.value_counts())

We can clearly see that the classes are imbalanced. I decided to use the ADASYN oversampling algorithm because, rather than being simply linearly correlated with their parent samples, its synthetic samples have slightly more variance: they are a little more scattered and therefore closer to real data.

In [None]:
from sklearn import preprocessing
# Standardise every feature column and keep the result as a DataFrame with
# the original index and column names.
norm = preprocessing.StandardScaler()
scaled_values = norm.fit(X).transform(X)
X = pd.DataFrame(data=scaled_values, columns=X.columns, index=X.index)
X.head(10)

In [None]:
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split

# Hold out real rows BEFORE oversampling. If ADASYN runs on the full data
# first, synthetic rows interpolated from training rows end up in the
# hold-out set and every metric computed on it is over-optimistic.
# `stratify=y` keeps the original class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)

# Balance the classes in the training part only; the seed makes the
# synthetic rows reproducible between runs.
X_resampled, y_resampled = ADASYN(random_state=42).fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
# Hyper-parameter space sampled by the randomized search. Single-element
# lists pin a parameter to a fixed value.
params = {
    'learning_rate': [0.05],
    'num_leaves': [90, 140, 200],
    'boosting_type': ['gbdt'],
    'objective': ['binary'],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'random_state': [42],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
    'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
    # LightGBM ignores `subsample` unless bagging is enabled with a non-zero
    # `subsample_freq`; without this the values above would have no effect.
    'subsample_freq': [1],
    'min_split_gain': [0.01],
    'min_data_in_leaf': [10],
    'metric': ['auc'],
}
clf = lgb.LGBMClassifier()
# Seed the search so the same 10 candidates are drawn on every run.
RSCV = RandomizedSearchCV(
    clf, params, verbose=3, cv=10, n_jobs=-1, n_iter=10, random_state=42
)
RSCV.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

# Predict the hold-out rows with the fitted search object, then print
# per-class precision, recall and F1.
y_pred = RSCV.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of hold-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC measures how well the scores rank positives above negatives, so
# it must be given the predicted probability of the positive class (second
# column of `predict_proba`) rather than the hard 0/1 labels in `y_pred`.
roc_auc_score(y_test, RSCV.predict_proba(X_test)[:, 1])

In [None]:
# Print the column dtypes and non-null counts of the test frame.
data_test.info()

In [None]:
# Count of missing values in each test column.
test_missing_per_column = data_test.isnull().sum()
test_missing_per_column

In [None]:
from sklearn.impute import SimpleImputer
import numpy as np
# Fill the gaps of every incomplete test column with that column's own
# most frequent value.
test_columns_with_gaps = [col for col in data_test if data_test[col].isna().any()]
for col in test_columns_with_gaps:
    mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    data_test[col] = mode_imputer.fit_transform(data_test[[col]])

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the test categoricals with the SAME label -> integer mapping the
# model was trained on. A LabelEncoder fitted on the test frame alone assigns
# different codes whenever a column's set of values differs from the
# training set's, which silently scrambles that feature for the model.
# `data_train` has already been encoded in place, so the raw training file
# is read again to recover the original labels.
train_labels = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
for c in data_test.columns:
    if data_test.dtypes[c] != object:
        continue
    le = LabelEncoder()
    # Missing training values were filled with an existing label before the
    # training encoder was fitted, so the non-null raw labels are exactly
    # the classes it learned.
    le.fit(train_labels[c].dropna().astype(str))
    codes = {label: code for code, label in enumerate(le.classes_)}
    # Labels never seen in training have no learned code; mark them with -1.
    data_test[c] = data_test[c].astype(str).map(codes).fillna(-1).astype(int)

In [None]:
# Add a placeholder target column, then build the feature matrix from every
# column except the target and the id.
data_test['target'] = ''
non_feature_columns = ['target', 'enrollee_id']
X = data_test.drop(columns=non_feature_columns)

In [None]:
from sklearn import preprocessing
# Scale the test features with `norm`, the scaler that was fitted on the
# training features. Fitting a new scaler on the test data would shift and
# stretch each column by different amounts than the model saw in training.
ndf = norm.transform(X)
X = pd.DataFrame(ndf, index=X.index, columns=X.columns)
X.head(10)

In [None]:
# Predict a label for every test row and store it in the `target` column.
test_predictions = RSCV.predict(X)
data_test['target'] = test_predictions

In [None]:
# Submission frame: the enrollee id next to its predicted target.
submission_columns = ['enrollee_id', 'target']
subm = data_test[submission_columns]
subm.head()

In [None]:
# Save the submission to disk without the DataFrame index column.
subm.to_csv(path_or_buf='./subm.csv', index=False)

Please UPVOTE if you like it :D