In [None]:
# Load the promotion dataset from the input directory and preview its first rows.
import pandas as pd
dataset=pd.read_csv('../input/company-database/promotion_dataset.csv')
dataset.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
dataset.info()

In [None]:
# Summary statistics of the raw frame (describe() defaults).
dataset.describe()

In [None]:
# Skewness of every numeric column.
# numeric_only=True is passed explicitly because the frame also contains text
# columns (e.g. education): pandas >= 2.0 raises TypeError on them instead of
# silently skipping them as older versions did.
dataset.skew(numeric_only=True)

In [None]:
# Number of distinct values in each column.
dataset.nunique()

In [None]:
# Missing-value count per column, largest first.
dataset.isna().sum().sort_values(ascending=False)

In [None]:
# Percentage of missing values in previous_year_rating and education,
# relative to the total row count and rounded to two decimals.
(
    dataset[['previous_year_rating', 'education']]
    .isna()
    .sum()
    .mul(100)
    .div(len(dataset))
    .round(2)
)

In [None]:
# dtypes and non-null counts for the two columns inspected for missing values.
dataset[['previous_year_rating','education']].info()

In [None]:
# Frequency of each previous_year_rating value.
dataset['previous_year_rating'].value_counts()

In [None]:
# NOTE(review): numpy is imported here but not referenced in the cells visible in this notebook section.
import numpy as np
# Preview the rows whose previous_year_rating is missing.
dataset[dataset['previous_year_rating'].isna()].head()

In [None]:
# Frequency of each education level.
dataset['education'].value_counts()

In [None]:
# Preview the rows whose education is missing.
dataset[dataset['education'].isna()].head()

In [None]:
# Class balance of the target: share (%) of rows for each is_promoted value,
# relative to the total row count and rounded to two decimals.
dataset['is_promoted'].value_counts().mul(100).div(len(dataset)).round(2)

In [None]:
!pip install pandas-profiling


In [None]:
# Build a profiling report titled "EDA Report" for the raw dataset;
# the bare `profile` expression displays it as the cell output.
from pandas_profiling import ProfileReport
profile = ProfileReport(dataset, title="EDA Report")
profile

In [None]:
# Keep a random 99% of the rows for modelling and set the remaining 1% aside
# as "unseen" data. random_state pins the sample so the split is repeatable.
data = dataset.sample(frac=0.99, random_state=42)
# The unseen rows are selected via the sampled labels, so this must happen
# before the modelling frame's index is renumbered.
data_unseen = dataset.drop(index=data.index).reset_index(drop=True)
data = data.reset_index(drop=True)
print(f'Data for Modeling: {data.shape}')

In [None]:
# Report the size of the held-back frame.
print(f'Unseen Data For Predictions: {data_unseen.shape}')

In [None]:
!pip install pycaret

In [None]:
# Wildcard import; presumably the source of setup, create_model, tune_model,
# evaluate_model, predict_model and finalize_model used in the cells below.
from pycaret.classification import *

In [None]:
# Column dtypes and non-null counts of the modelling frame.
data.info()

In [None]:
# Column labels of the modelling frame.
data.columns

In [None]:
# Collect the names of all object-dtype columns, then print how many there are
# followed by one name per line.
categorical = [col for col in data.columns if data[col].dtype == 'object']
print("Categorical Attribute : {}\n ".format(len(categorical)))
for col_name in categorical:
    print(col_name)

In [None]:
# Number of distinct values in each object-dtype column.
data[categorical].nunique()

In [None]:
# Share (%) of modelling rows per region, rounded to two decimals.
data['region'].value_counts().mul(100).div(len(data)).round(2)

In [None]:
# Share (%) of modelling rows per department, rounded to two decimals.
data['department'].value_counts().mul(100).div(len(data)).round(2)

In [None]:
# Configure the PyCaret classification experiment on the modelling frame;
# whatever setup() returns is kept in `promotion`.
promotion = setup(
    data=data,
    target='is_promoted',
    session_id=1,
    train_size=0.8,  # train on 80% of the available data

    # --- feature typing ---
    ignore_features=['employee_id'],  # not considered when training the model
    ordinal_features={
        # ordinal feature with its levels listed in order
        'education': ["Below Secondary", "Bachelor's", "Master's & above"],
    },
    categorical_features=['department', 'gender', 'recruitment_channel', 'awards_won?'],
    numeric_features=[
        'no_of_trainings',
        'age',
        'previous_year_rating',
        'length_of_service',
        'avg_training_score',
    ],
    high_cardinality_features=['region', 'department'],  # compressed into fewer levels

    # --- missing values ---
    numeric_imputation='median',
    categorical_imputation='mode',

    # --- preprocessing ---
    transformation=True,  # reshape distributions so they can be represented as normal
    normalize=True,  # rescale the values of numeric columns
    handle_unknown_categorical=True,
    unknown_categorical_method='most_frequent',
    remove_outliers=True,
    remove_multicollinearity=True,  # drop one of two features that are highly correlated
    ignore_low_variance=True,  # drop categorical features with statistically insignificant variance
    combine_rare_levels=True,  # merge levels below rare_level_threshold into a single level
    fix_imbalance=True,  # fix the class imbalance of the target
)

In [None]:
# Create the baseline model identified by 'catboost' with default settings.
catboost= create_model('catboost')

In [None]:
# Show the text representation of the baseline model object.
print(catboost)

In [None]:
# Tune the baseline model, using Kappa as the metric to optimize.
tuned_catboost = tune_model(catboost,optimize = 'Kappa') # optimize='Kappa' selects the tuning metric

In [None]:
# Graphical evaluation plots for the tuned model.
evaluate_model(tuned_catboost) # graphical plots

In [None]:
# Evaluate the tuned model on the test data (no `data=` argument is passed,
# so this presumably uses the split that setup() held back via train_size = 0.8).
predict_model(tuned_catboost)# test data evaluation

In [None]:
# Produce the final model from the tuned one and display it.
# NOTE(review): per the PyCaret docs, finalize_model refits on the complete dataset,
# test split included — confirm for the installed version.
final_catboost = finalize_model(tuned_catboost) # final model
final_catboost

In [None]:
# NOTE(review): this is not an unbiased test evaluation. Per the PyCaret docs,
# finalize_model refits on the full dataset (test split included), so the metrics
# printed here are computed on rows the model has already been trained on and will
# look optimistic. Use predict_model(tuned_catboost) for test-split performance and
# data_unseen for a final check.
predict_model(final_catboost)# re-scores the test split with the finalized model

In [None]:
# Score the rows that were set aside as data_unseen with the finalized model
# and preview the resulting frame.
unseen_predictions = predict_model(final_catboost, data=data_unseen)# predictions on unseen data
unseen_predictions.head()

In [None]:
# Mean of the Score column over the unseen predictions, rounded to two decimals.
print("Confidence Score :   {}".format(round(unseen_predictions['Score'].mean(), 2)))