In [None]:

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
# important libraries
import numpy as np # linear algebra
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import tree

from kmodes.kprototypes import KPrototypes
from sklearn.metrics import silhouette_score

In [None]:
# Read the training CSV into the frame that every later cell works on.
train = pd.read_csv(
    '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
)


In [None]:
# Name of the label column.
target_col = 'target'

# Overall size of the dataset.
n_rows, n_cols = train.shape
print('Total number of rows =', n_rows)
print('Total number of columns =', n_cols)
print('===================================')
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
train.info()
print('===================================')
# Report only the columns that actually contain missing values.
null_counts = train.isnull().sum()
for column, null_value in null_counts[null_counts > 0].items():
    print(f'This column {column} has = {null_value}')
        
# Majority of our data is object type

In [None]:
# last_new_job: inspect the category counts, then impute missing values as 'never'.
print(train['last_new_job'].value_counts())
print(train['last_new_job'].shape)
# Assign the filled column back to the frame. The chained form
# `train.last_new_job.fillna(..., inplace=True)` operates on a temporary
# Series under pandas Copy-on-Write and can leave `train` unchanged.
train['last_new_job'] = train['last_new_job'].fillna('never')

In [None]:
# company_size: inspect the category counts.
print(train['company_size'].value_counts())
print(train['company_size'].shape)
print('==================================')
# company_type: inspect the category counts.
print(train['company_type'].value_counts())
print(train['company_type'].shape)


# Both columns are imputed with a placeholder company:
# company_size = '0-100', company_type = 'Other'.
# The result is assigned back to the frame because a chained
# `train.col.fillna(..., inplace=True)` acts on a temporary Series under
# pandas Copy-on-Write and can leave `train` unchanged.
train['company_size'] = train['company_size'].fillna('0-100')
train['company_type'] = train['company_type'].fillna('Other')

In [None]:
# experience: inspect the category counts.
print(train['experience'].value_counts())
print(train['experience'].shape)

# Fill missing values with a dedicated '0' category.
# The fill value is the string '0' rather than the integer 0 so the column
# keeps a single value type (NOTE(review): assumes the raw column holds
# string labels — confirm against the value_counts output above).
# The result is assigned back because a chained
# `train.experience.fillna(..., inplace=True)` acts on a temporary Series
# under pandas Copy-on-Write and can leave `train` unchanged.
train['experience'] = train['experience'].fillna('0')

In [None]:
# major_discipline: inspect the category counts.
print(train['major_discipline'].value_counts())
print(train['major_discipline'].shape)

# Missing majors get their own 'Not_applicable' category
# (the author observed they occur with high-school education levels).
# Assign back to the frame: a chained `train.col.fillna(..., inplace=True)`
# acts on a temporary Series under pandas Copy-on-Write.
train['major_discipline'] = train['major_discipline'].fillna('Not_applicable')

In [None]:
# education_level: show the category counts and the row count before and
# after removing rows where the education level is missing.
education_col = 'education_level'
print(train[education_col].value_counts())
print(train[education_col].shape)
# Rows without an education level are dropped from the frame in place.
train.dropna(subset=[education_col], inplace=True)
print(train[education_col].shape)

In [None]:
# enrolled_university: inspect the category counts.
print(train['enrolled_university'].value_counts())
# Missing values are assumed to mean the person is not enrolled.
# Assign back to the frame: a chained `train.col.fillna(..., inplace=True)`
# acts on a temporary Series under pandas Copy-on-Write.
train['enrolled_university'] = train['enrolled_university'].fillna('no_enrollment')

In [None]:
# gender: inspect the category counts.
print(train['gender'].value_counts())
# Missing values (not stated or not disclosed) are folded into the
# existing 'Other' category.
# Assign back to the frame: a chained `train.gender.fillna(..., inplace=True)`
# acts on a temporary Series under pandas Copy-on-Write.
train['gender'] = train['gender'].fillna('Other')

In [None]:
# Now that the null values are filled, let's visualize the data to understand it.
# The main goal is to find which factors contribute most to the target column, i.e. `target`.

In [None]:
# Number of rows per city.
train['city'].value_counts()

In [None]:

# One count plot per categorical feature, split by target, on a 4x2 grid.
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(15, 20))
panel_features = [
    "relevent_experience", "enrolled_university",
    "education_level", "major_discipline",
    "experience", "company_size",
    "company_type", "last_new_job",
]
# axs.flat walks the grid row by row, so the features land in list order.
for feature, panel in zip(panel_features, axs.flat):
    sns.countplot(x=feature, hue='target', data=train, ax=panel)

In [None]:
# Take the 1000 rows with the lowest city development index and plot the
# index per city as horizontal bars.
cdi = train.sort_values('city_development_index').head(1000)
figure = plt.figure(figsize=(10, 6))
sns.barplot(x=cdi['city_development_index'], y=cdi['city'])
plt.xticks()
plt.title('City by city development index')
plt.ylabel('city')
plt.xlabel('city_development_index')
plt.show()

In [None]:
# Distribution of the city development index.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# scaling is its replacement.
sns.histplot(train['city_development_index'], kde=True, stat='density')

In [None]:
! pip install dython
from dython.model_utils import roc_graph
from dython.nominal import associations

# We will use this to find the correlation between categorical columns.

In [None]:
# Display the column labels of the training frame.
train.columns

In [None]:
# Subset of the frame used for the association analysis: two numeric columns,
# the categorical features, and the target.
association_columns = [
    'city_development_index', 'training_hours',
    'city', 'gender', 'relevent_experience',
    'enrolled_university', 'education_level',
    'major_discipline', 'experience', 'company_size', 'company_type',
    'last_new_job', 'target',
]
Cat_data = train[association_columns]
def associations_example():
    """Compute pairwise associations between the columns of `Cat_data`.

    Calls dython's `associations` on the module-level `Cat_data` frame,
    declaring the categorical features and the target as nominal columns.

    Returns:
        pd.DataFrame built from the 'corr' entry of the `associations` result.
    """
    nominal = [
        'city', 'gender', 'relevent_experience',
        'enrolled_university', 'education_level',
        'major_discipline', 'experience', 'company_size', 'company_type',
        'last_new_job', 'target',
    ]
    result = associations(Cat_data, nominal_columns=nominal)
    return pd.DataFrame(result['corr'])
#     cm = data[cols].corr()
    
# Set the default figure size for subsequent figures, then compute the association matrix.
plt.rcParams["figure.figsize"] = (15, 10)
df = associations_example()

In [None]:
# Order the association matrix rows by their association with the target,
# strongest first, and show just that column.
df = df.sort_values(by='target', ascending=False)
df.loc[:, ['target']]

# This shows how strongly each factor is correlated with the target column.
# Since we will be using a tree classifier, there is no need to worry about high correlation.

In [None]:
# Pearson correlation of each numeric feature with the target, as a bar chart.
X = train.drop(['target'], axis=1)
# Restrict to numeric columns explicitly: at this point the categorical
# features are still strings, and recent pandas versions raise on them
# instead of silently skipping them.
numeric_features = X.select_dtypes(include='number')
numeric_features.corrwith(train['target']).plot.bar(
        figsize = (10, 5), title = "Correlation with Target", fontsize = 10,
        rot = 50, grid = True)

In [None]:
# A decision tree will be used to see which columns are the determining factors.
# First, convert each categorical column into integer codes.
# (The original list named 'relevent_experience' twice; each column is listed once here.)
cat_cols = ['city','relevent_experience','gender',
           'enrolled_university','education_level',
           'major_discipline','experience','company_size',
           'company_type','last_new_job']
for col in cat_cols:
    # value_counts() lists categories from most to least frequent, so the
    # most common category receives code 0, the next one code 1, and so on.
    categories = train[col].value_counts().index
    code_of = {category: code for code, category in enumerate(categories)}
    # map() does a direct per-value lookup; unlike replace() with a dict it
    # does not depend on dtype downcasting and is unaffected by keys that
    # coincide with codes.
    train[col] = train[col].map(code_of)

In [None]:
# Remove the identifier column from the frame.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the column is already gone.
train = train.drop(columns='enrollee_id', errors='ignore')
train.head()

In [None]:
# Split features and label, hold out 23% of the rows (stratified on the
# label), and fit a decision tree on the training part.
X = train.drop(columns='target')
Y = train['target']
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.23, random_state=42, stratify=Y)
# Fix the tree's random_state so the fitted tree, and the feature
# importances read from it later, are reproducible between runs.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)



In [None]:
# Pair each training column with the importance the fitted tree assigned to
# it, order from most to least important, and draw horizontal bars.
valuable_cols = pd.DataFrame(
    {0: clf.feature_importances_, 'index': X_train.columns}
).sort_values(by=0, ascending=False)
sns.barplot(data=valuable_cols, x=0, y='index')


# This tells us feature which will act as deciding factor

 K-Prototype clustering

In [None]:
# Get the list of numeric columns.
# Uses the public select_dtypes API instead of the private
# DataFrame._get_numeric_data() helper.
num_cols = train.select_dtypes(include='number').columns
print (num_cols)

In [None]:
#Choosing optimal K value
# cost = []
# X = train
# for num_clusters in list(range(2,7)):
#     kproto = KPrototypes(n_clusters=num_clusters, init='Huang', random_state=42,n_jobs=-2,max_iter=15,n_init=50) 
#     kproto.fit_predict(X, categorical=[0])
#     cost.append(kproto.cost_)

# plt.plot(cost)
# plt.xlabel('K')
# plt.ylabel('cost')
# plt.show


# The optimal number of clusters came out to be 2.

In [None]:
# Running K-Prototype clustering
X = train
# K-Prototypes must be told the position of every categorical column.
# Passing only [0] would mark just the first column as categorical and treat
# all other label-encoded columns as numeric. The positions are derived from
# `cat_cols` (the columns that were label-encoded earlier); dict.fromkeys
# removes any repeated names while keeping order.
categorical_idx = sorted(
    X.columns.get_loc(name) for name in dict.fromkeys(cat_cols) if name in X.columns
)
# NOTE(review): X still contains the 'target' column, so the label takes part
# in the clustering — confirm this is intended.
kproto = KPrototypes(n_clusters=2, init='Huang', verbose=0, random_state=42,max_iter=20, n_init=50,n_jobs=-2,gamma=.25) 
clusters = kproto.fit_predict(X, categorical=categorical_idx)

In [None]:
# Silhouette score of the cluster assignment, using Euclidean distance on X,
# printed to three decimals.
score = silhouette_score(X, labels=clusters, metric='euclidean')
print('Kprototype Silhouetter Score: %.3f' % score)

In [None]:
# Attach the cluster label to a copy of the data and split it per cluster.
cluster_data = train.copy()
cluster_data['cluster'] = clusters
cluster_data_0 = cluster_data.loc[cluster_data['cluster'] == 0].reset_index(drop=True)
cluster_data_1 = cluster_data.loc[cluster_data['cluster'] == 1].reset_index(drop=True)

# Side-by-side count plots: one row per feature, cluster 0 in the left
# column and cluster 1 in the right column, each split by target.
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(15, 20))
compared_features = ["relevent_experience", "education_level", "experience", "company_type"]
for row, feature in enumerate(compared_features):
    for col, subset in enumerate((cluster_data_0, cluster_data_1)):
        sns.countplot(x=feature, hue='target', data=subset, ax=axs[row, col])

In [None]:
# Project the feature columns (everything except the target) onto four
# principal components.
# NOTE(review): the features are passed to PCA without standardization —
# confirm this is intended.
from sklearn.decomposition import PCA
feat = train.drop(columns=['target'])
pca = PCA(n_components=4)
pca_result = pca.fit_transform(feat.to_numpy())

In [None]:
# Component loadings: one row per principal component, one column per
# feature, rounded to six decimals.
components = pd.DataFrame(pca.components_.round(6), columns=feat.columns.tolist())
components

In [None]:
# Explained-variance ratio of each principal component, as a one-column table.
ratios = pca.explained_variance_ratio_.reshape(-1, 1)
variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance'])
# Number the components from 1 based on how many were fitted, so the table
# stays valid if n_components is changed.
variance_ratios.index = range(1, len(variance_ratios) + 1)
variance_ratios

# In PCA, the first component explains about 88% of the variance in the data, followed by the other components.

In [None]:
# Feature weights of each principal component as grouped bars:
# the columns from position 5 onward in the upper panel, the first five in the lower panel.
fig, ax = plt.subplots(nrows=2, figsize=(15, 15))
upper_panel, lower_panel = ax
components.iloc[:, 5:].plot.bar(ax=upper_panel)
components.iloc[:, :5].plot.bar(ax=lower_panel)

# We can see that training_hours, city, experience, and company_size are quite important, as already explained above.

# **If you like my work please upvote**