# HR-Analytics Data With Hierarchical_Clusters

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

In [None]:
# Load the train, test and sample-submission CSVs, in that order.
data_train_hr, data_test_hr, data_submission_hr = map(
    pd.read_csv,
    (
        '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv',
        '../input/hr-analytics-job-change-of-data-scientists/aug_test.csv',
        '../input/hr-analytics-job-change-of-data-scientists/sample_submission.csv',
    ),
)

In [None]:
# Stack the train and test frames into a single frame and display it.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement and, like append, keeps each frame's own row
# index labels.
data_hr = pd.concat([data_train_hr, data_test_hr])
data_hr

In [None]:
# Preview the first rows of the combined frame.
data_hr.head()

In [None]:
# Preview the first rows of the sample-submission frame.
data_submission_hr.head()

# Information about Data

In [None]:
# Print the frame summary (per-column non-null counts and dtypes).
# info is a method: without the call parentheses the cell only echoes the
# bound-method object and the summary is never produced.
data_hr.info()

In [None]:
# List the column labels of the combined frame.
data_hr.columns

In [None]:
# (number of rows, number of columns) of the combined frame.
data_hr.shape

In [None]:
# Boolean frame of the same shape as data_hr: True wherever a value is missing.
data_hr.isnull()

In [None]:
# Number of missing values in each column.
data_hr.isnull().sum()

# Drop rows with null values

In [None]:
# Remove every ROW that has at least one missing value (dropna's defaults are
# axis=0, how='any'); inplace=True mutates data_hr instead of returning a copy.
data_hr.dropna(inplace=True)

In [None]:
# Re-count missing values per column; after the dropna above every count is 0.
data_hr.isnull().sum()

In [None]:
# Show the dtype of each column.
data_hr.dtypes

# Show Numeric and Non Numeric Columns 

In [None]:
# Partition the frame by dtype: numeric columns vs. everything else.
df_numeric = data_hr.select_dtypes(include=[np.number])
df_non_numeric = data_hr.select_dtypes(exclude=[np.number])

# Keep the column labels of each partition as NumPy arrays.
numeric_cols = df_numeric.columns.values
non_numeric_cols = df_non_numeric.columns.values

# Heading on one line, labels on the next.
print('Numeric Columns:', numeric_cols, sep='\n')
print('Non Numeric Columns:', non_numeric_cols, sep='\n')


# Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder object is re-fitted for every column, so after this cell it only
# remembers the classes of the last column it saw.
label_En = LabelEncoder()

# Columns encoded straight from their stored values.
for _col in ('city', 'relevent_experience', 'enrolled_university'):
    data_hr[_col] = label_En.fit_transform(data_hr[_col])

# Columns cast to str before encoding so the encoder receives uniformly typed
# values.
for _col in (
    'gender',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
):
    data_hr[_col] = label_En.fit_transform(data_hr[_col].astype(str))

In [None]:
# Preview the frame after label encoding.
data_hr.head()

In [None]:
# Y holds the 'target' column; X holds every other column as features.
Y = data_hr['target']
X = data_hr.drop(columns='target')

In [None]:
# Show the first 5 rows of the feature frame X.
X.head()

In [None]:
# Show the first 5 values of the target series Y.
Y.head()

# Splitting Data To Train and Test Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing and keep 70% for training; the fixed
# random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

# Dendrogram

In [None]:
# Dendrogram of a small slice of the training features, built directly with
# SciPy's Ward linkage (no AgglomerativeClustering model is involved here).
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
# x_train[1:30] selects the rows at positions 1 through 29.
dendrogram = sch.dendrogram(sch.linkage(x_train[1:30], method='ward'))
# Title corrected from the misspelled 'Taining Set'.
plt.title('Training Set')
plt.xlabel('X Values')
plt.ylabel('Distance')
plt.show()

In [None]:
# Dendrogram of a small slice of the test features, built directly with
# SciPy's Ward linkage (no AgglomerativeClustering model is involved here).
import matplotlib.pyplot as plt
# x_test[1:30] selects the rows at positions 1 through 29.
dendrogram = sch.dendrogram(sch.linkage(x_test[1:30], method='ward'))
# Title corrected from the misspelled 'Testting Set'.
plt.title('Testing Set')
plt.xlabel('X Values')
plt.ylabel('Distance')
plt.show()

# Fitting the Cluster Model

In [None]:
# Configure the agglomerative model for two clusters; nothing is fitted yet.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed to `metric`). Ward linkage works on Euclidean distances, which is
# the default, so the argument is omitted to run on both old and new versions.
AggClusterModel = AgglomerativeClustering(n_clusters=2, linkage='ward')

In [None]:
# Cluster each split independently. AgglomerativeClustering has no separate
# predict step, so the second call refits the same estimator on x_test; the
# cluster ids of the two runs are therefore not tied to each other.
y_pred_train = AggClusterModel.fit(x_train).labels_
y_pred_test = AggClusterModel.fit(x_test).labels_

In [None]:
# (rows, columns) of the training feature frame.
x_train.shape

In [None]:
# Shape of the training cluster-label array: one label per training row.
y_pred_train.shape

In [None]:
# Peek at the training cluster labels at positions 2 through 14.
y_pred_train[2:15]

In [None]:
# (rows, columns) of the test feature frame.
x_test.shape

In [None]:
# Shape of the test cluster-label array: one label per test row.
y_pred_test.shape

In [None]:
# Peek at the test cluster labels at positions 2 through 14.
y_pred_test[2:15]

In [None]:
# Ward-linkage dendrogram of the first 60 training rows (positions 0-59).
dendrogram = sch.dendrogram(sch.linkage(x_train.iloc[:60], method='ward'))
plt.ylabel('Distances')
plt.xlabel('X values')
plt.title('Training Set')
plt.show()
# Author's observation on the plot: two clusters, drawn in red and green.

In [None]:
# Ward-linkage dendrogram of the first 10 training rows (positions 0-9).
dendrogram = sch.dendrogram(sch.linkage(x_train.iloc[:10], method='ward'))
plt.ylabel('Distances')
plt.xlabel('X values')
plt.title('Training Set')
plt.show()
# Author's observation on the plot: two clusters, drawn in red and green.

In [None]:
# Ward-linkage dendrogram of the test rows at positions 1 through 19.
dendrogram = sch.dendrogram(sch.linkage(x_test[1:20] , method='ward'))
# Title corrected from the misspelled 'Testting Set'.
plt.title('Testing Set')
plt.xlabel('X Values')
plt.ylabel('Distance')
plt.show()

In [None]:
# Ward-linkage dendrogram of the test rows at positions 1 through 49.
dendrogram = sch.dendrogram(sch.linkage(x_test[1:50], method='ward'))
# Title corrected from the misspelled 'Testting Set'.
plt.title('Testing Set')
plt.xlabel('X Values')
plt.ylabel('Distance')
plt.show()

# Draw Scatter Plots for the Train and Test Sets

In [None]:
# Cluster ids (0 or 1, since the model was built with n_clusters=2) from the
# estimator's most recent fit.
# NOTE(review): the last fit_predict call on this model was on x_test, so these
# are the test-split labels, not the training ones -- confirm that is intended.
AggClusterModel.labels_

In [None]:
# First ten cluster ids assigned to the test split.
y_pred_test[ :10]

In [None]:
# First ten true target values of the test split, to eyeball against the
# cluster ids. Cluster ids are arbitrary, so 0/1 need not line up with the
# target's 0/1.
y_test[ :10]

In [None]:
# Print the summary (non-null counts and dtypes) of the training features.
x_train.info()

In [None]:
# Preview the first rows of the training features.
x_train.head()

In [None]:
# Scatter the first two feature columns of the training split, one colour per
# cluster id. Add further (id, colour, label) entries if n_clusters is raised.
for cluster_id, colour, legend_label in (
    (0, 'red', 'Cluster1'),
    (1, 'blue', 'Cluster2'),
):
    members = x_train[y_pred_train == cluster_id]
    plt.scatter(members.iloc[:, 0], members.iloc[:, 1], s=2, c=colour, label=legend_label)

plt.title('Training Set')
plt.xlabel('X values')
plt.ylabel('Y values')
plt.legend()
plt.show()

In [None]:
# Same scatter as the full training plot, but restricted to the rows at
# positions 5 through 29 within each cluster. Add further (id, colour, label)
# entries if n_clusters is raised.
for cluster_id, colour, legend_label in (
    (0, 'red', 'Cluster1'),
    (1, 'blue', 'Cluster2'),
):
    members = x_train[y_pred_train == cluster_id]
    plt.scatter(members.iloc[5:30, 0], members.iloc[5:30, 1], s=2, c=colour, label=legend_label)

plt.title('Training Set')
plt.xlabel('X values')
plt.ylabel('Y values')
plt.legend()
plt.show()

In [None]:
# Scatter the first two feature columns of the TEST split, restricted to the
# rows at positions 5 through 29 within each cluster, one colour per cluster id.
plt.scatter(x_test[y_pred_test == 0].iloc[5:30,0], x_test[y_pred_test == 0].iloc[5:30,1] , s=2 , c='red' , label='Cluster1')
plt.scatter(x_test[y_pred_test == 1].iloc[5:30,0], x_test[y_pred_test == 1].iloc[5:30,1] , s=2 , c='blue' , label='Cluster2')
# Add one more plt.scatter line per extra cluster id if n_clusters is raised.
# The data plotted here is x_test, so the title must say so (it previously
# read 'Training Set').
plt.title('Testing Set')
plt.xlabel('X values')
plt.ylabel('Y values')
plt.legend()
plt.show()