In [None]:
import warnings
warnings.filterwarnings('ignore')

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import Birch, MiniBatchKMeans, DBSCAN, AffinityPropagation, KMeans
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, to_tree, inconsistent, maxRstat
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
import matplotlib.patches as patches
from scipy.cluster.hierarchy import fcluster
from tqdm import tqdm
from scipy.spatial import KDTree
from IPython.core.display import display, HTML
pd.set_option('display.max_colwidth', 1500)
from scipy.sparse import save_npz, load_npz
from sklearn.decomposition import PCA
from pymystem3 import Mystem
import re
from transliterate import translit, get_available_language_codes
import os


import time

# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()


# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Print numpy arrays with 5 decimals and without scientific notation.
np.set_printoptions(precision=5, suppress=True)

In [None]:
# Load the full resume table; the first CSV column becomes the DataFrame index,
# which later cells use to look resumes up via `.loc`.
all_resuses = pd.read_csv(
    '/home/shurik2533/all_resumes.csv',
    index_col=0,
)

In [None]:
# Resume counts per professional area (value_counts sorts by descending count).
pa = all_resuses.professional_area.value_counts()
pa_names = pa.index

# File-system friendly name for every area: drop all non-word characters, then
# transliterate the Russian text to Latin letters.
# The pattern is a raw string: "\W" inside a normal literal is an invalid escape
# sequence (a warning today, hidden here by warnings.filterwarnings('ignore')).
pa_translits = [translit(re.sub(r"\W+", '', v), 'ru', reversed=True) for v in pa_names]

# True only for the IT area; the report loop computes "top skills" only for
# areas flagged here.
use_skills = [v == 'Информационные технологии, интернет, телеком' for v in pa_names]

In [None]:
# Number of flat clusters to cut the hierarchy into for each professional area.
# The value is passed to fcluster(..., criterion='maxclust') in the report loop
# and is also part of every generated report file name (class_<count>_<n>.html).
# Keys must match the values of the `professional_area` column exactly.
c_counts = {
    'Продажи': 170, 
    'Начало карьеры, студенты': 200, 
    'Административный персонал': 200,
    'Бухгалтерия, управленческий учет, финансы предприятия': 200, 
    'Производство': 200,
    'Информационные технологии, интернет, телеком': 190,
    'Транспорт, логистика': 200,
    'Строительство, недвижимость': 200, 
    'Высший менеджмент': 220,
    'Маркетинг, реклама, PR': 150, 
    'Банки, инвестиции, лизинг': 220,
    'Управление персоналом, тренинги': 220, 
    'Юристы': 180,
    'Туризм, гостиницы, рестораны': 150, 
    'Рабочий персонал': 200,
    'Искусство, развлечения, масс-медиа': 180, 
    'Безопасность': 180,
    'Медицина, фармацевтика': 190, 
    'Наука, образование': 220, 
    'Автомобильный бизнес': 200,
    'Закупки': 240, 
    'Добыча сырья': 200, 
    'Спортивные клубы, фитнес, салоны красоты': 200,
    'Государственная служба, некоммерческие организации': 180,
    'Консультирование': 180, 
    'Домашний персонал': 150, 
    'Страхование': 160,
    'Инсталляция и сервис': 200
}

In [None]:
# Single shared Mystem instance; it is reused by every tokenize() call.
mystem = Mystem()

# Raw string on purpose: "\w" in a normal literal is an invalid escape sequence.
_NON_WORD_RE = re.compile(r"[^\w]")


def tokenize(text):
    """Split ``text`` into lemmas.

    The text is lower-cased, every non-word character is replaced by a space,
    and the result is passed to ``mystem.lemmatize``. Only tokens that are
    longer than one character after stripping whitespace are kept (the tokens
    themselves are returned unstripped, exactly as Mystem produced them).

    Parameters
    ----------
    text : str
        Text to tokenize. Any non-string value (e.g. the float NaN pandas uses
        for missing cells) yields an empty list instead of raising.

    Returns
    -------
    list of str
    """
    if not isinstance(text, str):
        return []
    lemmas = mystem.lemmatize(_NON_WORD_RE.sub(" ", text.lower()))
    return [lemma for lemma in lemmas if len(lemma.strip()) > 1]

In [None]:
# CSS for the generated report pages: `table.dataframe` styles the tables
# produced by DataFrame.to_html(), `table.main` styles the page layout table.
# Every brace is doubled because this string is concatenated into `body`,
# which is later run through str.format(); "{{" / "}}" come out as "{" / "}".
style = """<style>
table.dataframe {{
font-family: Arial, Helvetica, sans-serif;
border: 1px solid #FFFFFF;
text-align: center;
border-collapse: collapse;
}}
table.dataframe td, table.dataframe th {{
border: 1px solid #FFFFFF;
font-size: 12px;
padding: 1px 1px;
}}
table.dataframe tbody td {{
font-size: 12px;
}}
table.dataframe tr:nth-child(even) {{
background: #D0E4F5;
}}
table.dataframe thead {{
background: #0B6FA4;
border-bottom: 3px solid #FFFFFF;
}}
table.dataframe thead th {{
font-size: 12px;
color: #FFFFFF;
text-align: center;
border-left: 0px solid #FFFFFF;
}}
table.dataframe thead th:first-child {{
border-left: none;
}}

table.main {{
font-family: Arial, Helvetica, sans-serif;
border: 1px solid #FFFFFF;
border-collapse: collapse;
}}
table.main td, table.main th {{
border: 1px solid #FFFFFF;
font-size: 12px;
padding: 1px 1px;
}}
</style>
"""

# str.format() templates used to assemble one HTML page per cluster.
# Full page; the single placeholder receives the page content.
body = '<html><head><meta charset="utf-8">' + style + '</head><body>{}</body></html>'
# Two-column layout: left cell = list of all classes, right cell = class details.
table = '<table class="main" style="text-align: left;"><tr><td style="width:170px; display: block;" valign="top">{}</td><td valign="top">{}</td></tr></table>'
# One entry of the class list: "<class name or link>. (<count>)".
div = '<div style="padding-bottom: 5px;">{}. ({})</div>'
# Report file name: class_<number of clusters>_<cluster id>.html
f_pattern = "class_{}_{}.html"
# Hyperlink: (href, link text).
a_cls = '<a href="{}">{}</a>'
# Human-readable class label.
cl_name = 'Class {}'
# Resume columns shown in the per-class table.
report_cols = ['position','professional_area','spec_name','skills','compensation','work_experience_month_count','area_id','gender','birth_day','work_schedule','education_level']
# Page header: first row holds the class image, second row the navigation/summary.
header = '<table><tr><td>{}</td></tr><tr><td style="font-size:150%">{}</td></tr></table>'
# Class scatter image, relative to the report directory: (number of clusters, cluster id).
img = '<img src="i/class_{}_{}.png">'
# Maximum number of resumes sampled into each class page.
rep_size = 100

# Replaces the 1500 set in the import cell; this is the value in effect when
# the report loop calls to_html().
pd.set_option('display.max_colwidth', 400)

In [None]:
%%time
# Root directory holding the per-area inputs and every generated artefact.
res_dir = '/home/shurik2533/res_X'

# Make sure the plot output directories exist before any expensive work starts.
for sub_dir in ('dendrograms', 'acceleration', 'clusters'):
    os.makedirs('{}/{}'.format(res_dir, sub_dir), exist_ok=True)

# Figures are only written to disk, never shown interactively.
plt.ioff()

# For each of the last ten professional areas:
#   1. load the feature matrix (`data`), the 2-D plot coordinates (`info`) and
#      the matching resumes (`res`);
#   2. build a Ward linkage (on k-means centroids for very large areas);
#   3. save the dendrogram and the distance/acceleration plot;
#   4. cut the tree into c_counts[pa_name] flat clusters;
#   5. write one HTML page plus one scatter image per flat cluster.
for pa_name, pa_translit, use_skill in zip(pa_names[-10:], pa_translits[-10:], use_skills[-10:]):
    info = pd.read_csv('{}/for_vis_{}.csv'.format(res_dir, pa_translit), index_col=0)
    data = pd.read_csv('{}/X_{}.csv'.format(res_dir, pa_translit), index_col=0)
    resumes_sample = pd.read_csv('{}/resumes_sample_{}.csv'.format(res_dir, pa_translit), index_col=0)
    res = all_resuses.loc[data.index]
    print(pa_name, info.shape, data.shape, res.shape, resumes_sample.shape)

    if data.shape[0] > 100000:
        # Large areas are first reduced to 70000 k-means centroids; the
        # hierarchy is then built on the centroids instead of the raw rows.
        print('Start MiniBatchKMeans')
        km = MiniBatchKMeans(n_clusters=70000, init='k-means++', random_state=42, init_size=140000, reassignment_ratio=0.1, batch_size=50, n_init=8)
        predicted = km.fit_predict(data)
        cluster_centers = km.cluster_centers_
        print('Finish MiniBatchKMeans')
    else:
        # Every row is its own linkage leaf.
        predicted = np.arange(data.shape[0])
        cluster_centers = data

    # Row label -> linkage leaf index.
    prd = pd.DataFrame(predicted, index=data.index)

    print('Start Z')
    start_time = time.time()
    Z = linkage(cluster_centers, 'ward')
    print("--- %s minutes ---" % ((time.time() - start_time) / 60))
    pd.DataFrame(Z).to_csv('{}/Z_{}.csv'.format(res_dir, pa_translit))
    print('Finish Z')
    # A previously saved linkage can be reloaded instead of recomputed with:
    #     Z = pd.read_csv('/home/shurik2533/res_X/Z_{}.csv'.format(pa_translit))[['0', '1', '2', '3']].values
    if data.shape[0] <= 30000:
        # The cophenetic correlation needs all pairwise distances, so it is
        # only computed for the smaller areas.
        c, coph_dists = cophenet(Z, pdist(cluster_centers))
        print('C', c)

    # --- dendrogram ---------------------------------------------------------
    plt.figure(figsize=(16, 6))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        leaf_rotation=90.,
        leaf_font_size=8.,
        truncate_mode='level',
        p=16,
        color_threshold=5,
        no_labels=True
    )
    plt.savefig('{}/dendrograms/{}'.format(res_dir, pa_translit), bbox_inches='tight')
    plt.close()

    # --- merge distance and its 2nd difference over the last 800 merges -----
    last = Z[-800:, 2]
    last_rev = last[::-1]
    idxs = np.arange(1, len(last) + 1)
    acceleration = np.diff(last, 2)  # 2nd derivative of the distances
    acceleration_rev = acceleration[::-1]

    plt.figure(figsize=(16, 6))
    plt.plot(idxs, last_rev, label='clusters')
    plt.plot(idxs[:-2] + 1, acceleration_rev, label='acceleration')
    plt.title(pa_name)
    plt.xlabel('clusters')
    plt.ylabel('distance')
    plt.legend(loc='upper right')
    plt.savefig('{}/acceleration/{}'.format(res_dir, pa_translit), bbox_inches='tight')
    plt.close()

    # --- flat clusters ------------------------------------------------------
    # The cluster count is taken from the hand-tuned c_counts table; the
    # acceleration plot above is only a visual aid for choosing those values.
    k = c_counts[pa_name]
    clusters = fcluster(Z, k, criterion='maxclust')

    # Flat cluster id of every row of `data` and of every plotted point.
    predicted_info = prd.loc[info.index][0].values
    flat_of_row = clusters[predicted]
    flat_of_info = clusters[predicted_info]

    plt.figure(figsize=(15, 15))
    plt.scatter(info.x, info.y, c=flat_of_info, cmap='prism', s=1.5)  # plot points with cluster dependent colors
    plt.savefig('{}/clusters/{}'.format(res_dir, pa_translit), bbox_inches='tight')
    plt.close()

    # --- per-cluster HTML reports -------------------------------------------
    # Apostrophes produced by the transliteration are dropped from the
    # directory name.
    report_dir = '{}/reports/reports_{}'.format(res_dir, pa_translit.replace("'", ""))
    os.makedirs('{}/i'.format(report_dir), exist_ok=True)

    # NOTE(review): these counts are taken over `clusters`, i.e. one entry per
    # linkage leaf. When MiniBatchKMeans was used the leaves are centroids, so
    # the sidebar numbers count centroids while "Count:" in the page header
    # counts resumes - confirm this is intended.
    clusters_counts = pd.Series(clusters).value_counts()

    for i in tqdm(range(1, k + 1)):
        # Sidebar: every class, linked except for the page being written.
        sidebar_items = []
        for j in clusters_counts.keys():
            label = cl_name.format(j)
            if i != j:
                label = a_cls.format(f_pattern.format(k, j), label)
            sidebar_items.append(div.format(label, clusters_counts.loc[j]))
        classes_list = ''.join(sidebar_items)

        in_cluster = flat_of_row == i
        cluster_count = int(in_cluster.sum())

        resumes_for_group = res.loc[in_cluster, report_cols]
        resumes_for_group_size = resumes_for_group.shape[0]
        if resumes_for_group_size > 0:
            # Missing positions/skills are skipped instead of crashing the run.
            positions = resumes_for_group.position.dropna().astype(str)
            position_counts = positions.value_counts()
            cluster_name = position_counts.index[0] if len(position_counts) else ''
            words = [word for tokens in positions.apply(tokenize) for word in tokens]
            top_position = pd.Series(words, dtype=object).value_counts().index[:5].values
            if use_skill:
                skills = [
                    skill.strip()
                    for entry in resumes_for_group.skills.dropna().astype(str)
                    for skill in entry.split(',')
                ]
                top_skills = pd.Series(skills, dtype=object).value_counts().index[:15].values
            else:
                top_skills = ['-']
            # Fixed seed so that re-running the notebook reproduces the report.
            resumes_html = resumes_for_group.sample(
                min(resumes_for_group_size, rep_size), random_state=42
            ).to_html()
        else:
            # fcluster may return fewer than k clusters; such pages stay empty.
            resumes_html = 'Мало данных'
            top_position = []
            top_skills = []
            cluster_name = ''

        # Previous / next navigation; the ends of the range get a placeholder.
        if i == 1:
            back = 'Class -'
        else:
            back = a_cls.format(f_pattern.format(k, i - 1), cl_name.format(i - 1))
        if i == k:
            forward = 'Class -'
        else:
            forward = a_cls.format(f_pattern.format(k, i + 1), cl_name.format(i + 1))

        cluster_info = '{} ({})'.format(cluster_name, ', '.join(top_position))
        top_skills_str = '<div style="font-size:140%; padding-bottom: 5px;"><b>Top skills</b>: {}</div>'.format(', '.join(top_skills))
        classes_info = header.format(img.format(k, i), '<div>&nbsp;&lt;{}&nbsp;&nbsp;{}&nbsp;&nbsp;{}&gt;&nbsp;<br>{}. Count: {}</div>'.format(back, cl_name.format(i), forward, cluster_info, cluster_count)) + top_skills_str + resumes_html
        content = table.format(classes_list, classes_info)

        # The page declares charset="utf-8", so it must be written as UTF-8
        # regardless of the platform's default encoding.
        with open('{}/{}'.format(report_dir, f_pattern.format(k, i)), "w", encoding='utf-8') as f:
            f.write(body.format(content))

        # Highlight the members of this class on the 2-D projection. The
        # extension is spelled out because the page's <img> tag expects .png.
        plt.figure(figsize=(6, 6))
        plt.title('class_{}_{}'.format(k, i))
        plt.scatter(info.x, info.y, c=(flat_of_info == i).astype(int), cmap='prism_r', s=0.5)
        plt.savefig('{}/i/class_{}_{}.png'.format(report_dir, k, i), bbox_inches='tight')
        plt.close()
    

In [None]:
# Index page: one link per professional area, pointing at the first class page
# of that area's report (reports_<area>/class_<cluster count>_1.html).
htm = '<html><head><meta charset="utf-8"></head><body>{}</body></html>'

# Apostrophes are stripped from the transliterated name, matching the report
# directory names created by the report loop.
cont = ''.join(
    '<div style="font-size:160%"><a href="reports_{}/class_{}_1.html">{}</a></div>'.format(
        pa_translit.replace("'", ""), c_counts[pa_name], pa_name
    )
    for pa_name, pa_translit in zip(pa_names, pa_translits)
)

# The page declares charset="utf-8" and contains Cyrillic names, so the file is
# written as UTF-8 explicitly instead of using the platform default encoding.
with open('/home/shurik2533/res_X/reports/index.html', "w", encoding='utf-8') as f:
    f.write(htm.format(cont))