In [1]:
# –í—Å–µ –±–∏–±–ª–∏–æ—Ç–µ–∫–∏

import pandas as pd
from scipy.stats import ttest_ind, levene, mannwhitneyu,shapiro, \
    f_oneway, kruskal, chi2_contingency, fisher_exact
import numpy as np
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan
import seaborn as sns
from sklearn.metrics import roc_auc_score, roc_curve
import pingouin as pg
import warnings

warnings.filterwarnings('ignore')

In [2]:
income = pd.read_csv('–∑–∞—Ä–∞–±–æ—Ç–æ–∫ –±–æ–±—Ä–æ–≤.csv', encoding='utf-8', sep=';')
purs = pd.read_csv('–ø–æ–∫—É–ø–∫–∏.csv', encoding='utf-8', sep=';')
staff = pd.read_excel('—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∏.xlsx')

In [3]:
def obr(df):
    df.columns = df.columns.str.replace(' ', '_').str.lower().str.replace('.', '')
    return df

In [4]:
income = obr(income)
purs = obr(purs)
staff = obr(staff)

In [5]:
missions_to_drop = [
    '—Ç—Ä–µ–Ω–µ—Ä',
    'Croc sport',
    'Croc sport( 4 —É—Ä–æ–≤–µ–Ω—å)',
    '–º–∏—Å—Å–∏—è best hunter',
    '–º–∏—Å—Å–∏—è –∫—Ä–µ–∞—Ç–∏–≤–Ω—ã–π –∫–ª–∞—Å—Å',
    '–ø—É—Ç–µ—à–µ—Å—Ç–≤–µ–Ω–Ω–∏–∫ –ö–†–û–ö',
    '—Ä–∞–∑–≤–∏—Ç–∏–µ –±—Ä–µ–Ω–¥–∞ —Ä–∞–±–æ—Ç–æ–¥–∞—Ç–µ–ª—è',
    '–ø—Ä–æ–¥–≤–∏–∂–µ–Ω–∏–µ IT-–ø—Ä–æ—Ñ–µ—Å—Å–∏–π'
]

comments_to_drop = ['–æ—à–∏–±–æ—á–Ω–æ –Ω–∞—á–∏—Å–ª–µ–Ω—ã']

income = income[~income['–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏'].str.lower().isin([m.lower() for m in missions_to_drop])]
income = income[~income['–∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π'].str.lower().isin([m.lower() for m in comments_to_drop])]

In [6]:
income['–¥–∞—Ç–∞'] = pd.to_datetime(income['–¥–∞—Ç–∞'], format='%d.%m.%Y')
today = pd.Timestamp.today().normalize()
income['–¥–Ω–µ–π_–Ω–∞–∑–∞–¥'] = (today - income['–¥–∞—Ç–∞']).dt.days

In [7]:
purs['–¥–∞—Ç–∞_–æ—Ñ–æ—Ä–º–ª–µ–Ω–∏—è_–∑–∞–∫–∞–∑–∞'] = pd.to_datetime(purs['–¥–∞—Ç–∞_–æ—Ñ–æ—Ä–º–ª–µ–Ω–∏—è_–∑–∞–∫–∞–∑–∞'], format='%d.%m.%Y %H:%M')
today = pd.Timestamp.today().normalize()
purs['–¥–Ω–µ–π_–Ω–∞–∑–∞–¥'] = (today - purs['–¥–∞—Ç–∞_–æ—Ñ–æ—Ä–º–ª–µ–Ω–∏—è_–∑–∞–∫–∞–∑–∞'].dt.normalize()).dt.days

In [8]:
import re


def parse_stazh_to_days(text):
    years = months = days = 0

    year_match = re.search(r'(\d+)\s*–≥\.', text)
    if year_match:
        years = int(year_match.group(1))

    month_match = re.search(r'(\d+)\s*–º–µ—Å\.', text)
    if month_match:
        months = int(month_match.group(1))

    day_match = re.search(r'(\d+)\s*–¥–Ω\.', text)
    if day_match:
        days = int(day_match.group(1))

    total_days = years * 360 + months * 30 + days
    return total_days

staff['—Å—Ç–∞–∂_–≤_–¥–Ω—è—Ö'] = staff['—Å—Ç–∞–∂_—Ñ–∞–∫—Ç–∏—á–µ—Å–∫–∏–π_–ø–æ_–∫–æ–º–ø–∞–Ω–∏–∏'].apply(parse_stazh_to_days)

In [9]:
staff['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'] = staff['–≤–Ω–µ—à–Ω–∏–π_–∫–æ–¥']
purs = purs.merge(
    staff[['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞', '—Å—Ç–∞–∂_–≤_–¥–Ω—è—Ö']],
    on='–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞',
    how='left'
)
purs['—Å—Ç–∞–∂_–Ω–∞_–º–æ–º–µ–Ω—Ç_–ø–æ–∫—É–ø–∫–∏'] = purs['—Å—Ç–∞–∂_–≤_–¥–Ω—è—Ö'] - purs['–¥–Ω–µ–π_–Ω–∞–∑–∞–¥']


income = income.merge(
    staff[['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞', '—Å—Ç–∞–∂_–≤_–¥–Ω—è—Ö']],
    on='–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞',
    how='left'
)
income['—Å—Ç–∞–∂_–Ω–∞_–º–æ–º–µ–Ω—Ç_–∑–∞—Ä–∞–±–æ—Ç–∫–∞'] = income['—Å—Ç–∞–∂_–≤_–¥–Ω—è—Ö'] - income['–¥–Ω–µ–π_–Ω–∞–∑–∞–¥']

In [10]:
employees_with_missions = income['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'].unique()
employees_with_purchases = purs['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'].unique()

all_employees = set(employees_with_missions) | set(employees_with_purchases)
percent_with_missions = len(employees_with_missions) / len(all_employees) * 100
percent_with_purchases = len(employees_with_purchases) / len(all_employees) * 100

print(f"–ü—Ä–æ—Ü–µ–Ω—Ç –ª—é–¥–µ–π —Å —Ö–æ—Ç—è –±—ã –æ–¥–Ω–æ–π –º–∏—Å—Å–∏–µ–π: {percent_with_missions:.2f}%")
print(f"–ü—Ä–æ—Ü–µ–Ω—Ç –ª—é–¥–µ–π —Å —Ö–æ—Ç—è –±—ã –æ–¥–Ω–æ–π –ø–æ–∫—É–ø–∫–æ–π: {percent_with_purchases:.2f}%")

–ü—Ä–æ—Ü–µ–Ω—Ç –ª—é–¥–µ–π —Å —Ö–æ—Ç—è –±—ã –æ–¥–Ω–æ–π –º–∏—Å—Å–∏–µ–π: 73.90%
–ü—Ä–æ—Ü–µ–Ω—Ç –ª—é–¥–µ–π —Å —Ö–æ—Ç—è –±—ã –æ–¥–Ω–æ–π –ø–æ–∫—É–ø–∫–æ–π: 92.61%


In [11]:
purs['—Å—É–º–º–∞'] = purs['—Å—Ç–æ–∏–º–æ—Å—Ç—å_–≤_–≤–∞–ª—é—Ç–µ'] * purs['–∫–æ–ª–∏—á–µ—Å—Ç–≤–æ'] * (1 - purs['—Å–∫–∏–¥–∫–∞']/100)
total_spent = purs['—Å—É–º–º–∞'].sum()
unique_buyers = purs['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'].nunique()
avg_spent_per_person = total_spent / unique_buyers

print(f"–°—Ä–µ–¥–Ω—è—è —Å—É–º–º–∞, –ø–æ—Ç—Ä–∞—á–µ–Ω–Ω–∞—è –æ–¥–Ω–∏–º —Å–æ—Ç—Ä—É–¥–Ω–∏–∫–æ–º: {avg_spent_per_person:.2f}")

–°—Ä–µ–¥–Ω—è—è —Å—É–º–º–∞, –ø–æ—Ç—Ä–∞—á–µ–Ω–Ω–∞—è –æ–¥–Ω–∏–º —Å–æ—Ç—Ä—É–¥–Ω–∏–∫–æ–º: 8116.20


In [12]:
good = [
'–¢—ã –≤–æ–≤—Ä–µ–º—è —Å–ø–∏—Å–∞–ª –∑–∞—Ç—Ä–∞—á–µ–Ω–Ω–æ–µ –≤—Ä–µ–º—è –Ω–∞ –∑–∞–¥–∞—á–∏ –∏ –ø—Ä–æ–µ–∫—Ç—ã',
'–¢—ã –≤–æ–≤—Ä–µ–º—è –ø–æ–¥–ø–∏—Å–∞–ª –¥–æ–∫—É–º–µ–Ω—Ç—ã –ø–æ –∫–∞–¥—Ä–æ–≤—ã–º –¥–≤–∏–∂–µ–Ω–∏—è–º',
'–ö—Ä–µ–∞—Ç–∏–≤–Ω—ã–π –∫–ª–∞—Å—Å',
'–¢—ã –ø—Ä–æ–≤–µ—Ä–∏–ª –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –ø—Ä–∏ –æ–±–Ω–æ–≤–ª–µ–Ω–∏–∏ –ø—Ä–æ–µ–∫—Ç–Ω–æ–≥–æ –æ–ø—ã—Ç–∞ (–¥–ª—è —é—Ä–∏—Å—Ç–æ–≤, –∑–∞ 1 –ø—Ä–æ–µ–∫—Ç)',
'–¢—ã –ø—Ä–æ—à–µ–ª –≤—Å–µ –æ–Ω–ª–∞–π–Ω-–∫—É—Ä—Å—ã –¥–ª—è –Ω–æ–≤—ã—Ö —Å–æ—Ç—Ä—É–¥–Ω–∏–∫–æ–≤',
'–¢—ã –ø—Ä–æ–≤–µ—Ä–∏–ª –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –ø—Ä–∏ —Å–±–æ—Ä–µ –ø—Ä–æ–µ–∫—Ç–Ω–æ–≥–æ –æ–ø—ã—Ç–∞ (–¥–ª—è –º–∞—Ä–∫–µ—Ç–æ–ª–æ–≥–æ–≤, —Å–ø–µ—Ü–∏–∞–ª–∏—Å—Ç–æ–≤ —Å–ª—É–∂–±—ã –∫–∞—á–µ—Å—Ç–≤–∞, –∑–∞ 1 –ø—Ä–æ–µ–∫—Ç)',
'–¢—ã –∑–∞–ø–æ–ª–Ω–∏–ª –ø—Ä–æ–µ–∫—Ç–Ω—ã–π –æ–ø—ã—Ç (–¥–ª—è –º–µ–Ω–µ–¥–∂–µ—Ä–æ–≤ –ø—Ä–æ–µ–∫—Ç–∞, –∑–∞ 1 –ø—Ä–æ–µ–∫—Ç)',
'–¢—ã –ø–æ–≤—Ç–æ—Ä–Ω–æ –ø—Ä–æ–≤–µ—Ä–∏–ª –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –ø—Ä–∏ –æ–±–Ω–æ–≤–ª–µ–Ω–∏–∏ –ø—Ä–æ–µ–∫—Ç–Ω–æ–≥–æ –æ–ø—ã—Ç–∞ (–¥–ª—è —é—Ä–∏—Å—Ç–æ–≤, –∑–∞ 1 –ø—Ä–æ–µ–∫—Ç)',
'–¢—ã –ø–æ–≤—Ç–æ—Ä–Ω–æ –ø—Ä–æ–≤–µ—Ä–∏–ª –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –ø—Ä–∏ –æ–±–Ω–æ–≤–ª–µ–Ω–∏–∏ –ø—Ä–æ–µ–∫—Ç–Ω–æ–≥–æ –æ–ø—ã—Ç–∞ (–¥–ª—è –º–∞—Ä–∫–µ—Ç–æ–ª–æ–≥–æ–≤, —Å–ø–µ—Ü–∏–∞–ª–∏—Å—Ç–æ–≤ —Å–ª—É–∂–±—ã –∫–∞—á–µ—Å—Ç–≤–∞, –∑–∞ 1 –ø—Ä–æ–µ–∫—Ç)',
'–ú–∏—Å—Å–∏—è –ö—Ä–µ–∞—Ç–∏–≤–Ω—ã–π –∫–ª–∞—Å—Å',
'–¢—ã –æ–±–Ω–æ–≤–∏–ª –ø–æ–ª–Ω–æ—Å—Ç—å—é –ø—Ä–æ–µ–∫—Ç–Ω—ã–π –æ–ø—ã—Ç (–¥–ª—è –≤ –º–µ–Ω–µ–¥–∂–µ—Ä–∞ –ø—Ä–æ–µ–∫—Ç–∞, –∑–∞ 1 –ø—Ä–æ–µ–∫—Ç)',
'–¢—ã —É—Å–ø–µ—à–Ω–æ —Å–¥–∞–ª —ç–∫–∑–∞–º–µ–Ω*',
'–¢—ã –ø—Ä–æ—à–µ–ª –ø—Ä–æ–≥—Ä–∞–º–º—É –∞–¥–∞–ø—Ç–∞—Ü–∏–∏ –¥–ª—è –∞–Ω–∞–ª–∏—Ç–∏–∫–æ–≤',
'–¢—ã —Å–¥–∞–ª —ç–∫–∑–∞–º–µ–Ω* –Ω–∞ 100%'
]

In [13]:
from scipy.stats import wilcoxon


def good_count(missions):
    return sum(mission in good for mission in missions)


def effectivnost(missions):
    total = len(missions)
    if total == 0:
        return 0.0
    return good_count(missions) / total


def sravnenie(df, missions):
    df['–¥–∞—Ç–∞'] = pd.to_datetime(df['–¥–∞—Ç–∞'])

    results = []
    for employee_id, group in df[['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞', '–¥–∞—Ç–∞', '–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏']].groupby('–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'):
        group = group.sort_values('–¥–∞—Ç–∞')
        top = group[group['–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏'].str.contains('|'.join(missions), na=False)]
        if len(top) == 0:
            continue

        if not top.empty:
            top_data = top['–¥–∞—Ç–∞'].iloc[0]
        else:
            top_data = pd.Timestamp.max

        missions_before = group[group['–¥–∞—Ç–∞'] < top_data]['–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏']
        missions_after = group[group['–¥–∞—Ç–∞'] >= top_data]['–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏']

        start_date = pd.to_datetime("2023-09-02")
        before_diff = (top_data - start_date).days

        end_date = pd.to_datetime("2024-10-15")
        after_diff = (end_date - top_data).days

        k_before = len(missions_before)/before_diff
        k_after = len(missions_after)/after_diff

        good_k_before = good_count(missions_before)/before_diff
        good_k_after = good_count(missions_after)/after_diff

        eff_before = effectivnost(missions_before)
        eff_after = effectivnost(missions_after)

        results.append({
            '–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞': employee_id,
            '—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å_–¥–æ': eff_before,
            '—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å_–ø–æ—Å–ª–µ': eff_after,
            'k_before': k_before,
            'k_after': k_after,
            'good_k_before': good_k_before,
            'good_k_after': good_k_after,
            'missions_before': missions_before.shape[0],
            'missions_after': missions_after.shape[0]
        })

    eff_df = pd.DataFrame(results)

    valid_eff = eff_df.dropna(subset=['—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å_–¥–æ', '—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å_–ø–æ—Å–ª–µ'])

    stat, p_value = wilcoxon(valid_eff['—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å_–¥–æ'], valid_eff['—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å_–ø–æ—Å–ª–µ'])
    stat2, p_value2 = wilcoxon(valid_eff['k_before'], valid_eff['k_after'])
    stat3, p_value3 = wilcoxon(valid_eff['good_k_before'], valid_eff['good_k_after'])

    print(f"{missions} :\n–ü–†–û–¶–ï–ù–¢–´\nWilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = {stat}, p-value = {p_value:.5f}")

    if p_value < 0.05:
        print("üìà –ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é –î–û –∏ –ü–û–°–õ–ï –∫–ª—é—á–µ–≤—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π (p < 0.05)")
    else:
        print("üìâ –°—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–æ–π —Ä–∞–∑–Ω–∏—Ü—ã –Ω–µ –Ω–∞–π–¥–µ–Ω–æ (p ‚â• 0.05)")


    print()
    print(f"–¥–æ : {eff_df['—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å_–¥–æ'].median()}")
    print(f"–ø–æ—Å–ª–µ : {eff_df['—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å_–ø–æ—Å–ª–µ'].median()}")
    print()

    print(f"–ö–û–õ-–í–û –ó–ê–î–ê–ß –í –î–ï–ù–¨\nWilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = {stat2}, p-value = {p_value2:.5f}")

    if p_value2 < 0.05:
        print("üìà –ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é –î–û –∏ –ü–û–°–õ–ï –∫–ª—é—á–µ–≤—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π (p < 0.05)")
    else:
        print("üìâ –°—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–æ–π —Ä–∞–∑–Ω–∏—Ü—ã –Ω–µ –Ω–∞–π–¥–µ–Ω–æ (p ‚â• 0.05)")

    print()
    print(f"–¥–æ : {eff_df['k_before'].median()}")
    print(f"–ø–æ—Å–ª–µ : {eff_df['k_after'].median()}")
    print()

    print(f"–ö–û–õ-–í–û –ö–†–£–¢–´–• –ó–ê–î–ê–ß –í –î–ï–ù–¨\nWilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = {stat3}, p-value = {p_value3:.5f}")

    if p_value3 < 0.05:
        print("–ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é –î–û –∏ –ü–û–°–õ–ï –∫–ª—é—á–µ–≤—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π (p < 0.05)")
    else:
        print("–°—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–æ–π —Ä–∞–∑–Ω–∏—Ü—ã –Ω–µ –Ω–∞–π–¥–µ–Ω–æ (p ‚â• 0.05)")

    print()
    print(f"–¥–æ : {eff_df['good_k_before'].median()}")
    print(f"–ø–æ—Å–ª–µ : {eff_df['good_k_after'].median()}")

    print()
    print(f"–º–µ–¥–∏–∞–Ω–Ω–æ–µ –∫–æ–ª-–≤–æ –º–∏—Å—Å–∏–π –¥–æ: {eff_df['missions_before'].median()}")
    print(f"–º–µ–¥–∏–∞–Ω–Ω–æ–µ –∫–æ–ª-–≤–æ –º–∏—Å—Å–∏–π –ø–æ—Å–ª–µ: {eff_df['missions_after'].median()}")


TheBest = ['HiPro', 'HiPo', '–ö—Ä–µ–∞—Ç–∏–≤–Ω—ã–π –∫–ª–∞—Å—Å', '–ó–≤–µ–∑–¥–∞ –¥–µ–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç–∞']
sravnenie(income, TheBest)

['HiPro', 'HiPo', '–ö—Ä–µ–∞—Ç–∏–≤–Ω—ã–π –∫–ª–∞—Å—Å', '–ó–≤–µ–∑–¥–∞ –¥–µ–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç–∞'] :
–ü–†–û–¶–ï–ù–¢–´
Wilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = 247780.5, p-value = 0.00048
üìà –ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é –î–û –∏ –ü–û–°–õ–ï –∫–ª—é—á–µ–≤—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π (p < 0.05)

–¥–æ : 0.2
–ø–æ—Å–ª–µ : 0.3333333333333333

–ö–û–õ-–í–û –ó–ê–î–ê–ß –í –î–ï–ù–¨
Wilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = 26403.0, p-value = 0.00000
üìà –ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é –î–û –∏ –ü–û–°–õ–ï –∫–ª—é—á–µ–≤—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π (p < 0.05)

–¥–æ : 0.024096385542168676
–ø–æ—Å–ª–µ : 0.07920792079207921

–ö–û–õ-–í–û –ö–†–£–¢–´–• –ó–ê–î–ê–ß –í –î–ï–ù–¨
Wilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = 47683.0, p-value = 0.00000
–ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é –î–û –∏ –ü–û–°–õ–ï –∫–ª—é—á–µ–≤—ã—Ö –∑

In [14]:
from scipy.stats import wilcoxon

def sravnenie_do_i_posle(income, purs, the_best):
    income['–¥–∞—Ç–∞'] = pd.to_datetime(income['–¥–∞—Ç–∞'])
    purs['–¥–∞—Ç–∞_–æ—Ñ–æ—Ä–º–ª–µ–Ω–∏—è_–∑–∞–∫–∞–∑–∞'] = pd.to_datetime(purs['–¥–∞—Ç–∞_–æ—Ñ–æ—Ä–º–ª–µ–Ω–∏—è_–∑–∞–∫–∞–∑–∞'])

    result = []

    for employee_id, group in income.groupby('–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'):
        group = group.sort_values('–¥–∞—Ç–∞')
        top_missions = group[group['–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏'].str.contains('|'.join(the_best), na=False)]
        if top_missions.empty:
            continue

        top_date = top_missions['–¥–∞—Ç–∞'].iloc[0]

        before_missions = group[(group['–¥–∞—Ç–∞'] < top_date) & ~group['–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏'].isin(the_best)]
        after_missions = group[(group['–¥–∞—Ç–∞'] >= top_date) & ~group['–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏'].isin(the_best)]

        def count_purchases(mission_dates):
            count = 0
            total = len(mission_dates)
            for date in mission_dates:
                purchases = purs[
                    (purs['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'] == employee_id) &
                    (purs['–¥–∞—Ç–∞_–æ—Ñ–æ—Ä–º–ª–µ–Ω–∏—è_–∑–∞–∫–∞–∑–∞'] > date) &
                    (purs['–¥–∞—Ç–∞_–æ—Ñ–æ—Ä–º–ª–µ–Ω–∏—è_–∑–∞–∫–∞–∑–∞'] <= date + pd.Timedelta(days=7))
                ]
                if not purchases.empty:
                    count += 1
            return count, total

        before_count, before_total = count_purchases(before_missions['–¥–∞—Ç–∞'])
        after_count, after_total = count_purchases(after_missions['–¥–∞—Ç–∞'])

        if before_total > 0 and after_total > 0:
            result.append({
                '–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞': employee_id,
                '–¥–æ–ª—è_–¥–æ': before_count / before_total,
                '–¥–æ–ª—è_–ø–æ—Å–ª–µ': after_count / after_total
            })

    result_df = pd.DataFrame(result)
    stat, p_value = wilcoxon(result_df['–¥–æ–ª—è_–¥–æ'], result_df['–¥–æ–ª—è_–ø–æ—Å–ª–µ'])

    print("Wilcoxon test –ø–æ –¥–æ–ª—è–º –º–∏—Å—Å–∏–π —Å –ø–æ–∫—É–ø–∫–∞–º–∏:")
    print(f"—Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = {stat}, p-value = {p_value:.5f}")
    if p_value < 0.05:
        print("–ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ (p < 0.05)")
    else:
        print("–ù–µ—Ç —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–æ–π —Ä–∞–∑–Ω–∏—Ü—ã (p ‚â• 0.05)")

    print()
    print(f"–ú–µ–¥–∏–∞–Ω–Ω–∞—è –¥–æ–ª—è –ø–æ–∫—É–ø–æ–∫ –î–û: {result_df['–¥–æ–ª—è_–¥–æ'].median():.3f}")
    print(f"–ú–µ–¥–∏–∞–Ω–Ω–∞—è –¥–æ–ª—è –ø–æ–∫—É–ø–æ–∫ –ü–û–°–õ–ï: {result_df['–¥–æ–ª—è_–ø–æ—Å–ª–µ'].median():.3f}")


sravnenie_do_i_posle(income, purs, TheBest)

Wilcoxon test –ø–æ –¥–æ–ª—è–º –º–∏—Å—Å–∏–π —Å –ø–æ–∫—É–ø–∫–∞–º–∏:
—Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = 80056.5, p-value = 0.00000
–ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ (p < 0.05)

–ú–µ–¥–∏–∞–Ω–Ω–∞—è –¥–æ–ª—è –ø–æ–∫—É–ø–æ–∫ –î–û: 0.000
–ú–µ–¥–∏–∞–Ω–Ω–∞—è –¥–æ–ª—è –ø–æ–∫—É–ø–æ–∫ –ü–û–°–õ–ï: 0.083


In [15]:
for i in staff['—è–≤–ª—è–µ—Ç—Å—è_—Ä–º/—Ç–ª'].value_counts().head(3).reset_index()['—è–≤–ª—è–µ—Ç—Å—è_—Ä–º/—Ç–ª'].unique():
    print(i)
    sravnenie(income[income['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'].isin(staff[staff['—è–≤–ª—è–µ—Ç—Å—è_—Ä–º/—Ç–ª'] == i]['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'].tolist())], TheBest)

–Ω–µ—Ç
['HiPro', 'HiPo', '–ö—Ä–µ–∞—Ç–∏–≤–Ω—ã–π –∫–ª–∞—Å—Å', '–ó–≤–µ–∑–¥–∞ –¥–µ–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç–∞'] :
–ü–†–û–¶–ï–ù–¢–´
Wilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = 167023.0, p-value = 0.01303
üìà –ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é –î–û –∏ –ü–û–°–õ–ï –∫–ª—é—á–µ–≤—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π (p < 0.05)

–¥–æ : 0.23904761904761904
–ø–æ—Å–ª–µ : 0.3333333333333333

–ö–û–õ-–í–û –ó–ê–î–ê–ß –í –î–ï–ù–¨
Wilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = 16941.5, p-value = 0.00000
üìà –ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é –î–û –∏ –ü–û–°–õ–ï –∫–ª—é—á–µ–≤—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π (p < 0.05)

–¥–æ : 0.027399111217871724
–ø–æ—Å–ª–µ : 0.0850531914893617

–ö–û–õ-–í–û –ö–†–£–¢–´–• –ó–ê–î–ê–ß –í –î–ï–ù–¨
Wilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = 32014.0, p-value = 0.00000
–ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç—å—é –î–û –∏ –ü–û–°–õ

In [24]:
def avg_top_date(df, missions):
    df['–¥–∞—Ç–∞'] = pd.to_datetime(df['–¥–∞—Ç–∞'])

    top_dates = []
    for employee_id, group in df[['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞', '–¥–∞—Ç–∞', '–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏']].groupby('–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'):
        group = group.sort_values('–¥–∞—Ç–∞')
        top = group[group['–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏'].str.contains('|'.join(missions), na=False)]
        if not top.empty:
            top_dates.append(top['–¥–∞—Ç–∞'].iloc[0])

    avg_date = pd.to_datetime(top_dates).mean()
    print(f"üìÖ –°—Ä–µ–¥–Ω—è—è –¥–∞—Ç–∞ –ø–æ–ª—É—á–µ–Ω–∏—è –ø–µ—Ä–≤–æ–π –º–∏—Å—Å–∏–∏ –∏–∑ TheBest: {avg_date.date()}")
    return avg_date


avg_top_date(income, TheBest)


üìÖ –°—Ä–µ–¥–Ω—è—è –¥–∞—Ç–∞ –ø–æ–ª—É—á–µ–Ω–∏—è –ø–µ—Ä–≤–æ–π –º–∏—Å—Å–∏–∏ –∏–∑ TheBest: 2024-04-24


Timestamp('2024-04-24 13:42:41.702127616')

In [23]:
from scipy.stats import wilcoxon

def temporal_sravnenie(df):
    df['–¥–∞—Ç–∞'] = pd.to_datetime(df['–¥–∞—Ç–∞'])
    median_date = pd.to_datetime('2024-04-24')

    results = []
    for employee_id, group in df[['–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞', '–¥–∞—Ç–∞', '–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏']].groupby('–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞'):
        group = group.sort_values('–¥–∞—Ç–∞')

        before = group[group['–¥–∞—Ç–∞'] < median_date]
        after = group[group['–¥–∞—Ç–∞'] >= median_date]

        before_days = (median_date - before['–¥–∞—Ç–∞'].min()).days if not before.empty else 1
        after_days = (after['–¥–∞—Ç–∞'].max() - median_date).days if not after.empty else 1

        good_k_before = good_count(before['–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏']) / before_days if before_days > 0 else 0
        good_k_after = good_count(after['–Ω–∞–∑–≤–∞–Ω–∏–µ_–º–∏—Å—Å–∏–∏']) / after_days if after_days > 0 else 0

        results.append({
            '–∫–æ–¥_—Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞': employee_id,
            'good_k_before': good_k_before,
            'good_k_after': good_k_after
        })

    temp_df = pd.DataFrame(results).dropna()

    stat, p_value = wilcoxon(temp_df['good_k_before'], temp_df['good_k_after'])

    print("üîé –ü—Ä–æ–≤–µ—Ä–∫–∞ –æ–±—â–µ–π –≤—Ä–µ–º–µ–Ω–Ω–æ–π —Ç–µ–Ω–¥–µ–Ω—Ü–∏–∏ (–¥–æ / –ø–æ—Å–ª–µ –º–µ–¥–∏–∞–Ω–Ω–æ–π –¥–∞—Ç—ã):")
    print(f"Wilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = {stat}, p-value = {p_value:.5f}")
    if p_value < 0.05:
        print("–ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É –î–û –∏ –ü–û–°–õ–ï –ø–æ –≤—Ä–µ–º–µ–Ω–∏")
    else:
        print("–†–∞–∑–Ω–∏—Ü—ã –ø–æ –≤—Ä–µ–º–µ–Ω–∏ –Ω–µ –æ–±–Ω–∞—Ä—É–∂–µ–Ω–æ")

    print()
    print(f"–¥–æ : {temp_df['good_k_before'].median()}")
    print(f"–ø–æ—Å–ª–µ : {temp_df['good_k_after'].median()}")

temporal_sravnenie(income)


üîé –ü—Ä–æ–≤–µ—Ä–∫–∞ –æ–±—â–µ–π –≤—Ä–µ–º–µ–Ω–Ω–æ–π —Ç–µ–Ω–¥–µ–Ω—Ü–∏–∏ (–¥–æ / –ø–æ—Å–ª–µ –º–µ–¥–∏–∞–Ω–Ω–æ–π –¥–∞—Ç—ã):
Wilcoxon test: —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ = 1540260.0, p-value = 0.00000
üìà –ï—Å—Ç—å —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏ –∑–Ω–∞—á–∏–º–∞—è —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É –î–û –∏ –ü–û–°–õ–ï –ø–æ –≤—Ä–µ–º–µ–Ω–∏

–¥–æ : 0.010101010101010102
–ø–æ—Å–ª–µ : 0.012461180124223602
