In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from fuzzywuzzy import fuzz
import warnings
warnings.filterwarnings('ignore')



In [2]:
data = pd.read_csv('../data/data.csv', sep=',', encoding='utf8', parse_dates=['bdate_cl'])
data['region'] = np.nan
data

Unnamed: 0,#,orc_id,fio,age,bdate,bplace,nationality,gender,mstatus,address,...,description,mstatus_cl,edu_lvl_cl,mil_rank_cl,bdate_cl,mil_date_cl,pass_date_cl,contract_date_cl,contract_end_date_cl,region
0,1,439,Абрамцов Владимир Сергеевич,30.0,1991-08-19,,,m,,,...,,,,Сержант,1991-08-19,,2015-08-04,,,
1,2,452,Фаттахов Вагиф Нариман-оглы,32.0,1990-04-01,,,m,,,...,,,,Сержант,1990-04-01,,2010-05-13,,,
2,3,456,Тушевский Иван Георгиевич,25.0,1996-02-09,,,m,,,...,,,,Сержант,1996-02-09,,2010-03-11,,,
3,4,458,Мезенин Виктор Вячеславович,27.0,1994-12-15,,,m,,,...,,,,Сержант,1994-12-15,,2009-03-05,,,
4,5,97567,Худяк Денис Сергеевич,33.0,1989-02-06,,,m,,,...,,,,,1989-02-06,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110067,110068,99495,Черненко Константин Игоревич,33.0,1988-08-28,,,m,,,...,,,,,1988-08-28,,,,,
110068,110069,109315,Арчинеков Вячеслав Владимирович,,1982-09-23,,,m,,,...,,,,Матрос,1982-09-23,,2015-06-29,,,
110069,110070,109320,Мирошниченко Александр Анатольевич,,1985-02-28,,,m,,,...,,,,,1985-02-28,,,,,
110070,110071,109686,Сорокин Дмитрий Андреевич,,1991-08-06,,,m,,,...,,,,,1991-08-06,,,,,


In [3]:
data['nationality'] = data['nationality'].astype(str)

In [4]:
def get_missing_data_stats(df):
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isna().mean() * 100).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data

In [5]:
areas = \
['Республика Адыгея', 
'Республика Башкортостан', 
'Республика Бурятия', 
'Республика Алтай', 
'Республика Дагестан', 
'Республика Ингушетия', 
'Кабардино-Балкарская Республика', 
'Республика Калмыкия', 
'Карачаево-Черкесская Республика', 
'Республика Карелия', 
'Республика Коми', 
'Республика Марий Эл', 
'Республика Мордовия', 
'Республика Саха (Якутия)', 
'Республика Северная Осетия', 
'Республика Татарстан', 
'Республика Тыва', 
'Удмуртская Республика', 
'Республика Хакасия', 
'Чеченская Республика', 
'Чувашская Республика - Чувашия', 
'Алтайский край', 
'Краснодарский край', 
'Красноярский край', 
'Приморский край', 
'Ставропольский край', 
'Хабаровский край', 
'Амурская область', 
'Архангельская область', 
'Астраханская область', 
'Белгородская область', 
'Брянская область', 
'Владимирская область', 
'Волгоградская область', 
'Вологодская область', 
'Воронежская область', 
'Ивановская область', 
'Иркутская область', 
'Калининградская область', 
'Калужская область', 
'Камчатский край', 
'Кемеровская область', 
'Кировская область', 
'Костромская область', 
'Курганская область', 
'Курская область', 
'Ленинградская область', 
'Липецкая область', 
'Магаданская область', 
'Московская область', 
'Мурманская область', 
'Нижегородская область', 
'Новгородская область', 
'Новосибирская область', 
'Омская область', 
'Оренбургская область', 
'Орловская область', 
'Пензенская область', 
'Пермский край', 
'Псковская область', 
'Ростовская область', 
'Рязанская область', 
'Самарская область', 
'Саратовская область', 
'Сахалинская область', 
'Свердловская область', 
'Смоленская область', 
'Тамбовская область', 
'Тверская область', 
'Томская область', 
'Тульская область', 
'Тюменская область', 
'Ульяновская область', 
'Челябинская область', 
'Забайкальский край', 
'Ярославская область', 
'Москва', 
'Санкт-Петербург', 
'Еврейская автономная область', 
'Ненецкий автономный округ', 
'Ханты-Мансийский автономный округ', 
'Чукотский автономный округ', 
'Ямало-Ненецкий автономный округ'
]

In [6]:
for i in data.index:
    try:
        if not pd.isna(data['bplace'][i]):
                data.loc[i, 'region'] =  data.loc[i, 'bplace']
        else:
            if not pd.isna(data['reg_address'][i]):
                data.loc[i, 'region'] =  data.loc[i, 'reg_address']
            else:
                if not pd.isna(data['address'][i]):
                    data.loc[i, 'region'] =  data.loc[i, 'address']
                else:
                    if not pd.isna(data['pass_auth'][i]):
                        data.loc[i, 'region'] =  data.loc[i, 'pass_auth'] 
        ratios = {j: fuzz.token_set_ratio(data['region'][i], areas[j]) for j in range(len(areas))}
        data.loc[i, 'region'] = areas[max(ratios, key=ratios.get)]
    except:
        pass

In [7]:
data = data[['age', 'region', 'nationality', 'gender', 'mstatus_cl', 'mil_rank_cl', 'edu_lvl_cl']]
data

Unnamed: 0,age,region,nationality,gender,mstatus_cl,mil_rank_cl,edu_lvl_cl
0,30.0,Хабаровский край,,m,,Сержант,
1,32.0,Амурская область,,m,,Сержант,
2,25.0,Хабаровский край,,m,,Сержант,
3,27.0,Республика Бурятия,,m,,Сержант,
4,33.0,Республика Адыгея,,m,,,
...,...,...,...,...,...,...,...
110067,33.0,Республика Адыгея,,m,,,
110068,,Кабардино-Балкарская Республика,,m,,Матрос,
110069,,Республика Адыгея,,m,,,
110070,,Республика Адыгея,,m,,,


In [8]:
def clear_mil_rank(string):
    match string:
        case "Рядовой":
            return 1
        case "Матрос":
            return 2
        case "Ефрейтор":
            return 3
        case "Старший Матрос":
            return 4
        case "Младший Сержант":
            return 5
        case "Старшина 2 статьи":
            return 6
        case "Сержант":
            return 7
        case "Старшина Сержант":
            return 8
        case "Старший Сержант":
            return 8
        case "Старшина":
            return 9
        case "Прапорщик":
            return 10
        case "Мичман":
            return 11
        case "Старший Прапорщик":
            return 12
        case "Старший Мичман":
            return 13
        case "Лейтенант":
            return 14
        case "Старший Лейтенант":
            return 15
        case "Капитан":
            return 16
        case "Капитан-лейтенант":
            return 17
        case "Майор":
            return 18
        case "Капитан 3 ранга":
            return 19
        case "Подполковник":
            return 20
        case "Старший Офицер":
            return 21
        case "Капитан 2 ранга":
            return 22
        case "Полковник":
            return 23
        case "Капитан 1 ранга":
            return 24
        case "Генерал-майор":
            return 25
        case "Контр-адмирал":
            return 26

    return 0

def clear_edu_lvl(string):
    match string:
        case "Среднее":
            return 1
        case "Среднее полное":
            return 2
        case "Начальное профессиональное":
            return 3
        case "Среднее техническое":
            return 4
        case "Среднее специальное":
            return 5
        case "Среднее профессиональное":
            return 6
        case "Высшее":
            return 7
        case "ВУЦ":
            return 8
        case "ВВУЗ":
            return 9

    return 0

def clear_nationality(string):
    if (string == "русский" or string == "руский" or string == "Русский" or string == "русская"):
        return "русский"
    elif (string == "бурят " or string == "Бурят " or string == "бурят"):
        return "бурят"
    elif (string == "азербайджанец" or string == "азербейджанец"):
        return "азербайджанец"
    else:
        return string.lower()

data['edu_lvl'] = data['edu_lvl_cl'].apply(clear_edu_lvl)
data['mil_rank'] = data['mil_rank_cl'].apply(clear_mil_rank)
data['nationality'] = data['nationality'].apply(clear_nationality)
data = data.drop(columns=['edu_lvl_cl', 'mil_rank_cl'])
data

Unnamed: 0,age,region,nationality,gender,mstatus_cl,edu_lvl,mil_rank
0,30.0,Хабаровский край,,m,,0,7
1,32.0,Амурская область,,m,,0,7
2,25.0,Хабаровский край,,m,,0,7
3,27.0,Республика Бурятия,,m,,0,7
4,33.0,Республика Адыгея,,m,,0,0
...,...,...,...,...,...,...,...
110067,33.0,Республика Адыгея,,m,,0,0
110068,,Кабардино-Балкарская Республика,,m,,0,2
110069,,Республика Адыгея,,m,,0,0
110070,,Республика Адыгея,,m,,0,0


In [9]:
data = data.reset_index(drop=True)
data = pd.get_dummies(data)
columns = data.columns
data

Unnamed: 0,age,edu_lvl,mil_rank,region_Алтайский край,region_Амурская область,region_Архангельская область,region_Астраханская область,region_Белгородская область,region_Брянская область,region_Владимирская область,...,nationality_украинец,nationality_чуваш,gender_f,gender_m,gender_неизвестн.,mstatus_cl_Женат,mstatus_cl_Замужем,mstatus_cl_Не замужем,mstatus_cl_Разведен,mstatus_cl_Холост
0,30.0,0,7,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,32.0,0,7,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,25.0,0,7,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,27.0,0,7,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,33.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110067,33.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
110068,,0,2,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
110069,,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
110070,,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
imputer = KNNImputer(n_neighbors=20)
data = imputer.fit_transform(data)
data

array([[30. ,  0. ,  7. , ...,  0. ,  0. ,  0. ],
       [32. ,  0. ,  7. , ...,  0. ,  0. ,  0. ],
       [25. ,  0. ,  7. , ...,  0. ,  0. ,  0. ],
       ...,
       [34.8,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [34.8,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [32. ,  0. ,  3. , ...,  0. ,  0. ,  0. ]])

In [11]:
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,age,edu_lvl,mil_rank,region_Алтайский край,region_Амурская область,region_Архангельская область,region_Астраханская область,region_Белгородская область,region_Брянская область,region_Владимирская область,...,nationality_украинец,nationality_чуваш,gender_f,gender_m,gender_неизвестн.,mstatus_cl_Женат,mstatus_cl_Замужем,mstatus_cl_Не замужем,mstatus_cl_Разведен,mstatus_cl_Холост
0,30.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,32.0,0.0,7.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,25.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,27.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110067,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
110068,26.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
110069,34.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
110070,34.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df.to_csv('../data/preprocessed.csv')