In [None]:
import pandas as pd
import numpy as np
import random 

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import utils

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, auc, accuracy_score, roc_auc_score,f1_score,log_loss,\
classification_report, roc_curve

import warnings
warnings.filterwarnings("ignore");

# One seed for every random generator in this notebook. The synthetic features
# below are drawn with the `random` module, so without seeding each
# Restart & Run All would write a different train.csv.
RAND = 10
random.seed(RAND)
np.random.seed(RAND)

In [None]:
# Load the raw client/product table. The path points into /content/drive
# (presumably a Colab Google Drive mount - adjust when running elsewhere).
# fillna(0) replaces missing values in EVERY column, so text columns such as
# card_type_name can hold the number 0 next to their regular string values.
df = pd.read_csv('/content/drive/MyDrive/Moscow_hack/df.csv').fillna(0)

In [None]:
# Preview the first three rows to check that the file was parsed as expected.
df.head(3)

Unnamed: 0,client_id,gender,birth_date,create_date,nonresident_flag,businessman_flag,city,term,contract_sum,product_category_name,...,card_type_name,start_date,fact_close_date,purchase_sum,purchase_count,current_balance_avg_sum,current_balance_sum,current_debit_turn_sum,current_credit_turn_sum,card_type
0,fe60b594364f9f636266ed1ef4f89c32,Ж,1985,2020-01-29,R,0,Гусь-Хрустальный,0.0,0.0,Договор на текущий счет для дебетовой карты,...,Visa Platinum Rewards,2019.0,2021.0,16600.0,71,39700.0,25700.0,220600.0,201000.0,dc
1,3012cabca5885ed53d348d6e57dab5de,М,1951,2009-09-24,R,0,Ишимбай,0.0,0.0,Договор на текущий счет для дебетовой карты,...,VISA Classic,2019.0,0.0,0.0,0,1900.0,1000.0,6200.0,3000.0,dc
2,d871ef96820b6c7a1ada8e01a772724e,Ж,1982,2006-12-25,R,0,Юрга,0.0,0.0,Договор на текущий счет для дебетовой карты,...,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,dc


In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

(371583, 21)

In [None]:
def binary_random_array(df, part_zeros=0.2, seed=None):
  """Return a shuffled float array of 0/1 flags, one per row of ``df``.

  Args:
    df: Any sized object; only ``len(df)`` is used.
    part_zeros: Fraction of entries equal to 0, within [0, 1]. The number of
      zeros is ``round(len(df) * part_zeros)``; all remaining entries are 1.
    seed: Optional seed for a private ``random.Random`` instance. With the
      default ``None`` the module-level ``random`` state is used, so a global
      ``random.seed(...)`` still controls the result.

  Returns:
    1-D ``numpy.ndarray`` of float64 with ``len(df)`` elements.

  Raises:
    ValueError: If ``part_zeros`` is outside [0, 1].
  """
  if not 0 <= part_zeros <= 1:
    raise ValueError(f"part_zeros must be within [0, 1], got {part_zeros!r}")
  len_df = len(df)
  n_zeros = round(len_df * part_zeros)
  flags = [0.0] * n_zeros + [1.0] * (len_df - n_zeros)
  # Shuffle a plain list rather than an ndarray: random.shuffle swaps items
  # through Python-level indexing, which is slow on arrays and only safe for
  # 1-D ones. The permutation drawn is the same for the same random state.
  rng = random if seed is None else random.Random(seed)
  rng.shuffle(flags)
  return np.array(flags, dtype=float)

In [None]:
def feature_claint(column, source=None, client_ids=None):
  """Return, for every client, the value of ``column`` in the client's first row.

  The previous implementation built a full boolean mask over the source table
  for each client (clients x rows comparisons). The first row per client is now
  extracted once and looked up by id.

  Args:
    column: Name of the column to read from ``source``.
    source: DataFrame with a ``client_id`` column. Defaults to the notebook's
      global ``df``.
    client_ids: Iterable of client ids defining the output order. Defaults to
      the global ``train.client_id``.

  Returns:
    list with one value per entry of ``client_ids``, in the same order.

  Raises:
    KeyError: If a requested client id has no row in ``source``.
  """
  if source is None:
    source = df
  if client_ids is None:
    client_ids = train.client_id
  # drop_duplicates keeps the first occurrence, i.e. the row `.iloc[0]` used to pick.
  first_rows = source.drop_duplicates(subset='client_id').set_index('client_id')[column]
  return list(first_rows.loc[list(client_ids)].to_numpy())

In [None]:
def rand_generate_feature(start, stop, amount):
  """Draw ``amount`` random integers from [start, stop] (both ends inclusive).

  Values come from ``random.randint``, i.e. from the module-level ``random``
  state, and are returned as a 1-D numpy array.
  """
  draws = []
  for _ in range(amount):
    draws.append(random.randint(start, stop))
  return np.array(draws)

In [None]:
# Start the training table with one row per distinct client id, in order of
# first appearance in df.
train = pd.DataFrame({'client_id': df['client_id'].unique()})

In [None]:
# Synthetic tariff table: one row per product name with randomly drawn terms.
# Terms that do not apply to a product group start as NaN and are turned into
# 0 at the end. `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0.

# Debit cards: every card name seen on a 'dc' row.
card_type_name_debit = df['card_type_name'].loc[df['card_type'] == 'dc'].unique()
len_card_type_name_debit = len(card_type_name_debit)
debit_terms = pd.DataFrame({
    'name': card_type_name_debit,
    'monthly_maintenance': rand_generate_feature(0, 500, len_card_type_name_debit),
    'cashback': rand_generate_feature(0, 1, len_card_type_name_debit),
    'percentage_of_purchases': rand_generate_feature(1, 2, len_card_type_name_debit),
    'percentage_money_account': rand_generate_feature(15, 25, len_card_type_name_debit),
    'procent_on_the_balance': rand_generate_feature(5, 15, len_card_type_name_debit),
    'flag_debit': np.ones(len_card_type_name_debit),
    'loan_interest': np.nan,
    'procent_credit': np.nan,
})

# Credit cards: every card name seen on a 'cc' row.
card_type_name_credit = df['card_type_name'].loc[df['card_type'] == 'cc'].unique()
len_card_type_name_credit = len(card_type_name_credit)
credit_terms = pd.DataFrame({
    'name': card_type_name_credit,
    'monthly_maintenance': rand_generate_feature(300, 1500, len_card_type_name_credit),
    'cashback': rand_generate_feature(0, 1, len_card_type_name_credit),
    'percentage_of_purchases': rand_generate_feature(1, 2, len_card_type_name_credit),
    'percentage_money_account': rand_generate_feature(15, 25, len_card_type_name_credit),
    'procent_on_the_balance': rand_generate_feature(5, 15, len_card_type_name_credit),
    'flag_debit': np.zeros(len_card_type_name_credit),
    'loan_interest': rand_generate_feature(10, 30, len_card_type_name_credit),
    'procent_credit': np.nan,
})

# Loan products that are absent from the dataset but listed in its description.
additional_names = np.array(['потребительский', 'ипотека', 'автокредит', 'овердрафт'])
len_additional_names = len(additional_names)
additional_terms = pd.DataFrame({
    'name': additional_names,
    'monthly_maintenance': np.nan,
    'cashback': np.nan,
    'percentage_of_purchases': np.nan,
    'percentage_money_account': np.nan,
    'procent_on_the_balance': np.nan,
    'flag_debit': np.nan,
    'loan_interest': np.nan,
    'procent_credit': rand_generate_feature(10, 25, len_additional_names),
})

# ignore_index gives the combined table unique row labels; plain concatenation
# would repeat 0..n-1 for each of the three parts.
card_type_name_df = pd.concat(
    [debit_terms, credit_terms, additional_terms], ignore_index=True
).fillna(0)

In [None]:
def count_target(accounts=None, client_ids=None, product_terms=None):
  """Estimate per-client income split into open and closed accounts.

  Fixes compared with the previous row-by-row version:
    * the close date is read from ``fact_close_date`` (it used to be read from
      ``start_date``);
    * an account counts as open when its close date is 0 in any representation
      (0, 0.0 or '0'); the old comparison with the string '0' never matched a
      numeric column, so every account was booked as closed;
    * all accounts are processed with one join and one group-by instead of
      re-scanning the whole table for every field of every row.

  Income of one account:
    * cards (``card_type`` 'dc' or 'cc'):
      monthly_maintenance
      + (debit_turn * (percentage_of_purchases - cashback) * flag_debit
         + (credit_turn * (percentage_of_purchases - cashback)
            + credit_turn * procent_credit) * (1 - flag_debit)) / 100
    * other products: contract_sum * procent_credit / 100, divided by 12 when
      the account is closed.

  NOTE(review): ``loan_interest`` from the tariff table is not part of the
  formula, and ``procent_credit`` is 0 for card products in the table built by
  this notebook - confirm whether credit cards were meant to use
  ``loan_interest``.

  Args:
    accounts: DataFrame of accounts; defaults to the global ``df``.
    client_ids: Iterable of client ids defining the output order; defaults to
      the global ``train.client_id``.
    product_terms: Tariff table keyed by ``name``; defaults to the global
      ``card_type_name_df``. For duplicated names the first row is used.

  Returns:
    list of ``(income_open, income_closed)`` tuples, one per client id, in the
    order of ``client_ids``. Clients without accounts get ``(0.0, 0.0)``.

  Raises:
    KeyError: If an account refers to a product name missing from
      ``product_terms``.
  """
  if accounts is None:
    accounts = df
  if client_ids is None:
    client_ids = train.client_id
  if product_terms is None:
    product_terms = card_type_name_df

  term_columns = ['monthly_maintenance', 'cashback', 'percentage_of_purchases',
                  'flag_debit', 'procent_credit']
  terms = product_terms.drop_duplicates(subset='name').set_index('name')[term_columns]

  account_columns = ['client_id', 'contract_sum', 'fact_close_date',
                     'current_debit_turn_sum', 'current_credit_turn_sum',
                     'card_type', 'card_type_name']
  rows = accounts[account_columns]

  known = rows['card_type_name'].isin(terms.index)
  if not known.all():
    missing = rows.loc[~known, 'card_type_name'].unique().tolist()
    raise KeyError(f"no tariff terms for product names: {missing!r}")
  rows = rows.join(terms, on='card_type_name')

  # Comparing the text form covers int 0, float 0.0 and the string '0'.
  is_open = rows['fact_close_date'].astype(str).isin(['0', '0.0'])
  is_card = rows['card_type'].isin(['dc', 'cc'])

  margin = rows['percentage_of_purchases'] - rows['cashback']
  debit_part = rows['current_debit_turn_sum'] * margin * rows['flag_debit']
  credit_part = (
      rows['current_credit_turn_sum'] * margin
      + rows['current_credit_turn_sum'] * rows['procent_credit']
  ) * (1 - rows['flag_debit'])
  card_income = rows['monthly_maintenance'] + (debit_part + credit_part) / 100

  loan_income = rows['contract_sum'] * rows['procent_credit'] / 100
  loan_income = loan_income.where(is_open, loan_income / 12)

  income = card_income.where(is_card, loan_income)

  ordered_ids = list(client_ids)
  income_open = (
      income.where(is_open, 0.0)
      .groupby(rows['client_id'], sort=False).sum()
      .reindex(ordered_ids, fill_value=0.0)
  )
  income_closed = (
      income.where(~is_open, 0.0)
      .groupby(rows['client_id'], sort=False).sum()
      .reindex(ordered_ids, fill_value=0.0)
  )
  return list(zip(income_open.to_numpy(), income_closed.to_numpy()))


In [None]:
def _client_flag(column, flag_name, one_value, zero_value):
  """Build a one-row-per-client table ``[client_id, flag_name]`` from ``df``.

  Only rows whose ``column`` equals ``one_value`` or ``zero_value`` are used.
  The flag is 1.0 for ``one_value`` and 0.0 for ``zero_value``. When a client
  has rows with both values, the first such row decides, so each client id
  appears exactly once and the inner merges below cannot multiply rows of
  ``train`` or attach contradictory flags to the same client.
  """
  labelled = df.loc[df[column].isin([one_value, zero_value]), ['client_id', column]]
  labelled = labelled.drop_duplicates(subset='client_id')
  return pd.DataFrame({
      'client_id': labelled['client_id'].to_numpy(),
      flag_name: (labelled[column] == one_value).to_numpy().astype(float),
  })


flag_woman = _client_flag('gender', 'flag_woman', 'Ж', 'М')
flag_man = _client_flag('gender', 'flag_man', 'М', 'Ж')
flag_resident = _client_flag('nonresident_flag', 'flag_resident', 'R', 'N')
flag_non_resident = _client_flag('nonresident_flag', 'flag_non_resident', 'N', 'R')

# Inner merges, as before: a client without a recognised gender or residency
# value is dropped from train. The order of the remaining rows is preserved.
for client_flag in (flag_woman, flag_man, flag_resident, flag_non_resident):
  train = train.merge(client_flag, on='client_id')

In [None]:
# Copy each client's city from df (the value in the client's first row).
train['city'] = feature_claint('city')
# birth_date and businessman_flag are NOT copied from df: the two calls below
# are disabled, and both columns are filled with generated values in a later cell.
# train['birth_date'] = feature_claint('birth_date')
# train['businessman_flag'] = feature_claint('businessman_flag')

In [None]:
# Synthetic binary flags; the second argument is the share of zeros.
for flag_column in ('flag_own_car', 'flag_own_realty', 'flag_phone',
                    'flag_work_phone', 'flag_email'):
  train[flag_column] = binary_random_array(train, 0.2)

train['businessman_flag'] = binary_random_array(train, 0.9)

# Synthetic integer features, drawn independently per client; bounds are inclusive.
len_train = len(train)
integer_ranges = {
    'amount_children': (0, 3),
    'annual_income': (270000, 2000000),
    'salary': (15000, 300000),
    'family_income': (16000, 500000),
    'birth_date': (1970, 2004),
}
for feature_column, (low, high) in integer_ranges.items():
  train[feature_column] = rand_generate_feature(low, high, len_train)

# Synthetic categorical features, each label picked with equal probability.
family_statuses = ['Холост / Не замужем', 'В браке']
# NOTE(review): 'Основное специальноее' looks like a typo (doubled final letter);
# kept unchanged because consumers of train.csv may match on the exact label.
education_levels = ['Дошкольное', 'Начальное', 'Основное общее', 'Основное специальноее', 'Высшее']
train['family_status'] = np.array([random.choice(family_statuses) for _ in range(len_train)])
train['level_education'] = np.array([random.choice(education_levels) for _ in range(len_train)])


In [None]:
# The data-driven target from count_target() is disabled; a placeholder of
# random integers between 300 and 3000 (both inclusive) is used instead, so the
# target carries no relation to the features.
# target = count_target()
target = rand_generate_feature(300, 3000, len_train)

In [None]:
# Attach the target values as a new column of the training table.
train['target'] = target

In [None]:
# The client identifier is not a feature; remove it before export.
train = train.drop(columns=['client_id'])

In [None]:
# Write the assembled table to train.csv in the working directory, without the
# DataFrame index.
train.to_csv('train.csv', index=False)