In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from scipy.optimize import minimize
import warnings
warnings.filterwarnings('ignore')

In [3]:
print("VERİ YÜKLEME")

PATHS = {
    'customer_history': '/kaggle/input/ing-hubs-turkiye-datathon/customer_history.csv',
    'customers': '/kaggle/input/ing-hubs-turkiye-datathon/customers.csv',
    'reference_data': '/kaggle/input/ing-hubs-turkiye-datathon/referance_data.csv',
    'reference_data_test': '/kaggle/input/ing-hubs-turkiye-datathon/referance_data_test.csv',
}

customer_history = pd.read_csv(PATHS['customer_history'])
customers = pd.read_csv(PATHS['customers'])
reference_data = pd.read_csv(PATHS['reference_data'])
reference_data_test = pd.read_csv(PATHS['reference_data_test'])

print(f"customer_history shape: {customer_history.shape}")
print(f"customers shape: {customers.shape}")
print(f"reference_data (train) shape: {reference_data.shape}")
print(f"reference_data_test shape: {reference_data_test.shape}")

VERİ YÜKLEME
customer_history shape: (5359609, 7)
customers shape: (176293, 8)
reference_data (train) shape: (133287, 3)
reference_data_test shape: (43006, 2)


EDA

In [7]:
customer_history.head()

Unnamed: 0,cust_id,date,mobile_eft_all_cnt,active_product_category_nbr,mobile_eft_all_amt,cc_transaction_all_amt,cc_transaction_all_cnt
0,0,2016-01-01,1.0,2,151.2,,
1,0,2016-02-01,1.0,2,178.7,,
2,0,2016-03-01,2.0,2,37.38,,
3,0,2016-04-01,4.0,2,100.9,,
4,0,2016-05-01,3.0,3,132.28,,


In [5]:
customers.head()

Unnamed: 0,cust_id,gender,age,province,religion,work_type,work_sector,tenure
0,0,F,64,NOH,U,Part-time,Technology,135
1,1,F,57,ZUI,O,Full-time,Finance,65
2,2,F,62,NOB,M,Self-employed,Healthcare,224
3,3,F,22,ZUI,C,Student,,47
4,5,M,27,ZUI,U,Full-time,Finance,108


In [8]:
reference_data.head()

Unnamed: 0,cust_id,ref_date,churn
0,0,2017-09-01,0
1,3,2018-10-01,0
2,5,2018-03-01,1
3,6,2018-04-01,1
4,7,2018-05-01,0


In [24]:
def check_df(dataframe, head=5):
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### info #####################")
    print(dataframe.info())
    print("##################### Quantiles #####################")
    print(dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

In [25]:
datas = [customer_history, customers, reference_data]

for d in datas:
    check_df(d)

##################### Shape #####################
(5359609, 7)
##################### Types #####################
cust_id                          int64
date                            object
mobile_eft_all_cnt             float64
active_product_category_nbr      int64
mobile_eft_all_amt             float64
cc_transaction_all_amt         float64
cc_transaction_all_cnt         float64
dtype: object
##################### NA #####################
cust_id                             0
date                                0
mobile_eft_all_cnt             112334
active_product_category_nbr         0
mobile_eft_all_amt             112334
cc_transaction_all_amt         166746
cc_transaction_all_cnt         166746
dtype: int64
##################### info #####################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5359609 entries, 0 to 5359608
Data columns (total 7 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   cust_id                 

In [23]:
customer_history["cust_id"].nunique()

176293

FEATURE ENGINEERING

In [11]:
def create_customer_history_features(df):
    
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['quarter'] = df['date'].dt.quarter
    
    features_list = []
    
    for cust_id in df['cust_id'].unique():
        cust_data = df[df['cust_id'] == cust_id].sort_values('date')
        
        feat = {'cust_id': cust_id}
        
        # ===========================
        # MOBİL EFT FEATURELAR
        # ===========================
        # Sayı bazlı
        feat['mobile_eft_cnt_sum'] = cust_data['mobile_eft_all_cnt'].sum()
        feat['mobile_eft_cnt_mean'] = cust_data['mobile_eft_all_cnt'].mean()
        feat['mobile_eft_cnt_std'] = cust_data['mobile_eft_all_cnt'].std()
        feat['mobile_eft_cnt_max'] = cust_data['mobile_eft_all_cnt'].max()
        feat['mobile_eft_cnt_min'] = cust_data['mobile_eft_all_cnt'].min()
        feat['mobile_eft_cnt_median'] = cust_data['mobile_eft_all_cnt'].median()
        feat['mobile_eft_cnt_last'] = cust_data['mobile_eft_all_cnt'].iloc[-1]
        feat['mobile_eft_cnt_first'] = cust_data['mobile_eft_all_cnt'].iloc[0]
        
        # Tutar bazlı
        feat['mobile_eft_amt_sum'] = cust_data['mobile_eft_all_amt'].sum()
        feat['mobile_eft_amt_mean'] = cust_data['mobile_eft_all_amt'].mean()
        feat['mobile_eft_amt_std'] = cust_data['mobile_eft_all_amt'].std()
        feat['mobile_eft_amt_max'] = cust_data['mobile_eft_all_amt'].max()
        feat['mobile_eft_amt_min'] = cust_data['mobile_eft_all_amt'].min()
        feat['mobile_eft_amt_median'] = cust_data['mobile_eft_all_amt'].median()
        feat['mobile_eft_amt_last'] = cust_data['mobile_eft_all_amt'].iloc[-1]
        feat['mobile_eft_amt_first'] = cust_data['mobile_eft_all_amt'].iloc[0]
        
        # Trend ve değişim
        feat['mobile_eft_cnt_trend'] = feat['mobile_eft_cnt_last'] - feat['mobile_eft_cnt_first']
        feat['mobile_eft_amt_trend'] = feat['mobile_eft_amt_last'] - feat['mobile_eft_amt_first']
        feat['mobile_eft_cnt_growth'] = feat['mobile_eft_cnt_trend'] / (feat['mobile_eft_cnt_first'] + 1)
        feat['mobile_eft_amt_growth'] = feat['mobile_eft_amt_trend'] / (feat['mobile_eft_amt_first'] + 1)
        
        # Ortalama işlem tutarı
        feat['mobile_eft_avg_transaction'] = feat['mobile_eft_amt_sum'] / (feat['mobile_eft_cnt_sum'] + 1)
        feat['mobile_eft_volatility'] = feat['mobile_eft_amt_std'] / (feat['mobile_eft_amt_mean'] + 1)
        
        # ===========================
        # KREDİ KARTI FEATURELAR
        # ===========================
        # Sayı bazlı
        feat['cc_transaction_cnt_sum'] = cust_data['cc_transaction_all_cnt'].sum()
        feat['cc_transaction_cnt_mean'] = cust_data['cc_transaction_all_cnt'].mean()
        feat['cc_transaction_cnt_std'] = cust_data['cc_transaction_all_cnt'].std()
        feat['cc_transaction_cnt_max'] = cust_data['cc_transaction_all_cnt'].max()
        feat['cc_transaction_cnt_min'] = cust_data['cc_transaction_all_cnt'].min()
        feat['cc_transaction_cnt_median'] = cust_data['cc_transaction_all_cnt'].median()
        feat['cc_transaction_cnt_last'] = cust_data['cc_transaction_all_cnt'].iloc[-1]
        feat['cc_transaction_cnt_first'] = cust_data['cc_transaction_all_cnt'].iloc[0]
        
        # Tutar bazlı
        feat['cc_transaction_amt_sum'] = cust_data['cc_transaction_all_amt'].sum()
        feat['cc_transaction_amt_mean'] = cust_data['cc_transaction_all_amt'].mean()
        feat['cc_transaction_amt_std'] = cust_data['cc_transaction_all_amt'].std()
        feat['cc_transaction_amt_max'] = cust_data['cc_transaction_all_amt'].max()
        feat['cc_transaction_amt_min'] = cust_data['cc_transaction_all_amt'].min()
        feat['cc_transaction_amt_median'] = cust_data['cc_transaction_all_amt'].median()
        feat['cc_transaction_amt_last'] = cust_data['cc_transaction_all_amt'].iloc[-1]
        feat['cc_transaction_amt_first'] = cust_data['cc_transaction_all_amt'].iloc[0]
        
        # Trend ve değişim
        feat['cc_transaction_cnt_trend'] = feat['cc_transaction_cnt_last'] - feat['cc_transaction_cnt_first']
        feat['cc_transaction_amt_trend'] = feat['cc_transaction_amt_last'] - feat['cc_transaction_amt_first']
        feat['cc_transaction_cnt_growth'] = feat['cc_transaction_cnt_trend'] / (feat['cc_transaction_cnt_first'] + 1)
        feat['cc_transaction_amt_growth'] = feat['cc_transaction_amt_trend'] / (feat['cc_transaction_amt_first'] + 1)
        
        # Ortalama işlem tutarı
        feat['cc_avg_transaction'] = feat['cc_transaction_amt_sum'] / (feat['cc_transaction_cnt_sum'] + 1)
        feat['cc_volatility'] = feat['cc_transaction_amt_std'] / (feat['cc_transaction_amt_mean'] + 1)
        
        # ===========================
        # AKTİF ÜRÜN KATEGORİSİ FEATURELAR
        # ===========================
        feat['active_product_sum'] = cust_data['active_product_category_nbr'].sum()
        feat['active_product_mean'] = cust_data['active_product_category_nbr'].mean()
        feat['active_product_std'] = cust_data['active_product_category_nbr'].std()
        feat['active_product_max'] = cust_data['active_product_category_nbr'].max()
        feat['active_product_min'] = cust_data['active_product_category_nbr'].min()
        feat['active_product_median'] = cust_data['active_product_category_nbr'].median()
        feat['active_product_last'] = cust_data['active_product_category_nbr'].iloc[-1]
        feat['active_product_first'] = cust_data['active_product_category_nbr'].iloc[0]
        feat['active_product_trend'] = feat['active_product_last'] - feat['active_product_first']
        feat['active_product_growth'] = feat['active_product_trend'] / (feat['active_product_first'] + 1)
        
        # ===========================
        # (EFT vs CC)
        # ===========================
        feat['total_transaction_cnt'] = feat['mobile_eft_cnt_sum'] + feat['cc_transaction_cnt_sum']
        feat['total_transaction_amt'] = feat['mobile_eft_amt_sum'] + feat['cc_transaction_amt_sum']
        feat['eft_to_cc_cnt_ratio'] = feat['mobile_eft_cnt_sum'] / (feat['cc_transaction_cnt_sum'] + 1)
        feat['eft_to_cc_amt_ratio'] = feat['mobile_eft_amt_sum'] / (feat['cc_transaction_amt_sum'] + 1)
        feat['cc_to_total_cnt_ratio'] = feat['cc_transaction_cnt_sum'] / (feat['total_transaction_cnt'] + 1)
        feat['cc_to_total_amt_ratio'] = feat['cc_transaction_amt_sum'] / (feat['total_transaction_amt'] + 1)
        feat['eft_to_total_cnt_ratio'] = feat['mobile_eft_cnt_sum'] / (feat['total_transaction_cnt'] + 1)
        feat['eft_to_total_amt_ratio'] = feat['mobile_eft_amt_sum'] / (feat['total_transaction_amt'] + 1)
        
        # ===========================
        # ZAMAN SERİSİ FEATURELAR
        # ===========================
        feat['num_months'] = len(cust_data)
        feat['months_since_first'] = (cust_data['date'].max() - cust_data['date'].min()).days / 30
        feat['avg_months_between_records'] = feat['months_since_first'] / (feat['num_months'] + 1)
        
        # Son 3, 6 ay 
        last_3_months = cust_data.tail(3)
        last_6_months = cust_data.tail(6)
        
        feat['mobile_eft_amt_last_3m'] = last_3_months['mobile_eft_all_amt'].sum()
        feat['mobile_eft_amt_last_6m'] = last_6_months['mobile_eft_all_amt'].sum()
        feat['cc_amt_last_3m'] = last_3_months['cc_transaction_all_amt'].sum()
        feat['cc_amt_last_6m'] = last_6_months['cc_transaction_all_amt'].sum()
        
        feat['mobile_eft_cnt_last_3m'] = last_3_months['mobile_eft_all_cnt'].sum()
        feat['mobile_eft_cnt_last_6m'] = last_6_months['mobile_eft_all_cnt'].sum()
        feat['cc_cnt_last_3m'] = last_3_months['cc_transaction_all_cnt'].sum()
        feat['cc_cnt_last_6m'] = last_6_months['cc_transaction_all_cnt'].sum()
        
        feat['active_product_last_3m'] = last_3_months['active_product_category_nbr'].mean()
        feat['active_product_last_6m'] = last_6_months['active_product_category_nbr'].mean()
        
        # Son dönem vs genel ortalama karşılaştırması
        feat['mobile_eft_amt_last3m_to_avg'] = feat['mobile_eft_amt_last_3m'] / (feat['mobile_eft_amt_mean'] * 3 + 1)
        feat['cc_amt_last3m_to_avg'] = feat['cc_amt_last_3m'] / (feat['cc_transaction_amt_mean'] * 3 + 1)
        
        # ===========================
        # AKTİVİTE VE ENGAGEMENT FEATURELAR
        # ===========================
        # Sıfır aktivite ayları
        feat['months_zero_mobile_eft'] = (cust_data['mobile_eft_all_cnt'] == 0).sum()
        feat['months_zero_cc'] = (cust_data['cc_transaction_all_cnt'] == 0).sum()
        feat['months_zero_both'] = ((cust_data['mobile_eft_all_cnt'] == 0) & 
                                     (cust_data['cc_transaction_all_cnt'] == 0)).sum()
        
        feat['zero_activity_ratio'] = feat['months_zero_both'] / feat['num_months']
        feat['mobile_eft_activity_ratio'] = 1 - (feat['months_zero_mobile_eft'] / feat['num_months'])
        feat['cc_activity_ratio'] = 1 - (feat['months_zero_cc'] / feat['num_months'])
        
        # ===========================
        # İLERİ SEVİYE İSTATİSTİKLER
        # ===========================
        # Skewness ve Kurtosis
        feat['mobile_eft_amt_skew'] = cust_data['mobile_eft_all_amt'].skew()
        feat['mobile_eft_amt_kurt'] = cust_data['mobile_eft_all_amt'].kurtosis()
        feat['cc_amt_skew'] = cust_data['cc_transaction_all_amt'].skew()
        feat['cc_amt_kurt'] = cust_data['cc_transaction_all_amt'].kurtosis()
        
        # Percentiles
        feat['mobile_eft_amt_q25'] = cust_data['mobile_eft_all_amt'].quantile(0.25)
        feat['mobile_eft_amt_q75'] = cust_data['mobile_eft_all_amt'].quantile(0.75)
        feat['mobile_eft_amt_iqr'] = feat['mobile_eft_amt_q75'] - feat['mobile_eft_amt_q25']
        
        feat['cc_amt_q25'] = cust_data['cc_transaction_all_amt'].quantile(0.25)
        feat['cc_amt_q75'] = cust_data['cc_transaction_all_amt'].quantile(0.75)
        feat['cc_amt_iqr'] = feat['cc_amt_q75'] - feat['cc_amt_q25']
        
        # ===========================
        # MOMENTUM VE İVME
        # ===========================
        if len(cust_data) >= 3:
            # Son 3 ayın ortalaması vs önceki ayların ortalaması
            recent = cust_data.tail(3)
            previous = cust_data.head(-3) if len(cust_data) > 3 else cust_data.head(3)
            
            feat['mobile_eft_amt_momentum'] = recent['mobile_eft_all_amt'].mean() - previous['mobile_eft_all_amt'].mean()
            feat['cc_amt_momentum'] = recent['cc_transaction_all_amt'].mean() - previous['cc_transaction_all_amt'].mean()
            feat['active_product_momentum'] = recent['active_product_category_nbr'].mean() - previous['active_product_category_nbr'].mean()
        
        features_list.append(feat)
    
    features_df = pd.DataFrame(features_list)
    
    # NaN ve Inf temizle
    features_df = features_df.replace([np.inf, -np.inf], np.nan)
    features_df = features_df.fillna(0)
    
    print(f"Customer history'den {len(features_df.columns)-1} feature oluşturuldu")
    return features_df

print("FEATURELAR OLUŞTURULDU")

FEATURELAR OLUŞTURULDU


In [12]:
print("CUSTOMER HISTORY FEATURE ÇIKARIMI")


customer_features = create_customer_history_features(customer_history)

print(f"Customer features oluşturuldu: {customer_features.shape}")

customer_features.head()

CUSTOMER HISTORY FEATURE ÇIKARIMI
Customer history'den 96 feature oluşturuldu
Customer features oluşturuldu: (176293, 97)


Unnamed: 0,cust_id,mobile_eft_cnt_sum,mobile_eft_cnt_mean,mobile_eft_cnt_std,mobile_eft_cnt_max,mobile_eft_cnt_min,mobile_eft_cnt_median,mobile_eft_cnt_last,mobile_eft_cnt_first,mobile_eft_amt_sum,...,cc_amt_kurt,mobile_eft_amt_q25,mobile_eft_amt_q75,mobile_eft_amt_iqr,cc_amt_q25,cc_amt_q75,cc_amt_iqr,mobile_eft_amt_momentum,cc_amt_momentum,active_product_momentum
0,0,47.0,2.238095,1.220851,5.0,1.0,2.0,1.0,1.0,2578.44,...,0.0,93.8,137.17,43.37,0.0,0.0,0.0,125.510556,0.0,-0.055556
1,1,61.0,1.605263,1.326232,5.0,0.0,1.0,0.0,5.0,662.14,...,3.727115,4.545,20.875,16.33,73.565,300.9,227.335,-17.655238,-194.446286,0.028571
2,2,87.0,2.351351,1.398734,5.0,1.0,2.0,1.0,4.0,22569.97,...,-1.704197,151.14,987.47,836.33,0.0,15.88,15.88,133.095686,4.762941,0.176471
3,3,57.0,1.676471,1.006662,4.0,1.0,1.0,2.0,4.0,2470.61,...,0.15497,8.415,88.88,80.465,294.3575,817.725,523.3675,310.191505,506.637419,0.0
4,5,69.0,2.555556,1.476309,6.0,1.0,2.0,4.0,4.0,10433.95,...,9.736606,149.215,562.42,413.205,14.015,50.925,36.91,755.715833,-88.6625,0.0


In [31]:
def create_demographic_features(df):
    
    features = df.copy()
    
    # ===========================
    # YAŞ FEATURELARI
    # ===========================
    if 'age' in features.columns:
        features['age_squared'] = features['age'] ** 2
        features['age_log'] = np.log1p(features['age'])
        features['age_sqrt'] = np.sqrt(features['age'])
        
        # Yaş grupları
        features['age_group'] = pd.cut(features['age'], 
                                       bins=[0, 25, 35, 45, 55, 65, 100],
                                       labels=[1, 2, 3, 4, 5, 6])
        features['age_group'] = features['age_group'].astype(int)
        
        features['is_young'] = (features['age'] <= 30).astype(int)
        features['is_middle_aged'] = ((features['age'] > 30) & (features['age'] <= 50)).astype(int)
        features['is_senior'] = (features['age'] > 50).astype(int)
        features['is_very_young'] = (features['age'] <= 25).astype(int)
        features['is_retirement_age'] = (features['age'] >= 60).astype(int)
    
    # ===========================
    # TENURE (MÜŞTERİLİK SÜRESİ) FEATURELARI
    # ===========================
    if 'tenure' in features.columns:
        features['tenure_years'] = features['tenure'] / 12
        features['tenure_squared'] = features['tenure'] ** 2
        features['tenure_log'] = np.log1p(features['tenure'])
        features['tenure_sqrt'] = np.sqrt(features['tenure'])
        
        # Tenure grupları
        features['tenure_group'] = pd.cut(features['tenure'],
                                         bins=[0, 12, 24, 36, 60, 120, 1000],
                                         labels=[1, 2, 3, 4, 5, 6])
        features['tenure_group'] = features['tenure_group'].astype(int)
        
        features['is_new_customer'] = (features['tenure'] <= 12).astype(int)
        features['is_very_new'] = (features['tenure'] <= 6).astype(int)
        features['is_established'] = ((features['tenure'] > 12) & (features['tenure'] <= 36)).astype(int)
        features['is_loyal'] = (features['tenure'] > 36).astype(int)
        features['is_very_loyal'] = (features['tenure'] > 60).astype(int)
        
        # Yaş ve tenure kombinasyonu
        if 'age' in features.columns:
            features['age_to_tenure_ratio'] = features['age'] / (features['tenure'] + 1)
            features['tenure_per_age'] = features['tenure'] / features['age']
            features['customer_lifetime_stage'] = features['age_group'] * features['tenure_group']
    
    # ===========================
    # KATEGORİK DEĞİŞKENLERİ ENCODE ET
    # ===========================
    categorical_cols = ['gender', 'province', 'religion', 'work_type', 'work_sector']
    
    label_encoders = {}
    for col in categorical_cols:
        if col in features.columns:
            le = LabelEncoder()
            features[f'{col}_encoded'] = le.fit_transform(features[col].astype(str))
            label_encoders[col] = le
            
            # Frequency encoding
            freq = features[col].value_counts(normalize=True)
            features[f'{col}_frequency'] = features[col].map(freq)
            
            # Count encoding
            count = features[col].value_counts()
            features[f'{col}_count'] = features[col].map(count)
    
    # ===========================
    # KOMBİNASYON FEATURELARI
    # ===========================
    # Gender ve work type kombinasyonu
    if 'gender' in features.columns and 'work_type' in features.columns:
        features['gender_work_interaction'] = features['gender_encoded'] * features['work_type_encoded']
    
    # Province ve religion kombinasyonu
    if 'province' in features.columns and 'religion' in features.columns:
        features['province_religion_interaction'] = features['province_encoded'] * features['religion_encoded']
    
    # Age ve work sector kombinasyonu
    if 'age' in features.columns and 'work_sector' in features.columns:
        features['age_work_sector_interaction'] = features['age'] * features['work_sector_encoded']
    
    # Orijinal kategorik kolonları çıkar
    features = features.drop(categorical_cols, axis=1, errors='ignore')
    
   
    return features

print("Demografik özellik fonksiyonu tanımlandı!")

Demografik özellik fonksiyonu tanımlandı!


In [32]:
print("CUSTOMERS DEMOGRAFİK ÖZELLİKLER")
demographic_features = create_demographic_features(customers)

print(f"Demographic features hazır: {demographic_features.shape}")
print(demographic_features.head())

CUSTOMERS DEMOGRAFİK ÖZELLİKLER
Demographic features hazır: (176293, 43)
   cust_id  age  tenure  age_squared   age_log  age_sqrt  age_group  is_young  \
0        0   64     135         4096  4.174387  8.000000          5         0   
1        1   57      65         3249  4.060443  7.549834          5         0   
2        2   62     224         3844  4.143135  7.874008          5         0   
3        3   22      47          484  3.135494  4.690416          1         1   
4        5   27     108          729  3.332205  5.196152          2         1   

   is_middle_aged  is_senior  ...  religion_count  work_type_encoded  \
0               0          1  ...           70559                  1   
1               0          1  ...           17116                  0   
2               0          1  ...           17987                  3   
3               0          0  ...           52542                  4   
4               0          0  ...           70559                  0   

   work

In [33]:
def merge_all_features(reference_df, customer_features, demographic_features):

    df = reference_df.copy()
    
    # Customer history featureları
    df = df.merge(customer_features, on='cust_id', how='left')
    
    # Demographic featurelar
    df = df.merge(demographic_features, on='cust_id', how='left')
    
    # Ref_date featureları
    if 'ref_date' in df.columns:
        df['ref_date'] = pd.to_datetime(df['ref_date'])
        df['ref_year'] = df['ref_date'].dt.year
        df['ref_month'] = df['ref_date'].dt.month
        df['ref_quarter'] = df['ref_date'].dt.quarter
        df['ref_day'] = df['ref_date'].dt.day
        df['ref_dayofweek'] = df['ref_date'].dt.dayofweek
        df['ref_is_month_start'] = df['ref_date'].dt.is_month_start.astype(int)
        df['ref_is_month_end'] = df['ref_date'].dt.is_month_end.astype(int)
        df = df.drop('ref_date', axis=1)
    
    print(f"Toplam özellik sayısı: {df.shape[1]}")
    return df

print("Veri birleştirme fonksiyonu tanımlandı!")

Veri birleştirme fonksiyonu tanımlandı!


In [16]:
print("ÖZELLİKLERİ BİRLEŞTİRME")


# Train ve test için birleştir
train_merged = merge_all_features(reference_data, customer_features, demographic_features)
test_merged = merge_all_features(reference_data_test, customer_features, demographic_features)

print(f"Train merged shape: {train_merged.shape}")
print(f"Test merged shape: {test_merged.shape}")

ÖZELLİKLERİ BİRLEŞTİRME
  ✓ Toplam özellik sayısı: 147
  ✓ Toplam özellik sayısı: 146

✓ Train merged shape: (133287, 147)
✓ Test merged shape: (43006, 146)


In [17]:
def create_customer_history_features(df):
    
    # Tarih featurelarını parse et
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['quarter'] = df['date'].dt.quarter
    
    features_list = []
    
    for cust_id in df['cust_id'].unique():
        cust_data = df[df['cust_id'] == cust_id].sort_values('date')
        
        feat = {'cust_id': cust_id}
        
        # ===========================
        # MOBİL EFT FEATURELARI
        # ===========================
        # Sayı bazlı
        feat['mobile_eft_cnt_sum'] = cust_data['mobile_eft_all_cnt'].sum()
        feat['mobile_eft_cnt_mean'] = cust_data['mobile_eft_all_cnt'].mean()
        feat['mobile_eft_cnt_std'] = cust_data['mobile_eft_all_cnt'].std()
        feat['mobile_eft_cnt_max'] = cust_data['mobile_eft_all_cnt'].max()
        feat['mobile_eft_cnt_min'] = cust_data['mobile_eft_all_cnt'].min()
        feat['mobile_eft_cnt_median'] = cust_data['mobile_eft_all_cnt'].median()
        feat['mobile_eft_cnt_last'] = cust_data['mobile_eft_all_cnt'].iloc[-1]
        feat['mobile_eft_cnt_first'] = cust_data['mobile_eft_all_cnt'].iloc[0]
        
        # Tutar bazlı
        feat['mobile_eft_amt_sum'] = cust_data['mobile_eft_all_amt'].sum()
        feat['mobile_eft_amt_mean'] = cust_data['mobile_eft_all_amt'].mean()
        feat['mobile_eft_amt_std'] = cust_data['mobile_eft_all_amt'].std()
        feat['mobile_eft_amt_max'] = cust_data['mobile_eft_all_amt'].max()
        feat['mobile_eft_amt_min'] = cust_data['mobile_eft_all_amt'].min()
        feat['mobile_eft_amt_median'] = cust_data['mobile_eft_all_amt'].median()
        feat['mobile_eft_amt_last'] = cust_data['mobile_eft_all_amt'].iloc[-1]
        feat['mobile_eft_amt_first'] = cust_data['mobile_eft_all_amt'].iloc[0]
        
        # Trend ve değişim
        feat['mobile_eft_cnt_trend'] = feat['mobile_eft_cnt_last'] - feat['mobile_eft_cnt_first']
        feat['mobile_eft_amt_trend'] = feat['mobile_eft_amt_last'] - feat['mobile_eft_amt_first']
        feat['mobile_eft_cnt_growth'] = feat['mobile_eft_cnt_trend'] / (feat['mobile_eft_cnt_first'] + 1)
        feat['mobile_eft_amt_growth'] = feat['mobile_eft_amt_trend'] / (feat['mobile_eft_amt_first'] + 1)
        
        # Ortalama işlem tutarı
        feat['mobile_eft_avg_transaction'] = feat['mobile_eft_amt_sum'] / (feat['mobile_eft_cnt_sum'] + 1)
        feat['mobile_eft_volatility'] = feat['mobile_eft_amt_std'] / (feat['mobile_eft_amt_mean'] + 1)
        
        # ===========================
        # KREDİ KARTI ÖZELLİKLERİ
        # ===========================
        # Sayı bazlı
        feat['cc_transaction_cnt_sum'] = cust_data['cc_transaction_all_cnt'].sum()
        feat['cc_transaction_cnt_mean'] = cust_data['cc_transaction_all_cnt'].mean()
        feat['cc_transaction_cnt_std'] = cust_data['cc_transaction_all_cnt'].std()
        feat['cc_transaction_cnt_max'] = cust_data['cc_transaction_all_cnt'].max()
        feat['cc_transaction_cnt_min'] = cust_data['cc_transaction_all_cnt'].min()
        feat['cc_transaction_cnt_median'] = cust_data['cc_transaction_all_cnt'].median()
        feat['cc_transaction_cnt_last'] = cust_data['cc_transaction_all_cnt'].iloc[-1]
        feat['cc_transaction_cnt_first'] = cust_data['cc_transaction_all_cnt'].iloc[0]
        
        # Tutar bazlı
        feat['cc_transaction_amt_sum'] = cust_data['cc_transaction_all_amt'].sum()
        feat['cc_transaction_amt_mean'] = cust_data['cc_transaction_all_amt'].mean()
        feat['cc_transaction_amt_std'] = cust_data['cc_transaction_all_amt'].std()
        feat['cc_transaction_amt_max'] = cust_data['cc_transaction_all_amt'].max()
        feat['cc_transaction_amt_min'] = cust_data['cc_transaction_all_amt'].min()
        feat['cc_transaction_amt_median'] = cust_data['cc_transaction_all_amt'].median()
        feat['cc_transaction_amt_last'] = cust_data['cc_transaction_all_amt'].iloc[-1]
        feat['cc_transaction_amt_first'] = cust_data['cc_transaction_all_amt'].iloc[0]
        
        # Trend ve değişim
        feat['cc_transaction_cnt_trend'] = feat['cc_transaction_cnt_last'] - feat['cc_transaction_cnt_first']
        feat['cc_transaction_amt_trend'] = feat['cc_transaction_amt_last'] - feat['cc_transaction_amt_first']
        feat['cc_transaction_cnt_growth'] = feat['cc_transaction_cnt_trend'] / (feat['cc_transaction_cnt_first'] + 1)
        feat['cc_transaction_amt_growth'] = feat['cc_transaction_amt_trend'] / (feat['cc_transaction_amt_first'] + 1)
        
        # Ortalama işlem tutarı
        feat['cc_avg_transaction'] = feat['cc_transaction_amt_sum'] / (feat['cc_transaction_cnt_sum'] + 1)
        feat['cc_volatility'] = feat['cc_transaction_amt_std'] / (feat['cc_transaction_amt_mean'] + 1)
        
        # ===========================
        # AKTİF ÜRÜN KATEGORİSİ FEATURELARI
        # ===========================
        feat['active_product_sum'] = cust_data['active_product_category_nbr'].sum()
        feat['active_product_mean'] = cust_data['active_product_category_nbr'].mean()
        feat['active_product_std'] = cust_data['active_product_category_nbr'].std()
        feat['active_product_max'] = cust_data['active_product_category_nbr'].max()
        feat['active_product_min'] = cust_data['active_product_category_nbr'].min()
        feat['active_product_median'] = cust_data['active_product_category_nbr'].median()
        feat['active_product_last'] = cust_data['active_product_category_nbr'].iloc[-1]
        feat['active_product_first'] = cust_data['active_product_category_nbr'].iloc[0]
        feat['active_product_trend'] = feat['active_product_last'] - feat['active_product_first']
        feat['active_product_growth'] = feat['active_product_trend'] / (feat['active_product_first'] + 1)
        
        # ===========================
        #  (EFT vs CC)
        # ===========================
        feat['total_transaction_cnt'] = feat['mobile_eft_cnt_sum'] + feat['cc_transaction_cnt_sum']
        feat['total_transaction_amt'] = feat['mobile_eft_amt_sum'] + feat['cc_transaction_amt_sum']
        feat['eft_to_cc_cnt_ratio'] = feat['mobile_eft_cnt_sum'] / (feat['cc_transaction_cnt_sum'] + 1)
        feat['eft_to_cc_amt_ratio'] = feat['mobile_eft_amt_sum'] / (feat['cc_transaction_amt_sum'] + 1)
        feat['cc_to_total_cnt_ratio'] = feat['cc_transaction_cnt_sum'] / (feat['total_transaction_cnt'] + 1)
        feat['cc_to_total_amt_ratio'] = feat['cc_transaction_amt_sum'] / (feat['total_transaction_amt'] + 1)
        feat['eft_to_total_cnt_ratio'] = feat['mobile_eft_cnt_sum'] / (feat['total_transaction_cnt'] + 1)
        feat['eft_to_total_amt_ratio'] = feat['mobile_eft_amt_sum'] / (feat['total_transaction_amt'] + 1)
        
        # ===========================
        # ZAMAN SERİSİ FEATURELARI
        # ===========================
        feat['num_months'] = len(cust_data)
        feat['months_since_first'] = (cust_data['date'].max() - cust_data['date'].min()).days / 30
        feat['avg_months_between_records'] = feat['months_since_first'] / (feat['num_months'] + 1)
        
        # Son 3, 6 ay 
        last_3_months = cust_data.tail(3)
        last_6_months = cust_data.tail(6)
        
        feat['mobile_eft_amt_last_3m'] = last_3_months['mobile_eft_all_amt'].sum()
        feat['mobile_eft_amt_last_6m'] = last_6_months['mobile_eft_all_amt'].sum()
        feat['cc_amt_last_3m'] = last_3_months['cc_transaction_all_amt'].sum()
        feat['cc_amt_last_6m'] = last_6_months['cc_transaction_all_amt'].sum()
        
        feat['mobile_eft_cnt_last_3m'] = last_3_months['mobile_eft_all_cnt'].sum()
        feat['mobile_eft_cnt_last_6m'] = last_6_months['mobile_eft_all_cnt'].sum()
        feat['cc_cnt_last_3m'] = last_3_months['cc_transaction_all_cnt'].sum()
        feat['cc_cnt_last_6m'] = last_6_months['cc_transaction_all_cnt'].sum()
        
        feat['active_product_last_3m'] = last_3_months['active_product_category_nbr'].mean()
        feat['active_product_last_6m'] = last_6_months['active_product_category_nbr'].mean()
        
        # Son dönem vs genel ortalama karşılaştırması
        feat['mobile_eft_amt_last3m_to_avg'] = feat['mobile_eft_amt_last_3m'] / (feat['mobile_eft_amt_mean'] * 3 + 1)
        feat['cc_amt_last3m_to_avg'] = feat['cc_amt_last_3m'] / (feat['cc_transaction_amt_mean'] * 3 + 1)
        
        # ===========================
        # AKTİVİTE VE ENGAGEMENT FEATURELARI
        # ===========================
        # Sıfır aktivite ayları
        feat['months_zero_mobile_eft'] = (cust_data['mobile_eft_all_cnt'] == 0).sum()
        feat['months_zero_cc'] = (cust_data['cc_transaction_all_cnt'] == 0).sum()
        feat['months_zero_both'] = ((cust_data['mobile_eft_all_cnt'] == 0) & 
                                     (cust_data['cc_transaction_all_cnt'] == 0)).sum()
        
        feat['zero_activity_ratio'] = feat['months_zero_both'] / feat['num_months']
        feat['mobile_eft_activity_ratio'] = 1 - (feat['months_zero_mobile_eft'] / feat['num_months'])
        feat['cc_activity_ratio'] = 1 - (feat['months_zero_cc'] / feat['num_months'])
        
        # ===========================
        # İLERİ SEVİYE İSTATİSTİKLER
        # ===========================
        # Skewness ve Kurtosis
        feat['mobile_eft_amt_skew'] = cust_data['mobile_eft_all_amt'].skew()
        feat['mobile_eft_amt_kurt'] = cust_data['mobile_eft_all_amt'].kurtosis()
        feat['cc_amt_skew'] = cust_data['cc_transaction_all_amt'].skew()
        feat['cc_amt_kurt'] = cust_data['cc_transaction_all_amt'].kurtosis()
        
        # Percentiles
        feat['mobile_eft_amt_q25'] = cust_data['mobile_eft_all_amt'].quantile(0.25)
        feat['mobile_eft_amt_q75'] = cust_data['mobile_eft_all_amt'].quantile(0.75)
        feat['mobile_eft_amt_iqr'] = feat['mobile_eft_amt_q75'] - feat['mobile_eft_amt_q25']
        
        feat['cc_amt_q25'] = cust_data['cc_transaction_all_amt'].quantile(0.25)
        feat['cc_amt_q75'] = cust_data['cc_transaction_all_amt'].quantile(0.75)
        feat['cc_amt_iqr'] = feat['cc_amt_q75'] - feat['cc_amt_q25']
        
        # ===========================
        # MOMENTUM VE İVME
        # ===========================
        if len(cust_data) >= 3:
            # Son 3 ayın ortalaması vs önceki ayların ortalaması
            recent = cust_data.tail(3)
            previous = cust_data.head(-3) if len(cust_data) > 3 else cust_data.head(3)
            
            feat['mobile_eft_amt_momentum'] = recent['mobile_eft_all_amt'].mean() - previous['mobile_eft_all_amt'].mean()
            feat['cc_amt_momentum'] = recent['cc_transaction_all_amt'].mean() - previous['cc_transaction_all_amt'].mean()
            feat['active_product_momentum'] = recent['active_product_category_nbr'].mean() - previous['active_product_category_nbr'].mean()
        
        features_list.append(feat)
    
    features_df = pd.DataFrame(features_list)
    
    # NaN ve Inf temizle
    features_df = features_df.replace([np.inf, -np.inf], np.nan)
    features_df = features_df.fillna(0)
    
    print(f"  ✓ Customer history'den {len(features_df.columns)-1} özellik oluşturuldu")
    return features_df

customer_features = create_customer_history_features(customer_history)

  ✓ Customer history'den 96 özellik oluşturuldu


In [18]:
def ultra_advanced_feature_engineering(df):
    
    df = df.copy()
    
    # ID ve target kolonlarını çıkar
    exclude_cols = ['cust_id', 'churn']
    numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns 
                    if col not in exclude_cols]
    
    print(f"  Mevcut numeric kolon sayısı: {len(numeric_cols)}")
    
    # =============================
    # SATIR BAZLI İSTATİSTİKLER
    # =============================
    
    df['row_mean'] = df[numeric_cols].mean(axis=1)
    df['row_std'] = df[numeric_cols].std(axis=1)
    df['row_max'] = df[numeric_cols].max(axis=1)
    df['row_min'] = df[numeric_cols].min(axis=1)
    df['row_median'] = df[numeric_cols].median(axis=1)
    df['row_range'] = df['row_max'] - df['row_min']
    df['row_cv'] = df['row_std'] / (df['row_mean'].abs() + 1e-5)
    df['row_skew'] = df[numeric_cols].skew(axis=1)
    df['row_kurtosis'] = df[numeric_cols].kurtosis(axis=1)
    
    # Percentiles
    df['row_q10'] = df[numeric_cols].quantile(0.10, axis=1)
    df['row_q25'] = df[numeric_cols].quantile(0.25, axis=1)
    df['row_q75'] = df[numeric_cols].quantile(0.75, axis=1)
    df['row_q90'] = df[numeric_cols].quantile(0.90, axis=1)
    df['row_iqr'] = df['row_q75'] - df['row_q25']
    
    # =============================
    # DEĞER SAYILARI
    # =============================
    
    df['zero_count'] = (df[numeric_cols] == 0).sum(axis=1)
    df['negative_count'] = (df[numeric_cols] < 0).sum(axis=1)
    df['positive_count'] = (df[numeric_cols] > 0).sum(axis=1)
    df['zero_ratio'] = df['zero_count'] / len(numeric_cols)
    df['negative_ratio'] = df['negative_count'] / len(numeric_cols)
    df['positive_ratio'] = df['positive_count'] / len(numeric_cols)
    
    # =============================
    # GRUP BAZLI ÖZELLIKLER
    # =============================
    # EFT özellikleri grubu
    eft_cols = [col for col in numeric_cols if 'mobile_eft' in col or 'eft' in col]
    if len(eft_cols) > 0:
        df['eft_group_mean'] = df[eft_cols].mean(axis=1)
        df['eft_group_std'] = df[eft_cols].std(axis=1)
        df['eft_group_max'] = df[eft_cols].max(axis=1)
        df['eft_group_sum'] = df[eft_cols].sum(axis=1)
    
    # CC özellikleri grubu
    cc_cols = [col for col in numeric_cols if 'cc' in col]
    if len(cc_cols) > 0:
        df['cc_group_mean'] = df[cc_cols].mean(axis=1)
        df['cc_group_std'] = df[cc_cols].std(axis=1)
        df['cc_group_max'] = df[cc_cols].max(axis=1)
        df['cc_group_sum'] = df[cc_cols].sum(axis=1)
    
    # Product özellikleri grubu
    product_cols = [col for col in numeric_cols if 'product' in col or 'active' in col]
    if len(product_cols) > 0:
        df['product_group_mean'] = df[product_cols].mean(axis=1)
        df['product_group_std'] = df[product_cols].std(axis=1)
        df['product_group_max'] = df[product_cols].max(axis=1)
    
    # =============================
    # ORAN VE YÜZDE ÖZELLİKLERİ
    # =============================
   
    df['mean_to_median_ratio'] = df['row_mean'] / (df['row_median'].abs() + 1e-5)
    df['std_to_mean_ratio'] = df['row_std'] / (df['row_mean'].abs() + 1e-5)
    df['range_to_mean_ratio'] = df['row_range'] / (df['row_mean'].abs() + 1e-5)
    df['max_to_min_ratio'] = df['row_max'] / (df['row_min'].abs() + 1e-5)
    
    # Grup oranları
    if 'eft_group_sum' in df.columns and 'cc_group_sum' in df.columns:
        df['eft_to_cc_ratio'] = df['eft_group_sum'] / (df['cc_group_sum'] + 1e-5)
        df['cc_to_eft_ratio'] = df['cc_group_sum'] / (df['eft_group_sum'] + 1e-5)
        df['eft_dominance'] = df['eft_group_sum'] / (df['eft_group_sum'] + df['cc_group_sum'] + 1e-5)
        df['cc_dominance'] = df['cc_group_sum'] / (df['eft_group_sum'] + df['cc_group_sum'] + 1e-5)
    
    # =============================
    # POLİNOM ÖZELLİKLER
    # =============================
    
    important_cols = numeric_cols[:20]
    for col in important_cols:
        df[f'{col}_squared'] = df[col] ** 2
        df[f'{col}_cubed'] = df[col] ** 3
        df[f'{col}_sqrt'] = np.sqrt(np.abs(df[col]))
        df[f'{col}_log1p'] = np.log1p(np.abs(df[col]))
        df[f'{col}_reciprocal'] = 1 / (df[col].abs() + 1e-5)
    
    # =============================
    # ETKİLEŞİM ÖZELLİKLERİ
    # =============================
   
    
    # En önemli kolonlar arası etkileşim
    top_cols = numeric_cols[:15]
    for i in range(len(top_cols)):
        for j in range(i+1, min(i+3, len(top_cols))):
            col1, col2 = top_cols[i], top_cols[j]
            df[f'{col1}_x_{col2}'] = df[col1] * df[col2]
            df[f'{col1}_div_{col2}'] = df[col1] / (df[col2].abs() + 1e-5)
            df[f'{col1}_plus_{col2}'] = df[col1] + df[col2]
            df[f'{col1}_minus_{col2}'] = df[col1] - df[col2]
    
    # =============================
    # TREND VE MOMENTUM ÖZELLİKLERİ
    # =============================
    
    
    # Trend kolonları
    trend_cols = [col for col in numeric_cols if 'trend' in col or 'growth' in col or 'momentum' in col]
    if len(trend_cols) > 0:
        df['positive_trends'] = (df[trend_cols] > 0).sum(axis=1)
        df['negative_trends'] = (df[trend_cols] < 0).sum(axis=1)
        df['positive_trend_ratio'] = df['positive_trends'] / len(trend_cols)
        df['trend_volatility'] = df[trend_cols].std(axis=1)
    
    # Last vs first karşılaştırmaları
    last_cols = [col for col in numeric_cols if 'last' in col]
    first_cols = [col for col in numeric_cols if 'first' in col]
    if len(last_cols) > 0 and len(first_cols) > 0:
        df['last_values_mean'] = df[last_cols].mean(axis=1)
        df['first_values_mean'] = df[first_cols].mean(axis=1)
        df['last_to_first_ratio'] = df['last_values_mean'] / (df['first_values_mean'].abs() + 1e-5)
    
    # =============================
    # AKTİVİTE VE ENGAGEMENT SKORLARı
    # =============================
    
    
    # Sıfır aktivite oranları
    zero_cols = [col for col in df.columns if 'months_zero' in col or 'zero_activity' in col]
    if len(zero_cols) > 0:
        df['total_zero_activity_score'] = df[zero_cols].sum(axis=1)
        df['avg_zero_activity'] = df[zero_cols].mean(axis=1)
    
    # Activity ratio kolonları
    activity_cols = [col for col in df.columns if 'activity_ratio' in col]
    if len(activity_cols) > 0:
        df['combined_activity_score'] = df[activity_cols].mean(axis=1)
        df['min_activity_score'] = df[activity_cols].min(axis=1)
        df['max_activity_score'] = df[activity_cols].max(axis=1)
    
    # =============================
    # RİSK VE İSTİKRAR SKORLARı
    # =============================
    
    
    # Volatilite skorları
    volatility_cols = [col for col in df.columns if 'volatility' in col or 'cv' in col or '_std' in col]
    if len(volatility_cols) > 0:
        df['total_volatility_score'] = df[volatility_cols].mean(axis=1)
        df['max_volatility'] = df[volatility_cols].max(axis=1)
        df['min_volatility'] = df[volatility_cols].min(axis=1)
    
    # Composite risk score
    if 'total_volatility_score' in df.columns and 'total_zero_activity_score' in df.columns:
        df['composite_risk_score'] = (df['total_volatility_score'] * 0.5 + 
                                      df['total_zero_activity_score'] * 0.3 +
                                      df['negative_ratio'] * 0.2)
    
    # =============================
    # ZAMAN BAZLI ÖZELLİKLER
    # =============================
   
    
    # Son dönem özellikleri
    last_3m_cols = [col for col in df.columns if 'last_3m' in col]
    last_6m_cols = [col for col in df.columns if 'last_6m' in col]
    
    if len(last_3m_cols) > 0:
        df['last_3m_total_activity'] = df[last_3m_cols].sum(axis=1)
        df['last_3m_avg_activity'] = df[last_3m_cols].mean(axis=1)
    
    if len(last_6m_cols) > 0:
        df['last_6m_total_activity'] = df[last_6m_cols].sum(axis=1)
        df['last_6m_avg_activity'] = df[last_6m_cols].mean(axis=1)
    
    if 'last_3m_total_activity' in df.columns and 'last_6m_total_activity' in df.columns:
        df['recent_activity_acceleration'] = df['last_3m_total_activity'] / (df['last_6m_total_activity'] + 1e-5)
    
    # =============================
    # DEMOGRAFİK KOMBİNASYONLAR
    # =============================
    
    
    # Age ile diğer özelliklerin kombinasyonları
    if 'age' in df.columns:
        if 'tenure' in df.columns:
            df['age_tenure_product'] = df['age'] * df['tenure']
            df['age_tenure_sum'] = df['age'] + df['tenure']
            df['age_tenure_diff'] = df['age'] - df['tenure']
        
        if 'total_transaction_amt' in df.columns:
            df['age_per_transaction'] = df['age'] / (df['total_transaction_amt'] + 1)
            df['transaction_per_age'] = df['total_transaction_amt'] / df['age']
        
        if 'active_product_mean' in df.columns:
            df['age_product_interaction'] = df['age'] * df['active_product_mean']
    
    # Tenure ile aktivite kombinasyonları
    if 'tenure' in df.columns and 'total_transaction_cnt' in df.columns:
        df['transactions_per_tenure_month'] = df['total_transaction_cnt'] / (df['tenure'] + 1)
        df['tenure_efficiency'] = df['tenure'] / (df['total_transaction_cnt'] + 1)
    
    # =============================
    # AGGREGASYoN ÖZELLİKLERİ
    # =============================
   
    
    # Sum, mean, count özellikleri
    sum_cols = [col for col in df.columns if '_sum' in col]
    mean_cols = [col for col in df.columns if '_mean' in col]
    cnt_cols = [col for col in df.columns if '_cnt' in col]
    
    if len(sum_cols) > 0:
        df['all_sums_total'] = df[sum_cols].sum(axis=1)
        df['all_sums_mean'] = df[sum_cols].mean(axis=1)
        df['all_sums_std'] = df[sum_cols].std(axis=1)
    
    if len(mean_cols) > 0:
        df['all_means_avg'] = df[mean_cols].mean(axis=1)
        df['all_means_std'] = df[mean_cols].std(axis=1)
    
    if len(cnt_cols) > 0:
        df['all_counts_total'] = df[cnt_cols].sum(axis=1)
        df['all_counts_mean'] = df[cnt_cols].mean(axis=1)
    
    # =============================
    # FREQUENCY VE COUNT ENCODING ÖZELLİKLERİ
    # =============================
    
    
    freq_cols = [col for col in df.columns if 'frequency' in col]
    count_cols = [col for col in df.columns if 'count' in col and 'transaction' not in col]
    
    if len(freq_cols) > 0:
        df['avg_frequency_score'] = df[freq_cols].mean(axis=1)
        df['min_frequency'] = df[freq_cols].min(axis=1)
    
    # =============================
    # AYKIRI DEĞER ÖZELLİKLERİ
    # =============================
   
    
    # Her önemli kolon için aykırı değer tespiti
    for col in important_cols[:30]:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        df[f'{col}_is_outlier'] = ((df[col] < lower) | (df[col] > upper)).astype(int)
        df[f'{col}_outlier_distance'] = np.maximum(0, np.maximum(lower - df[col], df[col] - upper))
    
    outlier_indicator_cols = [col for col in df.columns if '_is_outlier' in col]
    if len(outlier_indicator_cols) > 0:
        df['total_outliers'] = df[outlier_indicator_cols].sum(axis=1)
        df['outlier_ratio'] = df['total_outliers'] / len(outlier_indicator_cols)
    
    # =============================
    # İSTATİSTİKSEL TESTLER
    # =============================
   
    
    # Harmonik ve geometrik ortalama
    positive_values = df[numeric_cols].clip(lower=0.001)
    df['row_harmonic_mean'] = len(numeric_cols) / (1 / positive_values).sum(axis=1)
    df['row_geometric_mean'] = np.exp(np.log(positive_values).mean(axis=1))
    
    # Variation coefficient
    df['row_variation_coef'] = df['row_std'] / (df['row_mean'].abs() + 1e-5)
    
    # =============================
    # NORMALIZE EDİLMİŞ ÖZELLİKLER
    # =============================
    
    
    # Min-max normalization for key features
    key_features = ['total_transaction_amt', 'total_transaction_cnt', 'age', 'tenure']
    for feat in key_features:
        if feat in df.columns:
            feat_min = df[feat].min()
            feat_max = df[feat].max()
            df[f'{feat}_normalized'] = (df[feat] - feat_min) / (feat_max - feat_min + 1e-5)
    
    # Z-score for key features
    for feat in key_features:
        if feat in df.columns:
            feat_mean = df[feat].mean()
            feat_std = df[feat].std()
            df[f'{feat}_zscore'] = (df[feat] - feat_mean) / (feat_std + 1e-5)
    
    # =============================
    # BÖLGESEL VE KATEGORİK ETKİLEŞİMLER
    # =============================
 
    
    encoded_cols = [col for col in df.columns if '_encoded' in col]
    for enc_col in encoded_cols[:5]:  # İlk 5 encoded kolon
        if 'total_transaction_amt' in df.columns:
            df[f'{enc_col}_x_transaction_amt'] = df[enc_col] * df['total_transaction_amt']
        if 'active_product_mean' in df.columns:
            df[f'{enc_col}_x_product'] = df[enc_col] * df['active_product_mean']
    
    # =============================
    # ÖZEL CHURN RİSK İNDİKATÖRLERİ
    # =============================
   
    
    # Azalan aktivite riski
    if 'mobile_eft_cnt_trend' in df.columns and 'cc_transaction_cnt_trend' in df.columns:
        df['declining_activity_risk'] = (
            (df['mobile_eft_cnt_trend'] < 0).astype(int) +
            (df['cc_transaction_cnt_trend'] < 0).astype(int)
        )
    
    # Düşük son dönem aktivitesi
    if 'last_3m_total_activity' in df.columns and 'total_transaction_cnt' in df.columns:
        df['low_recent_activity'] = (df['last_3m_total_activity'] < df['total_transaction_cnt'] * 0.2).astype(int)
    
    # Azalan ürün kullanımı
    if 'active_product_trend' in df.columns:
        df['decreasing_product_usage'] = (df['active_product_trend'] < 0).astype(int)
    
    # Yüksek volatilite + düşük aktivite
    if 'total_volatility_score' in df.columns and 'combined_activity_score' in df.columns:
        df['high_risk_combo'] = (
            (df['total_volatility_score'] > df['total_volatility_score'].quantile(0.75)) &
            (df['combined_activity_score'] < df['combined_activity_score'].quantile(0.25))
        ).astype(int)
    
    # Composite churn risk score
    risk_indicators = [
        'declining_activity_risk', 'low_recent_activity', 
        'decreasing_product_usage', 'high_risk_combo'
    ]
    available_risk_indicators = [col for col in risk_indicators if col in df.columns]
    if len(available_risk_indicators) > 0:
        df['composite_churn_risk'] = df[available_risk_indicators].sum(axis=1)
    
    # =============================
    # TEMİZLEME
    # =============================
    
    df = df.replace([np.inf, -np.inf], np.nan)
    
    for col in df.columns:
        if col not in exclude_cols and df[col].dtype in [np.float64, np.float32, np.int64, np.int32]:
            if df[col].isnull().any():
                df[col] = df[col].fillna(df[col].median())
    
    print(f"Toplam özellik sayısı: {df.shape[1]}")
    
    return df

In [38]:
X_train_enhanced = ultra_advanced_feature_engineering(train_merged)

print(f"Train seti feature engineering tamamlandı: {X_train_enhanced.shape}")

  Mevcut numeric kolon sayısı: 145
  ✓ Toplam özellik sayısı: 500
Train seti feature engineering tamamlandı: (133287, 500)


In [40]:
X_test_enhanced = ultra_advanced_feature_engineering(test_merged)

print(f"Test seti feature engineering tamamlandı: {X_test_enhanced.shape}")

  Mevcut numeric kolon sayısı: 145
  ✓ Toplam özellik sayısı: 499
Test seti feature engineering tamamlandı: (43006, 499)


In [37]:
print("Final Verisi")

# Target ve ID ayırma
y_train = X_train_enhanced['churn']
train_ids = X_train_enhanced['cust_id']
X_train_final = X_train_enhanced.drop(['churn', 'cust_id'], axis=1, errors='ignore')

test_ids = X_test_enhanced['cust_id']
X_test_final = X_test_enhanced.drop(['cust_id'], axis=1, errors='ignore')

# Ortak kolonları al
common_cols = X_train_final.columns.intersection(X_test_final.columns)
X_train_final = X_train_final[common_cols]
X_test_final = X_test_final[common_cols]

print(f"X_train shape: {X_train_final.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test_final.shape}")
print(f"\nTarget distribution:")
print(y_train.value_counts(normalize=True))

Final Verisi
X_train shape: (133287, 498)
y_train shape: (133287,)
X_test shape: (43006, 498)

Target distribution:
churn
0    0.858426
1    0.141574
Name: proportion, dtype: float64


In [36]:
model_params = {
    'lgbm': {
        'n_estimators': 3000,
        'learning_rate': 0.003,
        'max_depth': 9,
        'num_leaves': 127,
        'min_child_samples': 20,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'reg_alpha': 1.5,
        'reg_lambda': 1.5,
        'min_split_gain': 0.01,
        'class_weight': 'balanced',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'feature_fraction': 0.7,
        'bagging_freq': 5
    },
    'xgb': {
        'n_estimators': 3000,
        'learning_rate': 0.003,
        'max_depth': 9,
        'min_child_weight': 3,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'reg_alpha': 1.5,
        'reg_lambda': 1.5,
        'gamma': 0.1,
        'scale_pos_weight': 3,
        'eval_metric': 'auc',
        'tree_method': 'hist'
    },
    'catboost': {
        'iterations': 3000,
        'learning_rate': 0.003,
        'depth': 9,
        'l2_leaf_reg': 7,
        'border_count': 128,
        'auto_class_weights': 'Balanced',
        'bootstrap_type': 'Bayesian',
        'bagging_temperature': 0.5,
        'random_strength': 0.5,
        'eval_metric': 'AUC',
        'od_type': 'Iter',
        'od_wait': 100
    }
}

In [31]:
def train_single_model(model_name, X_train, y_train, X_val, y_val):
    
    if model_name == 'lgbm':
        model = LGBMClassifier(**model_params['lgbm'], random_state=42, verbose=-1)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[
                early_stopping(150, verbose=False),
                log_evaluation(0)
            ]
        )
        
    elif model_name == 'xgb':
        model = XGBClassifier(**model_params['xgb'], random_state=42)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        
    elif model_name == 'catboost':
        model = CatBoostClassifier(**model_params['catboost'], random_state=42, verbose=0)
        model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            verbose=False
        )
    
    return model

def optimize_ensemble_weights(predictions, y_true):
    def objective(weights):
        weights = weights / weights.sum()
        ensemble_pred = np.average(predictions, axis=0, weights=weights)
        return -roc_auc_score(y_true, ensemble_pred)
    
    n_models = len(predictions)
    initial_weights = np.ones(n_models) / n_models
    bounds = [(0, 1) for _ in range(n_models)]
    constraints = {'type': 'eq', 'fun': lambda w: w.sum() - 1}
    
    result = minimize(
        objective,
        initial_weights,
        method='SLSQP',
        bounds=bounds,
        constraints=constraints,
        options={'maxiter': 1000}
    )
    
    return result.x / result.x.sum()


In [41]:
def train_ensemble_cv(X, y, X_test, n_splits=10):
    """Cross-validation ile ensemble eğitimi"""
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    model_names = ['lgbm', 'xgb', 'catboost']
    
    # Tahmin kayıtları
    oof_predictions = {name: np.zeros(len(X)) for name in model_names}
    test_predictions = {name: np.zeros(len(X_test)) for name in model_names}
    
    print(f"CROSS-VALIDATION EĞİTİMİ ({n_splits} Fold)")
    
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        print(f"\n{'─'*80}")
        print(f"FOLD {fold}/{n_splits}")
        print(f"{'─'*80}")
        
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        
        # Özellik ölçeklendirme
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(
            scaler.fit_transform(X_train_fold),
            columns=X_train_fold.columns,
            index=X_train_fold.index
        )
        X_val_scaled = pd.DataFrame(
            scaler.transform(X_val_fold),
            columns=X_val_fold.columns,
            index=X_val_fold.index
        )
        X_test_scaled = pd.DataFrame(
            scaler.transform(X_test),
            columns=X_test.columns,
            index=X_test.index
        )
        
        # Her model için eğitim
        fold_aucs = []
        for model_name in model_names:
            print(f"  Eğitiliyor: {model_name.upper()}...", end=' ')
            
            model = train_single_model(
                model_name, 
                X_train_scaled, y_train_fold, 
                X_val_scaled, y_val_fold
            )
            
            # Tahminler
            val_pred = model.predict_proba(X_val_scaled)[:, 1]
            test_pred = model.predict_proba(X_test_scaled)[:, 1]
            
            # Kaydet
            oof_predictions[model_name][val_idx] = val_pred
            test_predictions[model_name] += test_pred / n_splits
            
            # AUC hesapla
            fold_auc = roc_auc_score(y_val_fold, val_pred)
            fold_aucs.append(fold_auc)
            print(f"AUC: {fold_auc:.5f}")
        
        print(f"  Fold ortalama AUC: {np.mean(fold_aucs):.5f}")
    
    # Bireysel model OOF skorları
    print("BİREYSEL MODEL PERFORMANSLARI")
    
    
    individual_aucs = []
    for model_name in model_names:
        oof_auc = roc_auc_score(y, oof_predictions[model_name])
        individual_aucs.append(oof_auc)
        print(f"{model_name.upper():12s} OOF AUC: {oof_auc:.5f}")
    
    # Ensemble ağırlıklarını optimize et
    print("ENSEMBLE AĞIRLIK OPTİMİZASYONU")
   
    
    predictions_array = np.array([oof_predictions[name] for name in model_names])
    optimal_weights = optimize_ensemble_weights(predictions_array, y.values)
    
    print("\nOptimal ağırlıklar:")
    for name, weight in zip(model_names, optimal_weights):
        print(f"  {name.upper():12s}: {weight:.4f}")
    
    # Ensemble OOF skoru
    ensemble_oof = np.average(predictions_array, axis=0, weights=optimal_weights)
    oof_score = roc_auc_score(y, ensemble_oof)
    
    # Test tahminleri
    test_preds_array = np.array([test_predictions[name] for name in model_names])
    final_test_predictions = np.average(test_preds_array, axis=0, weights=optimal_weights)
    
    return final_test_predictions, optimal_weights, oof_score, individual_aucs

In [25]:
print("ENSEMBLE MODEL EĞİTİMİ BAŞLIYOR")


test_predictions, final_weights, oof_score, individual_aucs = train_ensemble_cv(
    X_train_final, 
    y_train, 
    X_test_final, 
    n_splits=10
)

print("Model eğitimi tamamlandı!")

ENSEMBLE MODEL EĞİTİMİ BAŞLIYOR
CROSS-VALIDATION EĞİTİMİ (10 Fold)

────────────────────────────────────────────────────────────────────────────────
FOLD 1/10
────────────────────────────────────────────────────────────────────────────────
  Eğitiliyor: LGBM... AUC: 0.72197
  Eğitiliyor: XGB... AUC: 0.71887
  Eğitiliyor: CATBOOST... AUC: 0.72285
  Fold ortalama AUC: 0.72123

────────────────────────────────────────────────────────────────────────────────
FOLD 2/10
────────────────────────────────────────────────────────────────────────────────
  Eğitiliyor: LGBM... AUC: 0.72139
  Eğitiliyor: XGB... AUC: 0.72249
  Eğitiliyor: CATBOOST... AUC: 0.72327
  Fold ortalama AUC: 0.72238

────────────────────────────────────────────────────────────────────────────────
FOLD 3/10
────────────────────────────────────────────────────────────────────────────────
  Eğitiliyor: LGBM... AUC: 0.71792
  Eğitiliyor: XGB... AUC: 0.71774
  Eğitiliyor: CATBOOST... AUC: 0.71735
  Fold ortalama AUC: 0.71767

──

In [28]:
print("FİNAL SONUÇLAR")


print(f"Bireysel Model AUC Skorları:")
print(f"  LGBM:     {individual_aucs[0]:.5f}")
print(f"  XGBoost:  {individual_aucs[1]:.5f}")
print(f"  CatBoost: {individual_aucs[2]:.5f}")
print(f"  Ortalama: {np.mean(individual_aucs):.5f}")

print(f"\n{'─'*80}")
print(f"ENSEMBLE OOF AUC: {oof_score:.5f}")
print(f"{'─'*80}")

improvement = oof_score - np.mean(individual_aucs)
print(f"Ensemble iyileştirmesi: +{improvement:.5f} ({improvement*100:.2f}%)")

print(f"\nOptimal Ensemble Ağırlıkları:")
print(f"  LGBM:     {final_weights[0]:.4f}")
print(f"  XGBoost:  {final_weights[1]:.4f}")
print(f"  CatBoost: {final_weights[2]:.4f}")

FİNAL SONUÇLAR
Bireysel Model AUC Skorları:
  LGBM:     0.70827
  XGBoost:  0.71773
  CatBoost: 0.72020
  Ortalama: 0.71540

────────────────────────────────────────────────────────────────────────────────
ENSEMBLE OOF AUC: 0.71945
────────────────────────────────────────────────────────────────────────────────
Ensemble iyileştirmesi: +0.00405 (0.41%)

Optimal Ensemble Ağırlıkları:
  LGBM:     0.3333
  XGBoost:  0.3334
  CatBoost: 0.3333


In [29]:
print("SUBMISSION OLUŞTURMA")


submission = pd.DataFrame({
    'cust_id': test_ids,
    'churn': test_predictions
})

# Submission istatistikleri
print(f"  Satır sayısı: {len(submission)}")
print(f"  Churn istatistikleri:")
print(f"    Ortalama: {submission['churn'].mean():.5f}")
print(f"    Std:      {submission['churn'].std():.5f}")
print(f"    Min:      {submission['churn'].min():.5f}")
print(f"    Max:      {submission['churn'].max():.5f}")
print(f"    Median:   {submission['churn'].median():.5f}")

submission.to_csv('submission_final.csv', index=False)

print(f"Submission önizleme:")
print(submission.head(10))

SUBMISSION OLUŞTURMA
  Satır sayısı: 43006
  Churn istatistikleri:
    Ortalama: 0.34944
    Std:      0.15811
    Min:      0.04602
    Max:      0.67928
    Median:   0.36382
Submission önizleme:
   cust_id     churn
0        1  0.370022
1        2  0.290134
2        9  0.549835
3       15  0.496713
4       19  0.202485
5       21  0.118094
6       26  0.546639
7       32  0.486587
8       33  0.355934
9       36  0.103002
