In [1]:
import pandas as pd
import numpy as np
import warnings
pd.set_option("display.max_columns",None)
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("whole_train.csv")
test = pd.read_csv("whole_test.csv")

In [3]:
x = train.drop(columns=["individualnumber","response"],axis=1)
y = train["response"]

# Cross Validation

__``Veri seti dengesiz olduğu için cross validation esnasında seçilecek splitlerin rastgele olması ve tüm hedef değişkenlerini kapsaması adına Stratified KFold seçtim``__

In [4]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=24)

__``Yarışma değerlendirmesi F1 Score üzerinden yapılacağı için, cross validation esnasında F1 score baz alınmasını istedim``__

In [5]:
def stratifiedKFold_on_f1(model,x,y):
    scores = cross_val_score(model, x, y, cv=skf, n_jobs=-1, scoring="f1")
    print("Model : {}".format(model))
    print("f1 scores",scores)
    print(f"f1 scores mean: {np.mean(scores)} , f1 scores std :{np.std(scores)}" )
    print("\n\n")
    

<img src="lazypredict.jpg" width="600" />

__``Pek dikkate alınmaması gereken sonuçlar vermemesine rağmen, yarışma esnasında çalışırken LazyPredict modülünü kullandım ve fikir olması adına training seti üzerinde tabloda gözüktüğü üzere farklı modeller denedim. Gözüme çarpanları seçerek KFold esnasında gözlemlemek üzere import edip kurdum.``__

In [6]:
from sklearn.linear_model import Perceptron
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier ,AdaBoostClassifier,ExtraTreesClassifier,BaggingClassifier,GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression,SGDClassifier,PassiveAggressiveClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC,LinearSVC

In [7]:
perceptron = Perceptron(random_state=0)
xgb = XGBClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
ada = AdaBoostClassifier()
etc = ExtraTreesClassifier()
bagging = BaggingClassifier()
gbc = GradientBoostingClassifier()
calibrated = CalibratedClassifierCV()
bnb = BernoulliNB()
nc = NearestCentroid()
gnb = GaussianNB()
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
logreg = LogisticRegression()
sgdc = SGDClassifier()
pac = PassiveAggressiveClassifier()
lgbm = LGBMClassifier()
svc = SVC()
linear_svc = LinearSVC()

In [8]:
models = [perceptron,xgb,dt,rf,ada,etc,bagging,gbc,calibrated,bnb,nc,gnb,lda,qda,logreg,sgdc,pac,lgbm,svc,linear_svc]

In [9]:
for model in models:
    stratifiedKFold_on_f1(model,x,y)

Model : Perceptron()
f1 scores [0.         0.07407407 0.30508475 0.31111111 0.41666667 0.
 0.13333333 0.15384615 0.28571429 0.12903226]
f1 scores mean: 0.18088626285728523 , f1 scores std :0.13472004220741127



Model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
f1 scores [0.23076923 0.32  

# Feature Selection

__``Sınıflandırma problemlerine yönelik bir modül olan mutual info modülünü kullandım``__

In [10]:
from sklearn.feature_selection import mutual_info_classif

In [15]:
mutual = mutual_info_classif(x,y)
mutual

array([0.00152305, 0.00686721, 0.00639   , 0.0008186 , 0.00152156,
       0.00310915, 0.01242238, 0.0070982 , 0.01456927, 0.00053134,
       0.00387522, 0.0108344 , 0.00587408, 0.01402303, 0.00574294,
       0.03091884, 0.01287099, 0.00578028, 0.01748861, 0.01284198,
       0.00399221, 0.01801207, 0.01261719, 0.01135225, 0.00808792,
       0.00958163, 0.01062926, 0.00908126, 0.01321864, 0.00728777,
       0.01278694, 0.01272984])

In [16]:
mutual = pd.Series(mutual)
mutual.index = x.columns
mutual.sort_values(ascending=False)

discount_type_1_sum            0.030919
discount_type_3_sum            0.018012
discount_type_2_sum            0.017489
amount_sum                     0.014569
quantity_sum                   0.014023
is_sanal_size                  0.013219
discount_type_1_size           0.012871
discount_type_2_size           0.012842
basketid_nunique               0.012787
date_of_transaction_nunique    0.012730
discount_type_3_size           0.012617
amount_max                     0.012422
category_level_1_nunique       0.011352
quantity_max                   0.010834
category_level_4_nunique       0.010629
category_level_3_nunique       0.009582
is_sanal_mean                  0.009081
category_level_2_nunique       0.008088
cardnumber_nunique             0.007288
amount_min                     0.007098
hakkedis_amt                   0.006867
odul_amt                       0.006390
quantity_min                   0.005874
discount_type_2_mean           0.005780
quantity_median                0.005743


In [17]:
# 0.001 ve altındaki featureları çıkardım
x.drop(columns=["amount_median","age","gender","category_number"],inplace=True)

In [18]:
for model in models:
    stratifiedKFold_on_f1(model,x,y)

Model : Perceptron()
f1 scores [0.25806452 0.07407407 0.18181818 0.23529412 0.32       0.06896552
 0.23809524 0.2        0.26666667 0.32653061]
f1 scores mean: 0.2169508923916529 , f1 scores std :0.08450294414670585



Model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
f1 scores [0.28571429

In [19]:
# 0.005 altındaki featureları çıkardım
x.drop(columns=["discount_type_3_mean","quantity_mean","amount_mean"],inplace=True)

In [20]:
for model in models:
    stratifiedKFold_on_f1(model,x,y)

Model : Perceptron()
f1 scores [0.27586207 0.07142857 0.21428571 0.07407407 0.29457364 0.06060606
 0.26415094 0.14814815 0.25       0.23255814]
f1 scores mean: 0.18856873638500485 , f1 scores std :0.08705123963738606



Model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
f1 scores [0.2222222

In [21]:
#0.005 olan featureları çıkardım
x.drop(columns=["hakkedis_amt",
                "quantity_median",
                "discount_type_2_mean",
                "cardnumber_nunique",
                "odul_amt"],axis=1,inplace=True)

In [22]:
for model in models:
    stratifiedKFold_on_f1(model,x,y)

Model : Perceptron()
f1 scores [0.25862069 0.32941176 0.19354839 0.16216216 0.34666667 0.0625
 0.26829268 0.125      0.21428571 0.14285714]
f1 scores mean: 0.21033452103563444 , f1 scores std :0.08646660137789167



Model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
f1 scores [0.22222222 0.

In [23]:
#0.01 altındaki featureları da çıkardım
x.drop(columns=["category_level_3_nunique",
                "is_sanal_mean",
                "category_level_2_nunique",
                "amount_min",
                "quantity_min"],axis=1,inplace=True)

In [24]:
for model in models:
    stratifiedKFold_on_f1(model,x,y)

Model : Perceptron()
f1 scores [0.24793388 0.0625     0.21875    0.         0.32183908 0.17391304
 0.17021277 0.18404908 0.28571429 0.15      ]
f1 scores mean: 0.18149121396618853 , f1 scores std :0.09200198852814787



Model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
f1 scores [0.3076923

In [25]:
x

Unnamed: 0,amount_max,amount_sum,quantity_max,quantity_sum,discount_type_1_sum,discount_type_1_size,discount_type_2_sum,discount_type_2_size,discount_type_3_sum,discount_type_3_size,category_level_1_nunique,category_level_4_nunique,is_sanal_size,basketid_nunique,date_of_transaction_nunique
0,149.90,6188.54,7.755,467.009,47.772341,393.0,172.87,393.0,220.81,393.0,39.0,25.0,64.0,64.0,59.0
1,2899.90,6756.74,5.000,189.397,103.830001,138.0,619.24,138.0,1298.71,138.0,25.0,20.0,30.0,30.0,30.0
2,1999.00,24681.61,10.000,2371.936,1825.610969,1709.0,995.65,1709.0,2549.79,1709.0,40.0,30.0,518.0,518.0,268.0
3,192.78,7376.14,6.000,434.464,22.772566,287.0,39.15,287.0,271.04,287.0,22.0,20.0,40.0,40.0,30.0
4,29.95,256.80,5.000,46.574,0.000000,24.0,0.00,24.0,10.21,24.0,11.0,6.0,4.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13097,300.00,5979.13,24.000,1044.585,198.249222,893.0,338.72,893.0,1472.60,893.0,41.0,29.0,506.0,506.0,275.0
13098,269.50,6257.18,10.000,331.204,165.100000,271.0,121.07,271.0,337.22,271.0,31.0,21.0,40.0,40.0,35.0
13099,77.90,1831.70,4.000,161.742,26.851917,142.0,159.44,142.0,305.61,142.0,23.0,15.0,39.0,39.0,34.0
13100,68.50,738.55,4.000,74.445,0.000000,53.0,24.38,53.0,7.18,53.0,20.0,9.0,11.0,11.0,11.0


In [26]:
y

0        0
1        0
2        0
3        0
4        0
        ..
13097    0
13098    0
13099    0
13100    0
13101    0
Name: response, Length: 13102, dtype: int64

# Hyperparameter Tuning

__``GridSearch ve RandomizedSearch ile seçtiğim bir kaç model için en iyi hiperparametre tespitini yapacağım``__

__``Bazı modeller ile GridSearchCV algoritmasının en iyi hiperparametreleri bulması çok uzun sürdüğünden, onları daha hızlı çalışan RandomizedSearchCV algoritması ile kullanacağım, iki modülü de yükledim.``__

In [27]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

__``Seçilen modellere ait hyperparameter gridleri``__

In [28]:
xgb_grid = {'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
            'max_depth': [3,4,5,6,8,10,12,15],
            'min_child_weight' : [1,3,5,7],
            'gamma': [0,0.1,0.2,0,3,0.4],
            'colsample_bytree' : [0.3, 0.4, 0.5, 0.7]}

dt_grid = {"criterion" : ["gini","entropy"],"max_depth" : [2,4,6,8,10,12]}

rf_grid = {'bootstrap': [True, False],
           'max_depth': [10, 20, 30, 40],
           'max_features': ['auto', 'sqrt'],
           'min_samples_leaf': [1, 2],
           'min_samples_split': [2, 5],
           'n_estimators': [100,200,300,400]}

lda_grid = {"solver" : ["svd", "lsqr", "eigen"], "shrinkage" : ["auto",None,np.arange(0, 1.01, 0.01)]}

ada_grid = {"n_estimators" : [10, 50, 100, 200,300], "learning_rate" : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]}

lgbm_grid = {"n_estimators" : [100,200,300], "learning_rate" : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
             "num_leaves" : [20, 50, 80, 100]}

In [29]:
gridcv_models_grid = [dt_grid,lda_grid]

randomcv_models_grid = [xgb_grid,rf_grid,ada_grid,lgbm_grid]

models_gridcv = [dt, lda]

models_randomcv = [xgb,rf,ada,lgbm]

In [30]:
def GridCV_tuner(model,grid,x,y):
    gridcv = GridSearchCV(model, grid, scoring='f1', cv=skf, n_jobs=-1)
    gridcv_results = gridcv.fit(x,y)
    print("Model : {}".format(model))
    print('Best Score: %.3f' % gridcv_results.best_score_)
    print('Config: %s' % gridcv_results.best_params_)
    print("\n\n")

In [31]:
def RandomizedCV_tuner(model,grid,x,y):
    randomcv = RandomizedSearchCV(model,grid,n_iter=10,scoring='f1',n_jobs=-1,cv=skf)
    randomcv_results = randomcv.fit(x,y)
    print("Model : {}".format(model))
    print('Best Score: %.3f' % randomcv_results.best_score_)
    print('Config: %s' % randomcv_results.best_params_)
    print("\n\n")

In [32]:
for model, grid in zip(models_gridcv,gridcv_models_grid):
    GridCV_tuner(model,grid,x,y)

Model : DecisionTreeClassifier()
Best Score: 0.265
Config: {'criterion': 'entropy', 'max_depth': 8}



Model : LinearDiscriminantAnalysis()
Best Score: 0.417
Config: {'shrinkage': 'auto', 'solver': 'lsqr'}





In [34]:
for model, grid in zip(models_randomcv,randomcv_models_grid):
    RandomizedCV_tuner(model,grid,x,y)

Model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
Best Score: 0.253
Config: {'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.3, 'gamma': 3, 'colsample_bytree': 0.4}



Model : RandomForestClassifier()
Best Score: 0.213
Config: {'n_estimators': 300, 'min_samples_split': 5, 'min_sa

__``AdaBoost algoritmasının feature selectiondan önce daha iyi skor vermesi üzerine ona özel olarak girdi hazırlamayı tercih ettim.``__

In [37]:
x_ada = train.drop(columns=["individualnumber","response"],axis=1)

In [38]:
RandomizedCV_tuner(ada,ada_grid,x_ada,y)

Model : AdaBoostClassifier()
Best Score: 0.270
Config: {'n_estimators': 300, 'learning_rate': 0.3}





In [40]:
from sklearn.preprocessing import StandardScaler
z_score = StandardScaler()

In [41]:
x_scaled = z_score.fit_transform(x)
x_ada_scaled = z_score.fit_transform(x_ada)

In [42]:
for model, grid in zip(models_gridcv,gridcv_models_grid):
    GridCV_tuner(model,grid,x_scaled,y)

Model : DecisionTreeClassifier()
Best Score: 0.247
Config: {'criterion': 'entropy', 'max_depth': 8}



Model : LinearDiscriminantAnalysis()
Best Score: 0.417
Config: {'shrinkage': 'auto', 'solver': 'lsqr'}





In [44]:
randomcv_models_grid = [xgb_grid,rf_grid,lgbm_grid]
models_randomcv = [xgb,rf,lgbm]


In [43]:
for model, grid in zip(models_randomcv,randomcv_models_grid):
    RandomizedCV_tuner(model,grid,x_scaled,y)

Model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
Best Score: 0.264
Config: {'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.15, 'gamma': 0.1, 'colsample_bytree': 0.4}



Model : RandomForestClassifier()
Best Score: 0.222
Config: {'n_estimators': 200, 'min_samples_split': 2, 'min

In [45]:
RandomizedCV_tuner(ada,ada_grid,x_ada_scaled,y)

Model : AdaBoostClassifier()
Best Score: 0.260
Config: {'n_estimators': 300, 'learning_rate': 0.25}





__``Veri setinin çok dengesiz olması ve aşırı fazlalıkta 0 sonucu bulunmasından ötürü modellerin 0 tahmini yapmaya yatkınlaşacağı konusunu öğrendikten sonra SMOTEtomek modülü ile azınlık olan 1 sonucuna ait noktaları artırmayı hedefledim. Edindiğim bilgilere göre RandomOverSampler algoritması azınlık olan noktaları kopyalarken, SMOTEtomek algoritması kopyalamak yerine noktaların yakın komşuluğunda yeni noktalar üretiyor. Modelin olabildiğince ezberden uzak ve genel bir model olması, olabildiğince az hatalı tahmin ile doğru pozitif ve doğru negatif değerlerinin yüksek olması ve böylelikle daha yüksek bir F1 skoruna sahip olması ihtimalini artırmak adına SMOTEtomek yöntemini kullanmayı tercih ettim.``__

__``Sampling değerini 0.2den fazla seçtiğimde cross validatonda aşırı yüksek sonuçlar geliyor ve modeli overfit olmaya itebileceğini düşünerek bu değeri düşük seçmek istedim fakat daha az seçmek istersem minimum 0.17 seçebiliyorum, algoritma bundan daha az yüzde ile çalışamayacağına dair bir uyarı veriyor, bu sebeple 0.2 seçtim.``__

In [46]:
from imblearn.combine import SMOTETomek

smotetomek = SMOTETomek(sampling_strategy=0.2,random_state=24)

In [47]:
x_smotetomek,y_smotetomek = smotetomek.fit_resample(x_scaled,y)

x_ada_smotetomek,y_ada_smotetomek = smotetomek.fit_resample(x_ada_scaled,y)

In [48]:
for model, grid in zip(models_gridcv,gridcv_models_grid):
    GridCV_tuner(model,grid,x_smotetomek,y_smotetomek)

Model : DecisionTreeClassifier()
Best Score: 0.848
Config: {'criterion': 'gini', 'max_depth': 12}



Model : LinearDiscriminantAnalysis()
Best Score: 0.638
Config: {'shrinkage': 'auto', 'solver': 'lsqr'}





In [49]:
for model, grid in zip(models_randomcv,randomcv_models_grid):
    RandomizedCV_tuner(model,grid,x_smotetomek,y_smotetomek)

Model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
Best Score: 0.958
Config: {'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.25, 'gamma': 0.2, 'colsample_bytree': 0.5}



Model : RandomForestClassifier()
Best Score: 0.946
Config: {'n_estimators': 100, 'min_samples_split': 5, 'mi

In [50]:
RandomizedCV_tuner(ada,ada_grid,x_ada_smotetomek,y_ada_smotetomek)

Model : AdaBoostClassifier()
Best Score: 0.831
Config: {'n_estimators': 200, 'learning_rate': 0.3}





__``Select ettiğimiz featureları test setinden de çıkarıp tahmin için test setini ayarlama ve ölçekleme``__

In [54]:
test_x = test.drop(columns=["individualnumber","amount_median","age","gender","category_number","discount_type_3_mean","quantity_mean","amount_mean",
        "hakkedis_amt", "quantity_median", "discount_type_2_mean", "cardnumber_nunique", "odul_amt",
        "category_level_3_nunique", "is_sanal_mean", "category_level_2_nunique", "amount_min", "quantity_min"])

In [80]:
test_x = z_score.fit_transform(test_x)

In [59]:
test_ada = test.drop(["individualnumber"],axis=1)

In [81]:
test_ada = z_score.fit_transform(test_ada)

__``Her modeli tespit edebildiğim kadarıyla en iyi hiperparametreye sahip olacak şekilde tanımladım.``__

In [82]:
dt_best = DecisionTreeClassifier(criterion = 'gini', max_depth = 12)

lda_best = LinearDiscriminantAnalysis(shrinkage= 'auto', solver='lsqr')

xgb_best = XGBClassifier(min_child_weight = 1, max_depth = 10, learning_rate = 0.25, gamma = 0.2, colsample_bytree = 0.5)

rf_best = RandomForestClassifier(n_estimators= 100, min_samples_split= 5, max_features='sqrt', max_depth= 40, bootstrap= False)

lgbm_best = LGBMClassifier(num_leaves= 50, n_estimators= 300, learning_rate= 0.2)

ada_best = AdaBoostClassifier(n_estimators= 200, learning_rate= 0.3)

In [83]:
sample = pd.read_csv("sample_submission.csv")

__``Her model ile tahminleri gerçekleştirme ve Kaggle sistemine yüklemek üzere sample submission dosyamıza atama``__

In [84]:
dt_best.fit(x_smotetomek,y_smotetomek)

sample["response"] = dt_best.predict(test_x)

In [85]:
sample.to_csv("dt_best_0.2smotetomek.csv",index=False)

In [86]:
lda_best.fit(x_smotetomek,y_smotetomek)

sample["response"] = lda_best.predict(test_x)

In [87]:
sample.to_csv("lda_best_0.2smotetomek.csv",index=False)

In [88]:
xgb_best.fit(x_smotetomek,y_smotetomek)

sample["response"] = xgb_best.predict(test_x)

sample.to_csv("xgb_best_0.2smotetomek.csv",index=False)

In [89]:
rf_best.fit(x_smotetomek,y_smotetomek)

sample["response"] = rf_best.predict(test_x)

sample.to_csv("rf_best_0.2smotetomek.csv",index=False)

In [90]:
lgbm_best.fit(x_smotetomek,y_smotetomek)

sample["response"] = lgbm_best.predict(test_x)

sample.to_csv("lgbm_best_0.2smotetomek.csv",index=False)

In [91]:
ada_best.fit(x_ada_smotetomek,y_ada_smotetomek)

sample["response"] = ada_best.predict(test_ada)

sample.to_csv("ada_best_0.2smotetomek.csv",index=False)

# Yarışma Sonucu

> __``Çok düşük beklentilerle, sadece gerçek verilerle kendimi deneme isteğim ve bu esnada yaşadığım sorunlar sayesinde hem yeni şeyler öğrenir hem de tecrübe kazanırım beklentisi ile girdiğim bu yarışmayı beklediğimden iyi bir sıralama ile tamamladım. 287 yarışmacı ve 186 takım içerisinden bireysel olarak 66. sıraya yerleşmemi sağlayan submissionlarım ektedir:``__

<img src="competition_subs.jpg" width="800" />

> __``XGBClassifier modeli kullanarak ilk yaptığım submission sonucu public 49. sıradaydım fakat yarışma sonunda private liste açıklanınca bu model test setinin geri kalanında daha düşük skor aldı. Öte yandan ise bunun beraberinde kullandığım RandomForestClassifier modeli ile yapılan submissionu da final için seçmem sayesinde sonuç olarak 66. sırada kaldım. Private-public test setleri %50 oranında eşit bölündüğü için, 0.4public + 0.4private skorlar toplamda 0.80 F1 skoru ile RandomForest modelimin çalıştığını gösteriyor denebilir.``__

> __``Öte yandan, çalışmalarımı temize çekmek için yarışma sonrasında yazdığım bu notebooktaki modellerin skorlarını, yarışma bittikten sonra da late submission yapıp skor alabildiğimiz için denemek istedim:``__

<img src="late_subs.jpg" width="800" />



> __``Ek olarak, bu notebook haricinde yarışma sonrası bir çok farklı feature selection veya oversampling kombinasyonları ile denemeler yaptığımda aldığım en yüksek skorları LinearDiscriminantAnalysis modelleri ile yakaladığımı farkettim:``__

<img src="lda_subs.jpg" width="800" />

>__``Sonuç olarak, elimizdeki dengesiz veri setiyle RandomForestClassifier ve LinearDiscriminantAnalysis modellerinin daha iyi iş yaptığı söylenebilir.``__