In [283]:
import numpy as np
import pandas as pd
import time
import re
import os
import copy
import pickle

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, ParameterGrid, cross_val_score, learning_curve, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torch.hub import load_state_dict_from_url

from tqdm.notebook import tqdm
from IPython.display import clear_output
import matplotlib.pyplot as plt

In [379]:
# Directory that holds train.csv / test.csv and receives the submission files.
# Set the ADVANCED_DLS_DATA_DIR environment variable to run the notebook on
# another machine; the original local path is kept as the fallback.
files_path = os.environ.get(
    'ADVANCED_DLS_DATA_DIR',
    r'C:\Users\kirill\Documents\PythonScripts\advanced-dls-spring-2021',
)

In [388]:
def write_result(model, x, file_name, files_path=files_path):
    """Predict with ``model`` on ``x`` and save the result as a submission CSV.

    Parameters
    ----------
    model : object exposing ``predict(x)``.
    x : feature matrix passed unchanged to ``model.predict``.
    file_name : name of the output file, without the ``.csv`` extension.
    files_path : directory the file is written to; defaults to the value of
        the notebook-level ``files_path`` at the time this function is defined.

    The written file has an ``Id`` column (0 .. n-1) and a ``Churn`` column.
    """
    prediction = model.predict(x)
    # Size the index from the predictions themselves; a hard-coded row count
    # raises as soon as the input has any other number of rows.
    result = pd.DataFrame(prediction, index=range(len(prediction)), columns=['Churn'])
    result.to_csv(os.path.join(files_path, f'{file_name}.csv'), sep=',', index=True, index_label='Id')

In [366]:
# Inspect the row index of the test table (its length is the number of rows to predict).
# NOTE(review): df_test is loaded in a cell further down, so in top-to-bottom
# order this cell runs before df_test exists.
df_test.index.values

array([   0,    1,    2, ..., 1758, 1759, 1760], dtype=int64)

In [355]:
# Read the train and test tables from the data directory, then show the
# training frame as the cell output.
df_train = pd.read_csv(f'{files_path}{os.path.sep}train.csv')
df_test = pd.read_csv(f'{files_path}{os.path.sep}test.csv')
df_train

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod,Churn
0,55,19.50,1026.35,Male,0,Yes,Yes,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,0
1,72,25.85,1872.2,Male,0,Yes,No,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),0
2,1,75.90,75.9,Male,0,No,No,Yes,No,Fiber optic,No,No,No,Yes,No,No,Month-to-month,Yes,Electronic check,1
3,32,79.30,2570,Female,1,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Mailed check,0
4,60,115.25,6758.45,Female,0,Yes,Yes,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic),0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,3,30.40,82.15,Male,0,No,No,No,No phone service,DSL,No,No,No,Yes,No,No,Month-to-month,No,Electronic check,0
5278,50,44.45,2188.45,Male,0,Yes,No,No,No phone service,DSL,Yes,No,No,Yes,Yes,No,One year,Yes,Bank transfer (automatic),0
5279,1,55.05,55.05,Male,0,No,No,Yes,No,DSL,No,No,Yes,Yes,No,No,Month-to-month,No,Mailed check,0
5280,29,76.00,2215.25,Female,0,No,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Credit card (automatic),0


In [356]:
# Count how many training rows fall into each target class.
df_train['Churn'].value_counts()

0    3898
1    1384
Name: Churn, dtype: int64

In [360]:
# List the column names of the training table (features followed by the Churn target).
df_train.columns

Index(['ClientPeriod', 'MonthlySpending', 'TotalSpent', 'Sex',
       'IsSeniorCitizen', 'HasPartner', 'HasChild', 'HasPhoneService',
       'HasMultiplePhoneNumbers', 'HasInternetService',
       'HasOnlineSecurityService', 'HasOnlineBackup', 'HasDeviceProtection',
       'HasTechSupportAccess', 'HasOnlineTV', 'HasMovieSubscription',
       'HasContractPhone', 'IsBillingPaperless', 'PaymentMethod', 'Churn'],
      dtype='object')

In [361]:
# Fill blanks: TotalSpent arrives as text in which blank (whitespace-only)
# entries stand for missing values; they are replaced by 0 and the column is
# converted to float.
def _total_spent_to_float(column):
    """Return ``column`` as float, mapping blank entries to 0.

    Works on both the raw text column and an already numeric one, so the
    cell can be re-run safely and does not depend on how pandas parsed the CSV.
    """
    as_text = column.astype(str).str.strip()
    return as_text.replace('', '0').astype('float')


df_train['TotalSpent'] = _total_spent_to_float(df_train['TotalSpent'])
df_test['TotalSpent'] = _total_spent_to_float(df_test['TotalSpent'])

In [362]:
# Convert the frames to plain arrays: every column but the last is a feature,
# the last training column is the target, stored as an (n, 1) int64 column.
x_train = df_train.to_numpy()[:, :-1]
y_train = df_train.to_numpy()[:, -1].astype(np.int64).reshape(-1, 1)
x_test = df_test.to_numpy()

In [227]:
# Positional indices (within df_train.columns) of the columns that will be
# one-hot encoded.
# NOTE(review): ClientPeriod is listed as categorical although it looks
# numeric in the data preview — confirm this is intended.
categorial_features = df_train.columns.get_indexer([
    'ClientPeriod',
    'Sex',
    'IsSeniorCitizen',
    'HasPartner',
    'HasChild',
    'HasPhoneService',
    'HasMultiplePhoneNumbers',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod',
])

# Positional indices of the columns that will be standardised.
numerical_features = df_train.columns.get_indexer(['MonthlySpending', 'TotalSpent'])

In [228]:
# Preprocessing shared by every model below: one-hot encode the categorical
# columns and standardise the numerical ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising, so a CV fold or the test set containing a new
# value no longer aborts transform().
cat_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')
num_transformer = StandardScaler()

ct = ColumnTransformer([
    ('cat', cat_transformer, categorial_features),
    ('num', num_transformer, numerical_features)
])

In [235]:
# Smoke test of the preprocessing: fit a plain logistic regression on the
# encoded training data and report its accuracy on that same data.
# The features are encoded once and reused, instead of refitting the
# transformer for every call.
x_train_encoded = ct.fit_transform(x_train)
model = LogisticRegression(solver='saga')
model.fit(x_train_encoded, y_train.ravel())
np.mean(model.predict(x_train_encoded) == y_train.ravel())



0.810109806891329

In [236]:
# Logistic regression (saga solver) behind the shared preprocessing, tuned
# over the penalty type and C with 3-fold shuffled cross-validation.
pipeline = Pipeline(steps=[
    ('transformer', ct),
    ('classifier', LogisticRegression(solver='saga')),
])
cv = KFold(n_splits=3, shuffle=True, random_state=42)
param_grid = [dict(
    classifier__penalty=['l2', 'l1'],
    classifier__C=[0.001, 0.01, 0.1, 0.2, 0.5, 1],
)]
search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-2, verbose=3)

In [239]:
# Run the logistic-regression grid search; the labels are flattened to 1-D and
# the result is bound to `_` so the cell has no value to display.
_ = search.fit(x_train, y_train.reshape(-1))

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [240]:
# Best cross-validated score of the logistic-regression search and the parameters that produced it.
search.best_score_, search.best_params_

(0.8048100252955449, {'classifier__C': 0.1, 'classifier__penalty': 'l1'})

# Default SVC

In [245]:
# SVC with its default kernel settings; only C is tuned, using 5-fold
# shuffled cross-validation.
pipeline = Pipeline(steps=[
    ('transformer', ct),
    ('classifier', SVC()),
])
cv = KFold(n_splits=5, shuffle=True, random_state=42)
param_grid = dict(
    classifier__C=list(np.arange(0.1, 2, 0.2)),
)
search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=3)

In [246]:
# Run the default-SVC grid search on the training data (labels flattened to 1-D).
_ = search.fit(x_train, y_train.reshape(-1))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [247]:
# Best cross-validated score of the default-SVC search and the selected C.
search.best_score_, search.best_params_

(0.8021605601903616, {'classifier__C': 0.7000000000000001})

# SVC

In [259]:
# Polynomial-kernel SVC: tune C, the polynomial degree and gamma with 5-fold
# shuffled cross-validation.
np.random.seed(42)
pipeline = Pipeline(steps=[
    ('transformer', ct),
    ('classifier', SVC()),
])
cv = KFold(n_splits=5, shuffle=True, random_state=42)
param_grid = [dict(
    classifier__kernel=['poly'],
    classifier__C=np.arange(0.1, 1, 0.2),
    classifier__degree=np.arange(2, 8),
    classifier__gamma=np.arange(0.001, 0.01, 0.002),
)]
search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-2, verbose=3)

In [260]:
# Run the polynomial-kernel SVC grid search on the training data (labels flattened to 1-D).
_ = search.fit(x_train, y_train.reshape(-1))

Fitting 5 folds for each of 150 candidates, totalling 750 fits


In [261]:
# Best cross-validated score of the polynomial-kernel search and its parameters.
search.best_score_, search.best_params_

(0.7716733321865772,
 {'classifier__C': 0.9000000000000001,
  'classifier__degree': 2,
  'classifier__gamma': 0.009000000000000001,
  'classifier__kernel': 'poly'})

In [262]:
# RBF-kernel SVC: tune C and gamma with 5-fold shuffled cross-validation.
np.random.seed(42)
pipeline = Pipeline(steps=[
    ('transformer', ct),
    ('classifier', SVC()),
])
cv = KFold(n_splits=5, shuffle=True, random_state=42)
param_grid = [dict(
    classifier__kernel=['rbf'],
    classifier__C=np.arange(0.1, 2, 0.2),
    classifier__gamma=np.arange(0.001, 0.02, 0.002),
)]
search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-2, verbose=3)

In [263]:
# Run the RBF-kernel SVC grid search on the training data (labels flattened to 1-D).
_ = search.fit(x_train, y_train.reshape(-1))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [264]:
# Best cross-validated score of the RBF-kernel search and its parameters.
search.best_score_, search.best_params_

(0.8044334666437315,
 {'classifier__C': 1.5000000000000004,
  'classifier__gamma': 0.005,
  'classifier__kernel': 'rbf'})

In [265]:
# Sigmoid-kernel SVC: tune C and gamma with 5-fold shuffled cross-validation.
np.random.seed(42)
pipeline = Pipeline(steps=[
    ('transformer', ct),
    ('classifier', SVC()),
])
cv = KFold(n_splits=5, shuffle=True, random_state=42)
param_grid = [dict(
    classifier__kernel=['sigmoid'],
    classifier__C=np.arange(0.1, 2, 0.2),
    classifier__gamma=np.arange(0.001, 0.02, 0.003),
)]
search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=-2, verbose=3)

In [266]:
# Run the sigmoid-kernel SVC grid search on the training data (labels flattened to 1-D).
_ = search.fit(x_train, y_train.reshape(-1))

Fitting 5 folds for each of 70 candidates, totalling 350 fits


In [267]:
# Best cross-validated score of the sigmoid-kernel search and its parameters.
search.best_score_, search.best_params_

(0.8034875720306184,
 {'classifier__C': 1.3000000000000003,
  'classifier__gamma': 0.019000000000000003,
  'classifier__kernel': 'sigmoid'})

# Random Forest

In [273]:
# Preview the n_estimators values that the random-forest grid below iterates over.
np.arange(50, 300, 50)

array([ 50, 100, 150, 200, 250])

In [276]:
# Random forest behind the shared preprocessing, tuned with 3-fold shuffled
# cross-validation.
# np.random.seed only seeds the global NumPy generator of this process; with
# n_jobs=-2 the fits may run in separate joblib workers, so the forest gets an
# explicit random_state to make the search repeatable.
np.random.seed(42)
pipeline = Pipeline([
    ('transformer', ct),
    ('classifier', RandomForestClassifier(random_state=42))])
cv = KFold(n_splits=3, random_state=42, shuffle=True)
param_grid = {
    'classifier__n_estimators': np.arange(50, 300, 50),
    'classifier__max_depth': list(range(4,7)),
    'classifier__max_features': list(range(2,8)),
    'classifier__criterion': ['gini'],
    'classifier__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],#2 or 8-10
}
search = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=-2, verbose=1)

In [278]:
# Run the random-forest grid search on the training data (labels flattened to 1-D).
_ = search.fit(x_train, y_train.reshape(-1))

Fitting 3 folds for each of 810 candidates, totalling 2430 fits


In [279]:
# Best cross-validated score of the random-forest search and its parameters.
search.best_score_, search.best_params_

(0.7904200652177654,
 {'classifier__criterion': 'gini',
  'classifier__max_depth': 6,
  'classifier__max_features': 7,
  'classifier__min_samples_split': 2,
  'classifier__n_estimators': 100})

# XGBoost

In [303]:
# XGBoost behind the shared preprocessing: tune gamma and the L2 (lambda) and
# L1 (alpha) regularisation terms with 3-fold shuffled cross-validation;
# n_estimators and min_child_weight are held at a single value.
pipeline = Pipeline(steps=[
    ('transformer', ct),
    ('classifier', XGBClassifier(eta=0.1, random_state=42)),
])
cv = KFold(n_splits=3, shuffle=True, random_state=42)
param_grid = dict(
    classifier__n_estimators=[150],
    classifier__gamma=[0.1, 1, 2, 5],
    classifier__min_child_weight=[2],
    # classifier__colsample_by=[0.5, 0.75, 1],  (left out of the search)
    classifier__lambda=[1, 2, 3],  # l2
    classifier__alpha=[2, 5, 10, 20, 50],  # l1
)
search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, n_jobs=10, verbose=1)

In [304]:
# Run the XGBoost grid search on the training data (labels flattened to 1-D).
_ = search.fit(x_train, y_train.reshape(-1))

Fitting 3 folds for each of 60 candidates, totalling 180 fits


dict_items([('classifier__n_estimators', [150]), ('classifier__gamma', [0.1, 1, 2, 5]), ('classifier__min_child_weight', [2]), ('classifier__lambda', [1, 2, 3]), ('classifier__alpha', [2, 5, 10, 20, 50])])

In [305]:
# Best cross-validated score of the XGBoost search and its parameters.
search.best_score_, search.best_params_

(0.8017815505996936,
 {'classifier__alpha': 10,
  'classifier__gamma': 5,
  'classifier__lambda': 1,
  'classifier__min_child_weight': 2,
  'classifier__n_estimators': 150})

In [309]:
# Show the (name, candidate values) pairs of the XGBoost parameter grid.
param_grid.items()

dict_items([('classifier__n_estimators', [150]), ('classifier__gamma', [0.1, 1, 2, 5]), ('classifier__min_child_weight', [2]), ('classifier__lambda', [1, 2, 3]), ('classifier__alpha', [2, 5, 10, 20, 50])])

# Final model

In [384]:
# Final classifier used for the submission.
# NOTE(review): n_estimators=200 and reg_lambda=2 differ from the grid-search
# optimum reported above (150 and 1) — confirm this is intentional.
model = XGBClassifier(
    n_estimators=200,
    eta=0.1,
    gamma=5,
    min_child_weight=2,
    reg_alpha=10,
    reg_lambda=2,
    random_state=42,
)

In [385]:
# Fit the final model on the whole training set; the column transformer is
# fitted here too and is reused via ct.transform in the following cells.
model.fit(ct.fit_transform(x_train), y_train.reshape(-1))

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.1,
              eval_metric=None, feature_types=None, gamma=5, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.100000001,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=2,
              missing=nan, monotone_constraints='()', n_estimators=200,
              n_jobs=0, num_parallel_tree=1, predictor='auto', ...)

In [344]:
# Accuracy on the training data itself (the same x_train the model was fitted
# on), so this is not a held-out validation score.
np.average(model.predict(ct.transform(x_train))==y_train.reshape(-1))

0.8148428625520636

In [386]:
# Sanity check: predict labels for the encoded test set and eyeball the result.
model.predict(ct.transform(x_test))

array([0, 1, 1, ..., 1, 1, 0])

In [389]:
# Write the test-set predictions to XGBOOST_result.csv in files_path, then
# upload that file to the competition through the kaggle command-line tool.
write_result(model, ct.transform(x_test), 'XGBOOST_result')
!kaggle competitions submit -c advanced-dls-spring-2021 -f {files_path}{os.sep}XGBOOST_result.csv -m "XGBOOST"

In [395]:
#how much give all zeros
result = pd.DataFrame(np.zeros(1761), index=list(range(1761)), columns=['Churn'])
result.to_csv(f'{files_path}{os.sep}zeros.csv', sep=',', index = True, index_label='Id')
!kaggle competitions submit -c advanced-dls-spring-2021 -f {files_path}{os.sep}zeros.csv -m "zeros"