In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
#cv = StratifiedShuffleSplit(n_splits=4, test_size=0.2, random_state=42)
from sklearn.model_selection import cross_val_score

In [None]:
# Read both competition files; the literal -1 is parsed as a missing value.
df_train = pd.read_csv('../input/train.csv', na_values="-1")
df_test = pd.read_csv('../input/test.csv', na_values="-1")

# Set the label and the submission ids aside before their columns are removed.
y_train = df_train['target']
id_test = df_test['id'].values

# Per-row count of missing cells, measured while every original column is
# still present (i.e. before anything is dropped below).
nan_count_train = df_train.isnull().sum(axis=1).tolist()
nan_count_test = df_test.isnull().sum(axis=1).tolist()

# '_calc' columns listed here survive the drop.
calc_retr = ['ps_calc_01']

drop_col = []
for col in df_train.columns:
    if '_calc' in col and col not in calc_retr:
        drop_col.append(col)
drop_col += ["id", "ps_car_05_cat"]
print(drop_col)

# The training frame additionally loses its label column.
df_train = df_train.drop(columns=drop_col + ['target'])
df_test = df_test.drop(columns=drop_col)

# Attach the missing-value counts computed above as a feature.
df_train['nan_count'] = nan_count_train
df_test['nan_count'] = nan_count_test

In [None]:
def gini(y, pred):
    """Return the (un-normalized) Gini score of `pred` as a ranking of `y`.

    Rows are ordered by descending prediction; ties keep their original
    order. The score is the mean cumulative share of `y` along that ordering,
    shifted by ``(n + 1) / 2`` and divided by ``n``.

    Parameters
    ----------
    y : sequence of numbers
        Observed targets.
    pred : sequence of numbers
        Predicted scores, same length as `y`.

    Returns
    -------
    float
        The Gini score. It is NaN when ``sum(y)`` is zero, because the
        cumulative share is then undefined.
    """
    # np.float64 replaces the `np.float` alias, which no longer exists in
    # current NumPy releases; both denote the same 64-bit float type.
    g = np.asarray(np.c_[y, pred, np.arange(len(y))], dtype=np.float64)
    # lexsort uses the LAST key as primary: descending prediction first,
    # original position as the tie-breaker.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_normalized(actual, predictions):
    """Gini of `predictions` divided by the Gini of a perfect ranking.

    The denominator is ``gini(actual, actual)``: the score obtained when the
    targets themselves are used as the ranking.
    """
    achieved = gini(actual, predictions)
    best_possible = gini(actual, actual)
    return achieved / best_possible

def gini_lgb(preds, dtrain):
    """Custom evaluation function returning the normalized Gini score.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of `dtrain`.
    dtrain : object exposing ``get_label()``
        Dataset whose labels are compared against `preds`.

    Returns
    -------
    tuple
        ``('gini', score, True)``; the trailing flag marks the metric as
        "higher is better".
    """
    labels = list(dtrain.get_label())
    normalized = gini(labels, preds) / gini(labels, labels)
    return ('gini', normalized, True)

from sklearn.metrics import make_scorer
# scikit-learn scorer around gini_normalized: higher is better, and the metric
# is evaluated on predicted probabilities rather than on class labels.
# NOTE(review): in the visible cells this scorer is only referenced from
# commented-out cross_val_score / GridSearchCV calls.
# NOTE(review): newer scikit-learn releases deprecate `needs_proba` in favour
# of `response_method` - confirm against the installed version.
scorer_gini = make_scorer(gini_normalized, greater_is_better=True, needs_proba=True)

In [None]:
# Column groups are derived from the feature naming scheme: names containing
# 'bin' are binary, names containing '_cat' are categorical, and everything
# else is treated as continuous.
bin_cols = [col for col in df_train.columns if 'bin' in col]
cat_cols = [col for col in df_train.columns if '_cat' in col]
_grouped = set(bin_cols) | set(cat_cols)
con_cols = [col for col in df_train.columns if col not in _grouped]
print(cat_cols)

# '_cat' columns are deliberately NOT imputed here; their NaNs are left for
# the encoding step that follows.

# Fill binary columns with the column mode and continuous columns with the
# column mean. The result is assigned back to the frame explicitly:
# `frame[col].fillna(..., inplace=True)` is chained in-place assignment, which
# acts on a temporary Series and leaves the frame unfilled once pandas
# Copy-on-Write is active.
# NOTE(review): each frame is filled with its OWN statistics (test uses the
# test mode/mean), exactly as before - confirm this is intended rather than
# reusing the training statistics for the test frame.
for frame in (df_train, df_test):
    for col in bin_cols:
        frame[col] = frame[col].fillna(frame[col].mode()[0])
    for col in con_cols:
        frame[col] = frame[col].fillna(frame[col].mean())

def make_dummies(columns):
    """One-hot encode `columns` in the global `df_train` and `df_test`.

    Each listed column is replaced by indicator columns named
    ``<column>_<level>``, appended after the untouched columns in the order
    given by `columns`. Both globals are rebound to the encoded frames.

    The test frame is then aligned to the training layout: a level that only
    occurs in the training data becomes an all-zero column in test, and a
    level that only occurs in test is dropped. Previously the two frames were
    encoded independently, so such levels left them with different columns.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to encode; each must exist in both frames.
    """
    global df_train
    global df_test

    columns = list(columns)
    if not columns:
        # Nothing to encode; leave both frames as they are.
        return

    df_train = pd.get_dummies(df_train, columns=columns)
    df_test = pd.get_dummies(df_test, columns=columns)

    # Same columns, same order as the training frame.
    # NOTE(review): matching relies on a level being rendered identically in
    # both frames (e.g. 1.0 vs 1 would not match) - confirm dtypes agree.
    df_test = df_test.reindex(columns=df_train.columns, fill_value=0)
        
# Replace every '_cat' column with indicator columns in both frames
# (make_dummies rebinds the global df_train / df_test).
make_dummies(cat_cols)

In [None]:
#df_train['sum_bin_ind'] = df_train['ps_ind_06_bin']+df_train['ps_ind_07_bin']+df_train['ps_ind_08_bin']+df_train['ps_ind_09_bin']+df_train['ps_ind_10_bin']+df_train['ps_ind_11_bin']+df_train['ps_ind_12_bin']+df_train['ps_ind_13_bin']+df_train['ps_ind_16_bin']+df_train['ps_ind_17_bin']+df_train['ps_ind_18_bin']
#df_test['sum_bin_ind'] = df_test['ps_ind_06_bin']+df_test['ps_ind_07_bin']+df_test['ps_ind_08_bin']+df_test['ps_ind_09_bin']+df_test['ps_ind_10_bin']+df_test['ps_ind_11_bin']+df_test['ps_ind_12_bin']+df_test['ps_ind_13_bin']+df_test['ps_ind_16_bin']+df_test['ps_ind_17_bin']+df_test['ps_ind_18_bin']

#df_train['ps_car_13_x_ps_reg_03'] = df_train['ps_car_13'] * df_train['ps_reg_03']
#df_test['ps_car_13_x_ps_reg_03'] = df_test['ps_car_13'] * df_test['ps_reg_03']

#df_train['sum_car'] = df_train['ps_car_11']+df_train['ps_car_12']+df_train['ps_car_13']+df_train['ps_car_14']+df_train['ps_car_15']
#df_test['sum_car'] = df_test['ps_car_11']+df_test['ps_car_12']+df_test['ps_car_13']+df_test['ps_car_14']+df_test['ps_car_15']

#df_train['sum_reg'] = df_train['ps_reg_01']*df_train['ps_reg_02']*df_train['ps_reg_03']
#df_test['sum_reg'] = df_test['ps_reg_01']*df_test['ps_reg_02']*df_test['ps_reg_03']


# Report (rows, columns) for the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Cut the training rows into two consecutive folds at a fixed row position.
part1, part2 = df_train.iloc[:297606], df_train.iloc[297606:]
part1_y, part2_y = y_train.iloc[:297606], y_train.iloc[297606:]

# Model fitted on all training rows; yields class probabilities for the test set.
lr = LogisticRegression(C=1.5).fit(df_train, y_train)
predictions_lr = lr.predict_proba(df_test)

# One model per fold, each meant to score the opposite fold.
# Only lr2's predictions (for part1) are computed; lr1 is fitted but not used
# to predict, and none of these predictions is added to the frames as an
# 'LR' feature at the moment.
lr1 = LogisticRegression(C=1.5).fit(part1, part1_y)
lr2 = LogisticRegression(C=1.5).fit(part2, part2_y)
predictions_lr2 = lr2.predict_proba(part1)

In [None]:
from sklearn.preprocessing import StandardScaler

for frame in (df_train, df_test):
    print(frame.shape)

# [row index, column index] of each frame, reused to rebuild labelled frames
# from the plain arrays the scaler returns.
indexes_train = df_train.axes
indexes_test = df_test.axes
train_rows, train_cols = indexes_train
test_rows, test_cols = indexes_test

# The scaler is fitted on the training frame only and then applied, with the
# same fitted statistics, to the test frame.
scaler_tr = StandardScaler()
scaled_train = scaler_tr.fit_transform(df_train, y_train)
X_train = pd.DataFrame(scaled_train, index=train_rows, columns=train_cols)

scaled_test = scaler_tr.transform(df_test)
X_test = pd.DataFrame(scaled_test, index=test_rows, columns=test_cols)

for frame in (X_train, X_test):
    print(frame.shape)

X_train.head()

In [None]:
from sklearn.cluster import KMeans

# Cluster-membership features.
#
# A single model, fitted on the training frame, labels BOTH frames. Cluster
# ids are arbitrary per fit, so labels produced by separately fitted models
# (one per fold plus one for test, as done previously) are not comparable:
# 'k2_3' would mean a different cluster in train than in test. The earlier
# fold scheme also appended the part2 predictions before the part1
# predictions, attaching every label to a row in the wrong half.
# KMeans never sees the target, so labelling the rows it was fitted on does
# not leak label information.
#
# `n_jobs` is not passed because current scikit-learn releases no longer
# accept it; `n_init` and `random_state` are set explicitly so the clustering
# is reproducible across runs and library versions.
k2 = KMeans(n_clusters=6, n_init=10, random_state=99)
k2.fit(X_train)

pred_train_km = k2.predict(X_train)
predictions_k2 = k2.predict(X_test)
print(predictions_k2.shape)
print(pred_train_km.shape)

# One indicator column per cluster ('k2_<id>'), appended after the scaled
# features. The test frame is aligned to the training columns so a cluster
# that receives no test rows still gets an all-zero column.
X_train = pd.get_dummies(X_train.assign(k2=pred_train_km), columns=['k2'])
X_test = pd.get_dummies(X_test.assign(k2=predictions_k2), columns=['k2'])
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

print(X_train.shape)
print(X_test.shape)
X_train.head()

In [None]:
#scores = cross_val_score(rf1, X_train, y_train, cv=5, scoring=scorer_gini, n_jobs=-1)
#scores
#scores.mean()

#grid = GridSearchCV(rf1, param_grid=param_grid, cv=cv, n_jobs=-1, scoring=scorer_gini, verbose=10)
#grid.fit(X_train, y_train)

#parameters = grid.best_params_
#print('Best score: {}'.format(grid.best_score_))
#print('Best parameters: {}'.format(grid.best_params_))

# Parameter dictionary handed to lgb.train.
params = {
    # model / objective
    'learning_rate': 0.01,
    'max_depth': 6,
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'is_training_metric': False,
    'seed': 99,
    # feature / row sampling
    'feature_fraction': 0.3,
    'bagging_fraction': 0.9,
    'bagging_freq': 10,
    # remaining settings
    'min_data': 600,
    'num_threads': 2,
    'scale_pos_weight': 1.1,
}
from sklearn.model_selection import train_test_split

# Hold out 25% of the training rows as a validation set (fixed seed).
x1, x2, y1, y2 = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=99,
)

In [None]:
# Train on (x1, y1) for up to 3000 rounds, evaluating on (x2, y2) with both
# the built-in metric from `params` and the custom gini_lgb metric.
#
# Logging every 100 rounds and early stopping after 200 rounds without
# improvement are passed as callbacks: current LightGBM releases no longer
# accept the `verbose_eval` / `early_stopping_rounds` keywords on train().
# Older releases name the logging callback `print_evaluation`, so fall back
# to that name when `log_evaluation` is absent.
_log_every = getattr(lgb, 'log_evaluation', None) or getattr(lgb, 'print_evaluation')
lgb1 = lgb.train(
    params,
    lgb.Dataset(x1, label=y1),
    num_boost_round=3000,
    valid_sets=[lgb.Dataset(x2, label=y2)],
    feval=gini_lgb,
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        _log_every(period=100),
    ],
)

In [None]:
# Score the test frame, limiting the model to lgb1.best_iteration rounds
# (training above was run with early stopping).
predictions_lgb = lgb1.predict(X_test, num_iteration=lgb1.best_iteration)

In [None]:
from subprocess import check_output

# Show what is available in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Submission frame: one row per test id with its predicted value.
sub1 = pd.DataFrame({'id': id_test, 'target': predictions_lgb})

print(sub1.head())

In [None]:
# Write the submission to lgb1.csv with a header row and without the index column.
sub1.to_csv('lgb1.csv', index=False, header=True)

In [None]:
# Summary statistics of the predicted values, as a quick sanity check.
pd.DataFrame(predictions_lgb).describe()

In [None]:
# NOTE(review): `title` is not read by any visible cell, and it mentions
# logistic regression although the object printed below is the LightGBM
# model - presumably a leftover; confirm before removing.
title = "Learning Curves (Logistic Regression)"
# Print the trained LightGBM model object.
print(lgb1)

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib
# Use the 'ggplot' matplotlib style for the figure below.
matplotlib.style.use('ggplot')

# Feature-importance chart of the trained model, limited to 20 features.
lgb.plot_importance(lgb1, max_num_features=20)