In [41]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score,KFold,ShuffleSplit,StratifiedKFold
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier,BaggingClassifier,ExtraTreesClassifier
from sklearn.metrics import r2_score,roc_auc_score,accuracy_score,confusion_matrix
import matplotlib.pyplot as plt
from mlxtend.classifier import StackingCVClassifier
import lightgbm as lgb 
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute

%matplotlib inline
import seaborn as sns
import xgboost as xgb
from scipy.stats import pearsonr

In [42]:
def run(X,y,test,params,num_boost_round=99):
    """Train an XGBoost booster on (X, y) and score ``test`` with it.

    Parameters
    ----------
    X : training features, wrapped in an ``xgb.DMatrix``.
    y : training labels, passed as ``label`` of the training ``DMatrix``.
    test : features to predict on, wrapped in its own ``xgb.DMatrix``.
    params : booster parameters forwarded unchanged to ``xgb.train``.
    num_boost_round : number of boosting rounds. Defaults to 99, the value
        that was previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    (bst, preds) : the trained booster and the output of ``bst.predict``
        on the test ``DMatrix``.
    """
    dtrain=xgb.DMatrix(X,label=y)
    dtest=xgb.DMatrix(test)
    bst=xgb.train(params,dtrain,num_boost_round)
    preds=bst.predict(dtest)
    return bst,preds

# Booster settings passed to run(): binary logistic objective,
# learning rate (eta) 0.11 and a maximum tree depth of 9.
params=dict(
    objective='binary:logistic',
    eta=0.11,
    max_depth=9,
)
# Further settings, currently disabled:
#     'min_child_weight':4,
#     'gamma':0.1,
#     'subsample':0.8,
#     'colsample_bytree':0.7,
#     'scale_pos_weight':1,
#     'reg_alpha':1e-05

In [43]:
# Show up to 100 columns when a DataFrame is displayed. The option is spelled
# out in full: the bare 'max_columns' pattern is ambiguous on pandas versions
# that also define 'styler.render.max_columns', and set_option then raises
# OptionError.
pd.set_option('display.max_columns',100)

In [44]:
# Directory holding the input CSV files; change it here instead of editing
# every read_csv call.
DATA_DIR='/Users/shashank/Downloads/Societe2'

train=pd.read_csv('%s/train.csv'%DATA_DIR)
test=pd.read_csv('%s/test.csv'%DATA_DIR)
sample=pd.read_csv('%s/sample_submissions.csv'%DATA_DIR)

In [208]:
# Drop incomplete rows from the training data only. Test rows are kept: the
# submission cell reads test['transaction_id'], so any row removed from test
# here would be missing from the submission.
# NOTE(review): on a top-to-bottom run this cell executes before the fillna
# cell, so training rows with a missing category are dropped rather than
# imputed -- confirm that is intended.
train=train.dropna(axis=0)

In [45]:
# Replace missing categories with a fixed code per column. fillna is called on
# the frame with a {column: value} mapping instead of df[col].fillna(...,
# inplace=True): the chained in-place form operates on a temporary Series and
# does not update the frame under pandas Copy-on-Write.
# NOTE(review): cat_var_6 is filled for test only, as before -- presumably
# train has no missing values in that column; confirm.
train=train.fillna({'cat_var_1':'gf','cat_var_3':'qt','cat_var_8':'dn'})
test=test.fillna({'cat_var_1':'gf','cat_var_3':'qt','cat_var_6':'zs','cat_var_8':'dn'})

In [218]:
# Number of distinct values in cat_var_1 within train.
train['cat_var_1'].nunique()

531

In [220]:
# Number of distinct values in cat_var_3 within train.
train['cat_var_3'].nunique()

469

In [210]:
# Stack train on top of test so both are encoded in a single pass.
df=pd.concat((train,test),axis=0)

In [211]:
# One-hot encode the selected categorical columns of the combined frame.
onehot_cols=[
    'cat_var_4','cat_var_5','cat_var_9','cat_var_11','cat_var_12',
    'cat_var_15','cat_var_16','cat_var_17','cat_var_18',
]
to_add=pd.get_dummies(df[onehot_cols])

In [212]:
# Append the dummy columns to the right of the combined frame.
df=pd.concat((df,to_add),axis='columns')

In [213]:
# Remove the raw categorical columns that now have one-hot equivalents.
df.drop(
    columns=[
        'cat_var_4','cat_var_5','cat_var_9','cat_var_11','cat_var_12',
        'cat_var_15','cat_var_16','cat_var_17','cat_var_18',
    ],
    inplace=True,
)

In [214]:
# Split the combined frame back into train and test. The boundary is captured
# once, before train is reassigned, so the second slice does not depend on the
# first assignment. Copies are taken so that later in-place edits of train and
# test (e.g. dropping cat_var_8) act on independent frames instead of slices
# of df, which would raise SettingWithCopyWarning.
n_train=len(train)
train=df.iloc[:n_train,:].copy()
test=df.iloc[n_train:,:].copy()

In [49]:
# Preview the first rows only instead of rendering the whole frame.
train.head(10)

Unnamed: 0,cat_var_11,cat_var_12,cat_var_15,cat_var_16,cat_var_17,cat_var_18,cat_var_19,cat_var_20,cat_var_21,cat_var_22,cat_var_23,cat_var_24,cat_var_25,cat_var_26,cat_var_27,cat_var_28,cat_var_29,cat_var_30,cat_var_31,cat_var_32,cat_var_33,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_4,cat_var_40,cat_var_41,cat_var_42,cat_var_5,cat_var_8,cat_var_9,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,target,transaction_id,popularity_cat_var_1,popularity_cat_var_2,popularity_cat_var_3,popularity_cat_var_7,popularity_cat_var_6,popularity_cat_var_10,popularity_cat_var_13,popularity_cat_var_14
0,iq,ep,ep,tn,tn,ep,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,ep,0,0,0,tn,dn,tn,2.302632e-08,0.040182,0.0,1.800000e-07,2.302632e-08,2.368421e-08,1.115205e-08,0.0,id_11,365237,582056,20117,871363,14198,26768,22321,620874
1,iq,iq,tn,tn,tn,tn,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,tn,0,0,0,ep,dn,hr,7.965789e-06,0.157872,0.0,2.105000e-06,2.769737e-07,7.965789e-06,2.433058e-06,0.0,id_33,253029,258183,8585,871363,252763,32032,261510,620874
2,ce,tn,ep,tn,tn,ep,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,tn,0,0,0,ep,dn,hr,7.828947e-08,0.089140,0.0,3.550000e-07,4.671053e-08,1.052632e-07,4.276014e-07,0.0,id_51,365237,582056,3605,871363,352425,80714,332353,620874
3,iq,ep,ep,tn,tn,ep,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,ep,0,0,0,ep,tk,ep,7.894737e-08,0.227239,0.0,1.050000e-06,1.381579e-07,2.190789e-07,1.848054e-08,0.0,id_54,365237,582056,2354,871363,4955,26156,14004,29756
4,hr,iq,tn,tn,tn,tn,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,tn,0,0,0,ep,dn,ep,3.321053e-06,0.160410,0.0,2.105000e-06,2.769737e-07,3.340789e-06,2.152983e-06,0.0,id_62,253029,258183,260,871363,252763,43239,261510,620874
5,tn,tn,ep,tn,tn,ep,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,tn,0,0,0,tn,dn,tn,1.953947e-07,0.078566,0.0,3.550000e-07,4.671053e-08,4.407895e-08,9.463310e-08,0.0,id_67,365237,582056,184452,871363,352425,77365,332353,620874
6,tn,tn,ep,tn,ep,tn,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,ep,0,0,0,ep,hn,ce,1.736842e-07,0.600508,0.0,4.575000e-06,5.986842e-07,1.736842e-07,8.411831e-08,0.0,id_71,1796,582056,184452,871363,1789,77365,2568,33173
7,tn,iq,tn,tn,ep,tn,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,tn,0,0,0,ep,dn,iq,5.789474e-08,0.152374,0.0,2.105000e-06,2.769737e-07,5.789474e-08,2.453451e-08,0.0,id_88,253029,258183,2292,871363,252763,28859,261510,620874
8,iq,ce,ep,ep,tn,tn,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,ep,0,0,0,tn,jo,tn,3.289474e-08,0.093053,0.0,7.500000e-07,8.223684e-08,9.868421e-08,1.274520e-08,0.0,id_95,49788,582056,184452,871363,2386,38575,53281,67241
9,hr,iq,tn,tn,ep,tn,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,tn,0,0,0,ep,dn,tn,5.500000e-07,0.158401,0.0,2.105000e-06,2.769737e-07,5.500000e-07,3.138505e-07,0.0,id_97,253029,258183,3308,871363,252763,32032,261510,620874


In [47]:
def to_popularity(col,train,test):
    """Replace categorical column ``col`` by how often each value occurs.

    Counts are taken over train and test stacked together. A new column
    ``popularity_<col>`` holds the count for each row's value and the
    original ``col`` is removed.

    Parameters
    ----------
    col : name of the column to convert; must exist in both frames.
    train, test : DataFrames; neither is modified.

    Returns
    -------
    (train, test) : independent copies, split at the original length of
        ``train``, with row order and index labels preserved.

    Missing values in ``col`` get a missing popularity: ``value_counts``
    leaves NaN out, so a plain dictionary lookup would raise KeyError.
    """
    # Captured up front so the split does not depend on reassignment order.
    n_train=len(train)
    df=pd.concat([train,test])
    counts=df[col].value_counts()
    # map() looks each value up in counts and yields NaN when it is absent.
    df['popularity_%s'%col]=df[col].map(counts)
    df=df.drop(columns=[col])
    # Copies, so callers can edit the results in place without
    # SettingWithCopyWarning.
    return df.iloc[:n_train,:].copy(),df.iloc[n_train:,:].copy()

In [48]:
# Convert each high-cardinality categorical column to its popularity count.
# The order is kept as-is because it fixes the order of the new columns.
for col in ('cat_var_1','cat_var_2','cat_var_3','cat_var_7',
            'cat_var_6','cat_var_10','cat_var_13','cat_var_14'):
    train,test=to_popularity(col,train,test)

In [154]:
# cat_10={}
# for i in train.cat_var_10.unique():
#     pi=float(len(train[train['cat_var_10']==i][train['target']==1]))
#     ni=float(len(train[train['cat_var_10']==i][train['target']==0]))
#     cat_10[i]=((pi)/(pi+ni))

In [155]:
# train['cat_var_10']=train['cat_var_10'].apply(lambda x: cat_10[x])
# test['cat_var_10']=test['cat_var_10'].apply(lambda x: cat_10[x])

In [156]:
# cat_14={}
# for i in train.cat_var_14.unique():
#     pi=float(len(train[train['cat_var_14']==i][train['target']==1]))
#     ni=float(len(train[train['cat_var_14']==i][train['target']==0]))
#     cat_14[i]=((pi)/(pi+ni))

In [157]:
# train['cat_var_14']=train['cat_var_14'].apply(lambda x: cat_14[x])
# test['cat_var_14']=test['cat_var_14'].apply(lambda x: cat_14[x])

In [158]:
# cat_6={}
# for i in train.cat_var_6.unique():
#     pi=float(len(train[train['cat_var_6']==i][train['target']==1]))
#     ni=float(len(train[train['cat_var_6']==i][train['target']==0]))
#     cat_6[i]=((pi)/(pi+ni))

In [159]:
# train['cat_var_6']=train['cat_var_6'].apply(lambda x: cat_6[x])
# test['cat_var_6']=test['cat_var_6'].apply(lambda x: cat_6[x])

In [160]:
# cat_var_8 is not used as a feature. The frames are rebound rather than
# edited in place: train and test may be slices of a larger frame, where an
# in-place drop raises SettingWithCopyWarning.
train=train.drop(columns=['cat_var_8'])
test=test.drop(columns=['cat_var_8'])

In [229]:
# Every column except the label and the row identifier is a model input.
non_features=('target','transaction_id')
predictors=[name for name in train.columns if name not in non_features]
target=['target']

In [163]:
# Fit the booster on the training features and score the test features.
X_train=train[predictors].values
y_train=train[target].values
X_test=test[predictors].values
bst,xgpreds=run(X_train,y_train,X_test,params)

In [39]:
# Build the submission frame: one row per test transaction with its score.
# NOTE(review): f is only assigned in the blending cells further down, so on
# a fresh top-to-bottom run this cell raises NameError.
submission_columns=['transaction_id','target']
submission_data={
    'transaction_id':test['transaction_id'],
    'target':f,
}
subs=pd.DataFrame(submission_data,columns=submission_columns)

In [40]:
# Write the submission to a numbered file, then advance the counter so the
# next run does not overwrite it.
# NOTE(review): z is not initialised in any visible cell -- confirm where it
# is set.
submission_path='/Users/shashank/Downloads/Societe2/neat%d.csv'%z
subs.to_csv(submission_path,index=False)
z+=1

In [32]:
# Earlier submissions that feed the blend; trailing numbers are the scores
# noted for each file.
one,sec,thi,fou=[pd.read_csv(path) for path in (
    '/Users/shashank/Downloads/7363.csv',
    '/Users/shashank/Downloads/Societe2/neat3.csv', ##
    '/Users/shashank/Downloads/Societe2/47.csv', ## 0.73683
    '/Users/shashank/Downloads/Societe2/67.csv', ## 0.73653
)]
# fif=pd.read_csv('/Users/shashank/Downloads/Societe2/72.csv')

In [190]:
# Two previously written blended submissions.
n4,n6=[pd.read_csv(path) for path in (
    '/Users/shashank/Downloads/Societe2/neat4.csv',
    '/Users/shashank/Downloads/Societe2/neat6.csv',
)]

In [195]:
# Correlation between the neat4 scores and the current submission's scores.
n4_scores=n4['target']
current_scores=subs['target']
pearsonr(n4_scores,current_scores)

(0.99997794072406809, 0.0)

In [38]:
# Weighted blend of four earlier submissions; the weights sum to 1.
f=(
    0.14*one['target']
    +0.25*sec['target']
    +0.36*thi['target']
    +0.25*fou['target']
)

In [196]:
# Scratch check that a candidate set of percentage weights adds up to 100.
sum((22,22,22,34))

100

In [191]:
# Equal-weight blend of neat4 and neat6; this replaces any earlier value of f.
half=0.5
f=half*n4['target']+half*n6['target']

In [None]:
## neat4 -- 0.73711: avg of neat3 (all one-hots and only popular) and 7363.csv
## neat5 -- 0.73718: avg of neat3 (all one-hots and only popular) and 72.csv
## neat9 -- 0.73727: avg of 7363.csv, neat3, 47.csv, 67.csv

In [230]:
# Preview the feature columns for the first rows only instead of rendering
# the whole frame.
train[predictors].head(10)

Unnamed: 0,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,cat_var_1,cat_var_2,cat_var_3,cat_var_4,cat_var_5,cat_var_6,cat_var_7,cat_var_8,cat_var_9,cat_var_10,cat_var_11,cat_var_12,cat_var_13,cat_var_14,cat_var_15,cat_var_16,cat_var_17,cat_var_18,cat_var_19,cat_var_20,cat_var_21,cat_var_22,cat_var_23,cat_var_24,cat_var_25,cat_var_26,cat_var_27,cat_var_28,cat_var_29,cat_var_30,cat_var_31,cat_var_32,cat_var_33,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_40,cat_var_41,cat_var_42
0,2.302632e-08,0.040182,0.0,1.800000e-07,2.302632e-08,2.368421e-08,1.115205e-08,,ce,db,ep,tn,mm,ep,,tn,db,iq,ep,ip,db,ep,tn,tn,ep,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7.965789e-06,0.157872,0.0,2.105000e-06,2.769737e-07,7.965789e-06,2.433058e-06,da,tn,zl,tn,ep,hm,ep,,hr,qt,iq,iq,hm,db,tn,tn,tn,tn,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,7.828947e-08,0.089140,0.0,3.550000e-07,4.671053e-08,1.052632e-07,4.276014e-07,gf,ce,gs,tn,ep,zs,ep,dn,hr,mm,ce,tn,hr,db,ep,tn,tn,ep,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,7.894737e-08,0.227239,0.0,1.050000e-06,1.381579e-07,2.190789e-07,1.848054e-08,,ce,fy,ep,ep,qw,ep,tk,ep,es,iq,ep,ce,tn,ep,tn,tn,ep,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,3.321053e-06,0.160410,0.0,2.105000e-06,2.769737e-07,3.340789e-06,2.152983e-06,da,tn,zn,tn,ep,hm,ep,,ep,xy,hr,iq,hm,db,tn,tn,tn,tn,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1.953947e-07,0.078566,0.0,3.550000e-07,4.671053e-08,4.407895e-08,9.463310e-08,gf,ce,qt,tn,tn,zs,ep,dn,tn,td,tn,tn,hr,db,ep,tn,tn,ep,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1.736842e-07,0.600508,0.0,4.575000e-06,5.986842e-07,1.736842e-07,8.411831e-08,hn,ce,qt,ep,ep,ts,ep,hn,ce,td,tn,tn,db,td,ep,tn,ep,tn,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,5.789474e-08,0.152374,0.0,2.105000e-06,2.769737e-07,5.789474e-08,2.453451e-08,da,tn,xp,tn,ep,hm,ep,,iq,ip,tn,iq,hm,db,tn,tn,ep,tn,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,3.289474e-08,0.093053,0.0,7.500000e-07,8.223684e-08,9.868421e-08,1.274520e-08,pu,ce,,ep,tn,jt,ep,jo,tn,rv,iq,ce,es,ep,ep,ep,tn,tn,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,5.500000e-07,0.158401,0.0,2.105000e-06,2.769737e-07,5.500000e-07,3.138505e-07,da,tn,tl,tn,ep,hm,ep,,tn,qt,hr,iq,hm,db,tn,tn,ep,tn,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
