In [1]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier,ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Reading input train and test data
train = pd.read_csv("train.csv")
test=pd.read_csv("test.csv")

# Fill NA values
train=train.fillna(value=-10)
test=test.fillna(value=-10)

# get response variable
target=train['Active_Customer']
train.drop(['Active_Customer','Cust_id'],axis=1,inplace=True)

# saving test id
testID=test['Cust_id']
test.drop(['Cust_id'],axis=1,inplace=True)

# Checking test and train descriptions
#train.describe()
#test.describe()

# collecting the number of train instances
num_train=train.shape[0]

# concatenate train and test to a single object
df=pd.concat([train,test])

#df.describe()

# Converting dtype of character objects to categorical values
for column in df:
    if str(df[column].dtype)=='object':
        print(column,str(df[column].dtype))
        df[column] = df[column].astype('category')       

('Cust_status', 'object')
('Trans24', 'object')
('Trans25', 'object')
('Trans26', 'object')
('Trans27', 'object')


In [2]:
# Dummy encoding for the categorical variables
# drop_first not set for 0.17 version of pandas
df=pd.get_dummies(df,drop_first=True)
df.describe()

# Fill NA values
df=df.fillna(value=-10)


# Collecting the promotion columns
promotion_index=[]
id=0
for column in df:
    if column.find('Promotion')!=-1:
        promotion_index.append(id)
    id=id+1
print id
print promotion_index

255
[202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249]


In [3]:
# Functions,code for feature engineering

def getCounts(x):
    count=0
    for i in x:
        if i==0:
            count=count+1
    return count

def naCounts(x):
    count=0
    for i in x:
        if i==-10:
            count=count+1
    return count

def getnum_promotions(x,promotion_index):
    count=0
    for i in promotion_index:
        if x[i]!=-10 and x[i]>0:
            count=count+1
    return count

def getsum_promotions(x,promotion_index):
    sum=0
    for i in promotion_index:
        if x[i]!=-10 and x[i]>0:
            sum=sum+x[i]
    return sum

#sum of zeros across the instances
df['zeroCounts']=df.apply(getCounts,axis=1)



#sum of NAs across the instances
df['negativeCounts']=df.apply(naCounts,axis=1)


#sum of promotions across the instances
df['promotionCounts']=df.apply(lambda x : getnum_promotions(x,promotion_index),axis=1)


#totalnumber of promotions not equal to zeros
df['sum_promotionCounts'] =df.apply(lambda x : getsum_promotions(x,promotion_index),axis=1)




In [4]:
df['sum_promotionCounts'].describe()

count    36808.000000
mean         0.747530
std          1.356719
min          0.000000
25%          0.000000
50%          0.000000
75%          1.030000
max         16.200000
Name: sum_promotionCounts, dtype: float64

In [8]:
# Training a extra trees classifier
from sklearn import metrics
from sklearn import cross_validation
from sklearn.preprocessing import MinMaxScaler
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

et = ExtraTreesClassifier(n_estimators=900,max_features= 80,criterion= 'entropy',min_samples_split= 2,
                          max_depth= 60, min_samples_leaf= 2,random_state=1,
                          verbose=1,n_jobs = 3)

gbt = GradientBoostingClassifier(loss='deviance',learning_rate=0.02, subsample=0.9, 
                               random_state=1,max_depth=6, n_estimators=160,verbose=1)

rf = RandomForestClassifier(n_estimators=1000, n_jobs=3,max_features=90,max_depth=80,criterion='entropy',verbose=1)

etr= ExtraTreesRegressor(n_estimators=700,max_features= 70,criterion= 'mse',min_samples_split= 2,
                          max_depth= 60, min_samples_leaf= 2,random_state=2,
                          verbose=1,n_jobs = 3)

In [None]:
# Grid Search block for each classifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

# param_grid = {"max_features": [90],
#               "criterion":['entropy'],
#               "max_depth":[80]
#               }
param_grid = {"subsample": [1],
              "max_depth":[7,8]
              }


# run grid search
grid_search = GridSearchCV(gbt, param_grid=param_grid,scoring='accuracy',cv=5)
#start = time()
grid_search.fit(df.iloc[0:num_train,],target)
for i in grid_search.grid_scores_:
    print i

In [None]:
#Check cross validation for single model

# predicted = cross_validation.cross_val_predict(etr,df.iloc[0:num_train,],target, cv=5)
# predicted=[i/predicted.max() for i in predicted]
# for i in range(0,len(predicted)):
#     if predicted[i]>0.5:
#         predicted[i]=1
#     else:
#         predicted[i]=0
# print metrics.accuracy_score(predicted,target)
scores=cross_validation.cross_val_score(gbt,df.iloc[0:num_train,],target,cv=5,scoring='accuracy')
print scores
print("Accuracy: %f (+/- %f)" % (scores.mean(), scores.std() * 2))

In [9]:
clfs=[gbt,et,rf,etr]

# Working on Stacker
print "Creating train and test sets for blending."

dataset_blend_train = np.zeros((train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((test.shape[0], len(clfs)))


# Selecting startified sampling
nfolds=2
skf = list(StratifiedKFold(target,nfolds))

for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((test.shape[0], len(skf)))
        for i, (trainIndex, testIndex) in enumerate(skf):
            print "Fold", i+1
            X_train = df.iloc[trainIndex,]
            y_train = target[trainIndex]
            X_test = df.iloc[testIndex,]
            y_test = target[testIndex]
            clf.fit(X_train, y_train)
            if j!=3:
                y_submission = clf.predict_proba(X_test)[:,1]
            else:
                y_submission = clf.predict(X_test)
                y_submission=[i/y_submission.max() for i in y_submission]
            dataset_blend_train[testIndex, j] = y_submission
            if j!=3:
                dataset_blend_test_j[:, i] = clf.predict_proba(df.iloc[num_train:,])[:,1]
            else:
                y_submission = clf.predict(df.iloc[num_train:,])
                y_submission = [i/y_submission.max() for i in y_submission]
                dataset_blend_test_j[:, i]=y_submission
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)

Creating train and test sets for blending.
0 GradientBoostingClassifier(init=None, learning_rate=0.02, loss='deviance',
              max_depth=6, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=160,
              presort='auto', random_state=1, subsample=0.9, verbose=1,
              warm_start=False)
Fold 1
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3780           0.0074            1.41m
         2           1.3702           0.0071            1.40m
         3           1.3622           0.0055            1.39m
         4           1.3552           0.0068            1.37m
         5           1.3482           0.0060            1.36m
         6           1.3408           0.0054            1.35m
         7           1.3337           0.0046            1.34m
         8           1.3275           0.0050            1.33m
         9           1.321

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    4.9s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   21.7s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:   54.7s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 900 out of 900 | elapsed:  1.8min finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.9s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    1.7s
[Parallel(n_jobs=3)]: Done 900 out of 900 | elapsed:    1.9s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    1.4s
[Parallel(n_jobs=3)]: Done 900 out of 900 | elapsed:    1.6s finished


 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=60, max_features=80, max_leaf_nodes=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=900, n_jobs=3,
           oob_score=False, random_state=1, verbose=1, warm_start=False)
Fold 1
Fold

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    5.3s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   23.2s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:   52.7s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 900 out of 900 | elapsed:  1.8min finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.9s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    1.6s
[Parallel(n_jobs=3)]: Done 900 out of 900 | elapsed:    1.8s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    1.3s
[Parallel(n_jobs=3)]: Done 900 out of 900 | elapsed:    1.5s finished


 2
2

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    8.7s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   40.8s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:  3.8min finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    1.3s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    2.4s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    2.9s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.8s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    1.5s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    2.0s finished


 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=80, max_features=90, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=3,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)
Fold 1
Fold

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    9.7s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   44.9s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:  3.8min finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    1.1s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    1.9s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    2.4s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.9s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    1.6s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    2.2s finished


 2
3

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    5.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   24.8s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:   46.9s
[Parallel(n_jobs=3)]: Done 700 out of 700 | elapsed:  1.2min finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done 700 out of 700 | elapsed:    1.3s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done 700 out of 700 | elapsed:    1.1s finished


 ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=60,
          max_features=70, max_leaf_nodes=None, min_samples_leaf=2,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=700, n_jobs=3, oob_score=False, random_state=2,
          verbose=1, warm_start=False)
Fold 1
Fold

[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    3.9s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   16.8s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:   44.3s
[Parallel(n_jobs=3)]: Done 700 out of 700 | elapsed:  1.2min finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done 700 out of 700 | elapsed:    1.2s finished
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done 700 out of 700 | elapsed:    1.1s finished


 2


In [13]:
clf = LogisticRegression()
scores=cross_validation.cross_val_score(clf,dataset_blend_train,target,cv=5,scoring='accuracy')
print scores
print("Accuracy: %f (+/- %f)" % (scores.mean(), scores.std() * 1))

[ 0.69014358  0.68781529  0.68044237  0.67003106  0.68361801]
Accuracy: 0.682410 (+/- 0.007037)


In [14]:
print "Blending with Logistic Regression"
clf = LogisticRegression()
clf.fit(dataset_blend_train, target)
predicted = clf.predict_proba(dataset_blend_test)[:,1]

Blending with Logistic Regression


In [44]:
predicted=[i/predicted.max() for i in predicted]

In [45]:
pd.Series(predicted).describe()

count    11042.000000
mean         0.552610
std          0.237213
min          0.168935
25%          0.337965
50%          0.512333
75%          0.761002
max          1.000000
dtype: float64

In [15]:
for i in range(0,len(predicted)):
    if predicted[i]>0.5:
        predicted[i]=1
    else:
        predicted[i]=0
submission=pd.DataFrame({'Cust_id':testID,
                        'Active_Customer':pd.Series(predicted)})
submission.to_csv("submission_1.csv", index=False)