In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Normalizer, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
#import eli5

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
def show_confusion_matrix(y_true,y_pred,title='Confusionmatrix'):
    table=confusion_matrix(y_true,y_pred)
    fig,ax=plt.subplots(frameon=False)
    fig.set_size_inches(4,3)
    fig.suptitle(title,fontsize=20)
    ax.axis('off')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)

    the_table=ax.table(cellText=table,
                        colWidths=[0.5]*len([0,1]),
                        rowLabels=['True 0','True 1'],colLabels=['Predicted 0','Predicted 1'],
                        cellLoc='center',rowLoc='center',loc="center")
    the_table.set_fontsize(34)
    the_table.scale(1,4)
    plt.show()

In [4]:
with open('../input/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/site_dic.pkl', 'rb') as input_file:
    site_dict = pickle.load(input_file)

In [5]:
times = ['time%s' % i for i in range(1, 11)]
train_df = pd.read_csv('../input/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/train_sessions.csv',
                       index_col='session_id', parse_dates=times)
test_df = pd.read_csv('../input/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/test_sessions.csv',
                      index_col='session_id', parse_dates=times)

# Sort the data by time
train_df = train_df.sort_values(by='time1')


# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [6]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv('train_sessions_text.txt', 
                                               sep=' ', 
                       index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('test_sessions_text.txt', 
                                              sep=' ', 
                       index=None, header=None)

In [7]:
!head -5 train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


In [8]:
#cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
#cv = TfidfVectorizer(ngram_range=(1, 3), max_df=0.9)
cv = TfidfVectorizer(ngram_range=(1, 3), max_features=25000, binary=True, sublinear_tf=True)

with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)


X_train.shape, X_test.shape

((253561, 25000), (82797, 25000))

In [9]:
X_train.mean(), X_train.min(), X_train.max()

(0.0001294681916051883, 0.0, 1.0)

In [10]:
y_train = train_df['target'].astype('int').values

In [11]:
time_split = TimeSeriesSplit(n_splits=10)

In [12]:
logit2 = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [13]:
#cv_scores = cross_val_score(logit2, X_train, y_train, cv=time_split, 
                            #scoring='roc_auc')

In [14]:
#cv_scores, cv_scores.mean() # 0.869185314739268

In [15]:
def is_alice_day_of_week(x):
    x = x.weekday()
    if x == 0 or x == 1 or x == 3 or x == 4:
      return 1
    return 0


In [16]:
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    aalice_hour = ((hour >= 16) & (hour <= 17)).astype('int')
    
    # added in 27
    day_of_week_all = df['time1'].apply(lambda t: t.weekday()).values.reshape(-1, 1)
    day_of_week = df['time1'].apply(lambda x: is_alice_day_of_week(x)).values.reshape(-1, 1)
    #foo['verisign_ids start8'] = df['site8'].apply(lambda x: is_site(x, verisign_ids))
    
    month = df['time1'].apply(lambda t: t.month).values.reshape(-1, 1)
    #year_month = times['time1'].apply(lambda t: 100 * t.year + t.month).values.reshape(-1, 1) / 1e5
    
    # aalice_hour and most common day
    
    X = hstack([X_sparse,
                morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1),
                evening.values.reshape(-1, 1),
                night.values.reshape(-1, 1),
                #aalice_hour.values.reshape(-1, 1),
                day_of_week,
                #month,
                day_of_week_all,
                #alice_hour.reshape(-1, 1)
                #year_month
                ])
    return X

In [17]:
X_train_new = add_time_features(train_df, X_train)
X_test_new = add_time_features(test_df, X_test)

In [18]:
X_train_new.shape, X_test_new.shape

((253561, 25006), (82797, 25006))

In [19]:
def add_start_month_feature(df, X_sparse):
    foo = pd.DataFrame(index=df.index)
    bar = df['time1'].apply(lambda ts: 100 * ts.year + ts.month).astype('float64')
    foo['scaled_month'] = StandardScaler().fit_transform(bar.values.reshape(-1, 1))
    
    #foo['scaled_month'] = df['time1'].apply(lambda ts: (100 * ts.year + ts.month) / 1e5).astype('float64')
    
    X = hstack([X_sparse, foo])
    return X

In [20]:
X_train_new2 = add_start_month_feature(train_df, X_train_new)
X_test_new2 = add_start_month_feature(test_df, X_test_new)

In [21]:
def add_session_length_feature(df, X_sparse):    
    foo = pd.DataFrame(index=df.index)
    foo['min_seconds'] = df[times].min(axis=1)
    foo['max_seconds'] = df[times].max(axis=1)
    foo['seconds'] = (foo['max_seconds'] - foo['min_seconds']) / np.timedelta64(1, 's')

    foo['scaled_session_duration_seconds'] = MinMaxScaler().fit_transform(foo['seconds'].values.reshape(-1, 1))
    #foo['scaled_session_duration_seconds'] = StandardScaler().fit_transform(foo['seconds'].values.reshape(-1, 1))
    
    #foo['month'] = df['time1'].apply(lambda t: t.month).values.reshape(-1, 1) 
    #foo['day_of_week'] = df['time1'].apply(lambda t: t.weekday()).values.reshape(-1, 1)
    #foo['year_month'] = df['time1'].apply(lambda t: 100 * t.year + t.month).values.reshape(-1, 1) / 1e5
    
    #foo['day_of_week'] = df['time1'].apply(lambda t: t.weekday()).values.reshape(-1, 1)
    #foo['scaled_day_of_week'] = MinMaxScaler().fit_transform(foo['day_of_week'].values.reshape(-1, 1))
    
    # Added in 28
    #foo['lt_48_secs'] = (foo['seconds'] < 48)
    #foo['lt_48_secs'] = foo['lt_48_secs'].astype('float64')

    #foo['gte_48_secs'] = (foo['seconds'] >= 48)
    #foo['gte_48_secs'] = foo['gte_48_secs'].astype('float64')
    
    foo = foo.drop(columns=['min_seconds', 'max_seconds', 'seconds',])
    #foo = foo.drop(columns=['min_seconds', 'max_seconds', 'seconds', 'scaled_session_duration_seconds'])
    
    X = hstack([X_sparse, foo])
    return X
    #return foo

In [22]:
X_train_new3 = add_session_length_feature(train_df, X_train_new2)
X_test_new3 = add_session_length_feature(test_df, X_test_new2)

In [23]:
cv_scores = cross_val_score(logit2, X_train_new3, y_train, cv=time_split, 
                            scoring='roc_auc')

In [24]:
cv_scores, cv_scores.std(), cv_scores.mean()

(array([0.90804806, 0.87704985, 0.86902752, 0.98145326, 0.93079662,
        0.96936264, 0.94564689, 0.95930799, 0.90358909, 0.97097777]),
 0.038285704307677526,
 0.9315259678967515)

In [25]:
facebook_ids = []
youtube_ids = []
google_video_ids = []
nih_ids = []
gmail_ids = []
annotathon_ids = []
bing_ids = []
phylogeny_ids = []
oracle_ids = []
youwatch_ids = []
digi_ids = []
verisign_ids = []

for key in list(site_dict.keys()):
    if 'facebook' in key:
        facebook_ids.append(site_dict[key])
    if 'youtube' in key or 'ytimg' in key:
        youtube_ids.append(site_dict[key])
    if 'googlevideo.com' in key:
        google_video_ids.append(site_dict[key])
    if 'nih.gov' in key:
        nih_ids.append(site_dict[key])
    if 'mail.google.com' in key:
        gmail_ids.append(site_dict[key])
    if 'annotathon.org' in key:
        annotathon_ids.append(site_dict[key])
    if 'bing.com' == key:
        bing_ids.append(site_dict[key])
    if 'phylogeny.fr' in key:
        phylogeny_ids.append(site_dict[key])
    if 'javadl-esd-secure.oracle.com' in key or 'download.jboss.org' in key:
        oracle_ids.append(site_dict[key])
    if 'plus.google.com' in key in key:
        youwatch_ids.append(site_dict[key])
    if 'safebrowsing-cache.google.com' in key or 'safebrowsing.clients.google.com' in key:
        digi_ids.append(site_dict[key])
    if 'ocsp.verisign.com' in key or 'gtssl-ocsp.geotrust.com' in key:
        verisign_ids.append(site_dict[key])
        
print(youtube_ids)



[38132, 41285, 80, 38133, 15317, 656, 16672, 874, 837, 6581, 38134, 24557, 77, 1307, 12597, 76, 14774, 1345, 75, 74, 240, 876, 16606, 873, 28062, 2329, 1056, 676, 2574, 13347]


In [26]:
def is_site(x, l):
    if x in l:
      return 4
    return 0

def is_long_session(x):
    if x < 3:
        return 0
    elif x < 5:
        return 1
    elif x < 10:
        return 2
    elif x < 30:
        return 3
    elif x < 40:
        return 4
    return 5

In [27]:
def add_social_network_feature(df, X_sparse):    
    foo = pd.DataFrame(index=df.index)
    
    foo['time1'] = df[['time1']].apply(pd.to_datetime)
    # TODO should not fillna, instead find the last date?
    foo['time10'] = df[['time10']].fillna('2014-02-20 10:02:45').apply(pd.to_datetime)
    
    foo['verisign_ids start'] = df['site1'].apply(lambda x: is_site(x, verisign_ids))
    foo['verisign_ids start2'] = df['site2'].apply(lambda x: is_site(x, verisign_ids))
    foo['verisign_ids start3'] = df['site3'].apply(lambda x: is_site(x, verisign_ids))
    foo['verisign_ids start4'] = df['site4'].apply(lambda x: is_site(x, verisign_ids))
    foo['verisign_ids start5'] = df['site5'].apply(lambda x: is_site(x, verisign_ids))
    foo['verisign_ids start6'] = df['site6'].apply(lambda x: is_site(x, verisign_ids))
    foo['verisign_ids start7'] = df['site7'].apply(lambda x: is_site(x, verisign_ids))
    foo['verisign_ids start8'] = df['site8'].apply(lambda x: is_site(x, verisign_ids))
    foo['verisign_ids start9'] = df['site9'].apply(lambda x: is_site(x, verisign_ids))
    foo['verisign_ids start10'] = df['site10'].apply(lambda x: is_site(x, verisign_ids))
    
    foo['digi start'] = df['site1'].apply(lambda x: is_site(x, digi_ids))
    foo['digi start2'] = df['site2'].apply(lambda x: is_site(x, digi_ids))
    foo['digi start3'] = df['site3'].apply(lambda x: is_site(x, digi_ids))
    foo['digi start4'] = df['site4'].apply(lambda x: is_site(x, digi_ids))
    foo['digi start5'] = df['site5'].apply(lambda x: is_site(x, digi_ids))
    foo['digi start6'] = df['site6'].apply(lambda x: is_site(x, digi_ids))
    foo['digi start7'] = df['site7'].apply(lambda x: is_site(x, digi_ids))
    foo['digi start8'] = df['site8'].apply(lambda x: is_site(x, digi_ids))
    foo['digi start9'] = df['site9'].apply(lambda x: is_site(x, digi_ids))
    foo['digi start10'] = df['site10'].apply(lambda x: is_site(x, digi_ids))
    
    
    foo['youwatch start'] = df['site1'].apply(lambda x: is_site(x, youwatch_ids))
    foo['youwatch start2'] = df['site2'].apply(lambda x: is_site(x, youwatch_ids))
    foo['youwatch start3'] = df['site3'].apply(lambda x: is_site(x, youwatch_ids))
    foo['youwatch start4'] = df['site4'].apply(lambda x: is_site(x, youwatch_ids))
    foo['youwatch start5'] = df['site5'].apply(lambda x: is_site(x, youwatch_ids))
    foo['youwatch star6'] = df['site6'].apply(lambda x: is_site(x, youwatch_ids))
    foo['youwatch start7'] = df['site7'].apply(lambda x: is_site(x, youwatch_ids))
    foo['youwatch start8'] = df['site8'].apply(lambda x: is_site(x, youwatch_ids))
    foo['youwatch start9'] = df['site9'].apply(lambda x: is_site(x, youwatch_ids))
    foo['youwatch start10'] = df['site10'].apply(lambda x: is_site(x, youwatch_ids))

    foo['oracle start1'] = df['site1'].apply(lambda x: is_site(x, oracle_ids))
    foo['oracle start2'] = df['site2'].apply(lambda x: is_site(x, oracle_ids))
    foo['oracle start3'] = df['site3'].apply(lambda x: is_site(x, oracle_ids))
    foo['oracle start4'] = df['site4'].apply(lambda x: is_site(x, oracle_ids))
    foo['oracle start5'] = df['site5'].apply(lambda x: is_site(x, oracle_ids))
    foo['oracle start6'] = df['site1'].apply(lambda x: is_site(x, oracle_ids))
    foo['oracle start7'] = df['site2'].apply(lambda x: is_site(x, oracle_ids))
    foo['oracle start8'] = df['site3'].apply(lambda x: is_site(x, oracle_ids))
    foo['oracle start9'] = df['site4'].apply(lambda x: is_site(x, oracle_ids))
    foo['oracle start10'] = df['site5'].apply(lambda x: is_site(x, oracle_ids))

    foo['gmail start'] = df['site1'].apply(lambda x: is_site(x, gmail_ids))
    foo['gmail start2'] = df['site2'].apply(lambda x: is_site(x, gmail_ids))
    foo['gmail start3'] = df['site3'].apply(lambda x: is_site(x, gmail_ids))
    foo['gmail start4'] = df['site4'].apply(lambda x: is_site(x, gmail_ids))
    foo['gmail start5'] = df['site5'].apply(lambda x: is_site(x, gmail_ids))
    foo['gmail start6'] = df['site6'].apply(lambda x: is_site(x, gmail_ids))
    foo['gmail start7'] = df['site7'].apply(lambda x: is_site(x, gmail_ids))
    foo['gmail start8'] = df['site8'].apply(lambda x: is_site(x, gmail_ids))
    foo['gmail start9'] = df['site9'].apply(lambda x: is_site(x, gmail_ids))
    foo['gmail start10'] = df['site10'].apply(lambda x: is_site(x, gmail_ids))
    
    foo['bing start1'] = df['site1'].apply(lambda x: is_site(x, bing_ids))
    foo['bing start2'] = df['site2'].apply(lambda x: is_site(x, bing_ids))
    foo['bing start3'] = df['site3'].apply(lambda x: is_site(x, bing_ids))
    foo['bing start4'] = df['site4'].apply(lambda x: is_site(x, bing_ids))
    foo['bing start5'] = df['site5'].apply(lambda x: is_site(x, bing_ids))
    foo['bing start6'] = df['site6'].apply(lambda x: is_site(x, bing_ids))
    foo['bing start7'] = df['site7'].apply(lambda x: is_site(x, bing_ids))
    foo['bing start8'] = df['site8'].apply(lambda x: is_site(x, bing_ids))
    foo['bing start9'] = df['site9'].apply(lambda x: is_site(x, bing_ids))
    foo['bing start10'] = df['site10'].apply(lambda x: is_site(x, bing_ids))
    
    foo['phylogeny start'] = df['site1'].apply(lambda x: is_site(x, phylogeny_ids))
    foo['phylogeny start2'] = df['site2'].apply(lambda x: is_site(x, phylogeny_ids))
    foo['phylogeny start3'] = df['site3'].apply(lambda x: is_site(x, phylogeny_ids))
    foo['phylogeny start4'] = df['site4'].apply(lambda x: is_site(x, phylogeny_ids))
    foo['phylogeny start5'] = df['site5'].apply(lambda x: is_site(x, phylogeny_ids))
    foo['phylogeny start6'] = df['site6'].apply(lambda x: is_site(x, phylogeny_ids))
    foo['phylogeny start7'] = df['site7'].apply(lambda x: is_site(x, phylogeny_ids))
    foo['phylogeny start8'] = df['site8'].apply(lambda x: is_site(x, phylogeny_ids))
    foo['phylogeny start9'] = df['site9'].apply(lambda x: is_site(x, phylogeny_ids))
    foo['phylogeny start10'] = df['site10'].apply(lambda x: is_site(x, phylogeny_ids))

    foo['nih start'] = df['site1'].apply(lambda x: is_site(x, nih_ids))
    foo['nih start2'] = df['site2'].apply(lambda x: is_site(x, nih_ids))
    foo['nih start3'] = df['site3'].apply(lambda x: is_site(x, nih_ids))
    foo['nih start4'] = df['site4'].apply(lambda x: is_site(x, nih_ids))
    foo['nih start5'] = df['site5'].apply(lambda x: is_site(x, nih_ids))
    foo['nih start6'] = df['site6'].apply(lambda x: is_site(x, nih_ids))
    foo['nih start7'] = df['site7'].apply(lambda x: is_site(x, nih_ids))
    foo['nih start8'] = df['site8'].apply(lambda x: is_site(x, nih_ids))
    foo['nih start9'] = df['site9'].apply(lambda x: is_site(x, nih_ids))
    foo['nih start10'] = df['site10'].apply(lambda x: is_site(x, nih_ids))

    foo['annotathon start'] = df['site1'].apply(lambda x: is_site(x, annotathon_ids))
    foo['annotathon start2'] = df['site2'].apply(lambda x: is_site(x, annotathon_ids))
    foo['annotathon start3'] = df['site3'].apply(lambda x: is_site(x, annotathon_ids))
    foo['annotathon start4'] = df['site4'].apply(lambda x: is_site(x, annotathon_ids))
    foo['annotathon start5'] = df['site5'].apply(lambda x: is_site(x, annotathon_ids))
    foo['annotathon start6'] = df['site6'].apply(lambda x: is_site(x, annotathon_ids))
    foo['annotathon start7'] = df['site7'].apply(lambda x: is_site(x, annotathon_ids))
    foo['annotathon start8'] = df['site8'].apply(lambda x: is_site(x, annotathon_ids))
    foo['annotathon start9'] = df['site9'].apply(lambda x: is_site(x, annotathon_ids))
    foo['annotathon start10'] = df['site10'].apply(lambda x: is_site(x, annotathon_ids))
    

    
    foo['start day'] = foo['time1'].apply(pd.datetime.weekday)
    foo['end day'] = foo['time10'].apply(pd.datetime.weekday)
    
    foo = foo.drop(columns=['time1', 'time10'])
    
    X = hstack([X_sparse, foo])
    return X

In [28]:
X_train_new4 = add_social_network_feature(train_df, X_train_new3)
X_test_new4 = add_social_network_feature(test_df, X_test_new3)

In [29]:
cv_scores = cross_val_score(logit2, X_train_new4, y_train, cv=time_split, 
                            scoring='roc_auc')

In [30]:
cv_scores, cv_scores.std(), cv_scores.mean() # 0.934 0.9355990452519014 0.9383925

(array([0.91510007, 0.88829327, 0.8808899 , 0.98178108, 0.93578611,
        0.97177937, 0.95512493, 0.96186566, 0.92008902, 0.97321655]),
 0.03426625001152704,
 0.9383925963246806)

In [31]:
params = {
    'C': np.logspace(-2, 2, 10)
}

In [32]:
logit_grid_searcher = GridSearchCV(estimator=logit2, param_grid=params,
                                  scoring='roc_auc', cv=time_split, verbose=1)

In [33]:
logit_grid_searcher.fit(X_train_new4, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  6.1min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=17, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_scor

In [34]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.938744616706427, {'C': 1.6681005372000592})

In [35]:
logit_test_pred = logit_grid_searcher.best_estimator_.predict_proba(X_test_new4)[:, 1]
write_to_submission_file(logit_test_pred, 'submissions33331.csv') #0.95871