In [30]:
# Colab-specific setup: expose Google Drive inside the runtime at /content/gdrive
# so the paths used by the following cells resolve.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
from pathlib import Path

In [0]:
# Directory on the mounted Drive that the session CSV files are read from
# and that the prediction/submission files are written to.
PATH_TO_DATA = Path('/content/gdrive/My Drive/mlcourse/part_3/')

In [0]:
# Column names of the ten per-session timestamps: time1 .. time10
times = ['time%s' % slot for slot in range(1, 10 + 1)]

In [0]:
# Both files share the same layout: session_id is the index and the
# time1..time10 columns are parsed as datetimes.
read_opts = dict(index_col='session_id', parse_dates=times)

df_train = pd.read_csv(PATH_TO_DATA / 'train_sessions.csv', **read_opts)
df_test = pd.read_csv(PATH_TO_DATA / 'test_sessions.csv', **read_opts)

In [0]:
# Put the training sessions in chronological order of their first timestamp
df_train = df_train.sort_values('time1')

In [0]:
# Replace missing site ids with 0 and store the ids as compact unsigned integers
sites = ['site%s' % i for i in range(1, 11)]
for frame in (df_train, df_test):
    frame[sites] = frame[sites].fillna(0).astype(np.uint16)

### Full DF

In [0]:
# Target labels, in the same row order as df_train
y_train = df_train['target']

# Number of training rows: position at which the test rows start in the stacked frame
idx_split = len(df_train)

# Training sessions (without the target column) stacked on top of the test sessions
df_full = pd.concat([df_train.drop(columns='target'), df_test])

# Only the site-id columns of the stacked frame
df_full_sites = df_full[sites]

In [0]:
# Turn each session into a space-separated "document" of site ids and
# count its 1- to 3-grams in a sparse matrix
from sklearn.feature_extraction.text import CountVectorizer

site_rows = df_full_sites.astype('int').values.tolist()
sessions = [" ".join(map(str, row)) for row in site_rows]

# NOTE: with CountVectorizer's default token_pattern only tokens of two or more
# characters are kept, so one-digit ids (including the 0 padding) yield no features.
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
full_sites_sparse = cv.fit_transform(sessions)

### 3. Feature creation

In [0]:
#features for scale
from sklearn.preprocessing import StandardScaler

def for_scale_df(df, time_cols=None):
  """Build the numeric per-session features that are standardised afterwards.

  Parameters
  ----------
  df : pandas.DataFrame
      Session frame with a datetime column ``time1`` and the timestamp
      columns listed in ``time_cols``.
  time_cols : list of str, optional
      Timestamp columns used for the session duration. Defaults to the
      module-level ``times`` list.

  Returns
  -------
  pandas.DataFrame
      Frame indexed like ``df`` with integer columns ``year``, ``weekday``
      (Monday == 0) and ``session_duration`` (milliseconds between the
      earliest and latest timestamp of the row; missing timestamps are
      skipped).
  """
  if time_cols is None:
    time_cols = times  # fall back to the notebook-level column list

  start = df['time1']
  stamps = df[time_cols]

  df_features = pd.DataFrame(index=df.index)
  # Vectorised datetime accessors instead of a Python-level lambda per row.
  df_features['year'] = start.dt.year.astype('int')
  df_features['weekday'] = start.dt.weekday.astype('int')

  # Integer floor-division by one millisecond gives the same whole-millisecond
  # count regardless of how the installed pandas handles timedelta astype().
  span = stamps.max(axis=1) - stamps.min(axis=1)
  df_features['session_duration'] = (span // pd.Timedelta(milliseconds=1)).astype(int)
  return df_features

# Same numeric feature builder applied to the training and the test sessions
df_features_train, df_features_test = (
    for_scale_df(frame) for frame in (df_train, df_test)
)

In [0]:
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both sets
scaler = StandardScaler().fit(df_features_train)
df_features_train = scaler.transform(df_features_train)
df_features_test = scaler.transform(df_features_test)

In [0]:
#categorical features
def mk_cat_feat(df):
  """Derive indicator-style features from the session start time ``time1``.

  Returns a frame indexed like ``df`` with 0/1 columns ``morning`` (hours
  7-11), ``day`` (12-18), ``evening`` (19-23) and ``summer`` (months 6-8),
  plus the integer ``weekday`` (Monday == 0).
  """
  start = df['time1']
  hour_of_day = start.dt.hour.astype('int')
  month_no = start.dt.month.astype('int')

  # (column name, first hour, last hour), both bounds inclusive
  day_parts = (('morning', 7, 11), ('day', 12, 18), ('evening', 19, 23))

  df_cat = pd.DataFrame(index=df.index)
  for name, first, last in day_parts:
    df_cat[name] = hour_of_day.between(first, last).astype('int')
  df_cat['summer'] = month_no.between(6, 8).astype('int')
  df_cat['weekday'] = start.dt.weekday.astype('int')
  return df_cat

# Same categorical feature builder applied to the training and the test sessions
df_cat_feat_train, df_cat_feat_test = (
    mk_cat_feat(frame) for frame in (df_train, df_test)
)

### 4. Make train and test splits

In [0]:
from scipy.sparse import hstack

In [0]:
# Column-wise concatenation: scaled numeric features, categorical features,
# then the site n-gram counts (first idx_split rows are train, the rest test)
train_parts = [df_features_train, df_cat_feat_train, full_sites_sparse[:idx_split]]
test_parts = [df_features_test, df_cat_feat_test, full_sites_sparse[idx_split:]]

X_train = hstack(train_parts)
X_test = hstack(test_parts)

### 5. Grid search

### 6. Cross-validation

In [44]:
# LogisticRegression was used without ever being imported, so the cell
# failed with NameError on a fresh kernel; import it here with the CV helpers.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit

# Time-ordered folds: each split validates on rows that come after its
# training rows (df_train was sorted by time1 before the features were built).
time_split = TimeSeriesSplit(n_splits=10)

logit = LogisticRegression(C=0.2, random_state=17, solver='newton-cg')

# ROC AUC per fold, computed in parallel on all available cores
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

cv_scores.mean()



0.9214606037132722

In [0]:
#v0: 0.9214606037132722 (C = 0.2)

### 7. Predict

In [46]:
# Fit the model on the complete training matrix before predicting on the test set
logit.fit(X_train, y_train)

LogisticRegression(C=0.2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Predicted class labels for the test sessions, converted to a plain Python list
test_labels = logit.predict(X_test)
alice_pred = test_labels.tolist()

In [0]:
# Store the predicted class labels on the test frame (modifies df_test in place)
df_test['target'] = alice_pred

In [0]:
# Write the test frame, including its 'target' column, to pred.csv in the data folder
df_test.to_csv(PATH_TO_DATA / 'pred.csv')

In [0]:
# Keep only the second probability column (index 1) of predict_proba's output
class_proba = logit.predict_proba(X_test)
logit_test_pred = class_proba[:, 1]

In [0]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    """Write predictions to a CSV file indexed 1..n.

    Parameters
    ----------
    predicted_labels : array-like
        Predictions, one per row. Lists, NumPy arrays and pandas Series are
        accepted; only the values are used, in their given order.
    out_file : str, path-like or writable text buffer
        Destination passed to ``DataFrame.to_csv``.
    target : str, default 'target'
        Name of the prediction column.
    index_label : str, default 'session_id'
        Header written for the index column.
    """
    # Normalise the input first: plain lists have no ``.shape``, and a Series
    # would otherwise be re-indexed against 1..n by the DataFrame constructor,
    # silently producing NaN rows.
    labels = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(labels,
                                index=np.arange(1, labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [0]:
# Save the predicted probabilities as the submission file in the data folder
submission_path = PATH_TO_DATA / 'alice_2019-10-25_v0.csv'
write_to_submission_file(logit_test_pred, submission_path)