In [1]:
import numpy as np
import pandas as pd
import scipy
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from tqdm.notebook import tqdm

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

## 1. Read data

In [None]:
path='../../Data/Extracted/'

############
### Here load condensed text data if required ###
### Get the code for that from the last cell ###
#############

# Text features; every column of these frames is TF-IDF vectorized on its own further down
train_text=pd.read_csv(path+'train_multi_column_text.csv')
test_text=pd.read_csv(path+'test_multi_column_text.csv')

# Numeric features only: object (string) columns are dropped
train_num=pd.read_csv(path+'train.csv').select_dtypes(exclude='object')
test_num=pd.read_csv(path+'test.csv').select_dtypes(exclude='object')

# Impute missing values in BOTH sets with the medians learned on the training set.
# Filling the test set with its own medians would preprocess train and test with
# different statistics, unlike the StandardScaler that is fit on train and only
# applied to test. fillna() aligns the medians on column names.
# Non-inplace assignment keeps the cell idempotent on re-run.
train_median=train_num.median()
train_num=train_num.fillna(train_median)
test_num=test_num.fillna(train_median)

############
### Here load binned numeric data if required ###
### Get the code for that from the last cell ###
#############

# One-hot encode the label columns: each resulting '<label>__<class>' column becomes
# the binary target of its own model in the training cell
labels=pd.read_csv(path+'labels.csv')
labels=pd.get_dummies(labels,prefix_sep='__')
# Alternative kept for reference (presumably the multi-class, one-model-per-label variant):
#for feat in labels.columns:
#    labels[feat],_=pd.factorize(labels[feat],sort=True)

## 2. Scaling numeric data sets

In [None]:
# Normalize train and test data sets: preprocessing.normalize rescales each sample
# (row) to unit norm, so it needs no fitting and is applied to each set independently
train_num,test_num=(preprocessing.normalize(block) for block in (train_num,test_num))

# Scale and shift the columns with a standard scaler: statistics are learned on the
# training set only and then reused unchanged for the test set
std=preprocessing.StandardScaler().fit(train_num)
train_num=std.transform(train_num)
test_num=std.transform(test_num)

## 3. Vectorize train and test text by TfidfVectorizer

In [None]:
# Feature blocks to be hstacked later; the scaled numeric block always comes first
train_csr,test_csr=[train_num],[test_num]

############
### Here tfidf vectorize single column text vector if required ###
### Get the code for that from the last cell ###
############

# One TF-IDF vocabulary per text column: fitted on the train column, reused for test
for text_col in tqdm(train_text.columns):
    vectorizer=TfidfVectorizer(min_df=10,ngram_range=(1,4))
    train_docs=train_text[text_col].values.ravel()
    test_docs=test_text[text_col].values.ravel()
    train_csr.append(vectorizer.fit_transform(train_docs))
    test_csr.append(vectorizer.transform(test_docs))

## 4. Load target encoding of the categorical features from the extracted folder

In [None]:
############
### Here load target encoding if required ###
### Get the code for that from the last cell ###
############

In [None]:
# Concatenate all feature blocks column-wise and convert to CSR, the format that
# supports the row indexing used by the train/validation split and the mini-batches
X_train,X_test=(scipy.sparse.hstack(blocks).tocsr() for blocks in (train_csr,test_csr))

## 5. Train simple online logistic regression with SGDClassifier

In [None]:
from sklearn import linear_model
from sklearn import model_selection
from sklearn import metrics
import os

#https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/37753#
def iter_minibatches(chunksize, X,y):
    """Yield consecutive ``(X_chunk, y_chunk)`` mini-batches of at most ``chunksize`` rows.

    Batches are produced in row order without shuffling; the last batch holds the
    remaining rows and may be shorter.

    Parameters
    ----------
    chunksize : int
        Maximum number of rows per batch. Must be >= 1.
    X : array or sparse matrix
        Anything exposing ``shape[0]`` and supporting row slicing ``X[a:b]``.
    y : sequence
        Targets aligned with the rows of ``X``; must support slicing ``y[a:b]``.

    Yields
    ------
    tuple
        ``(X[start:stop], y[start:stop])``. Rows are taken with slices, so numpy
        inputs are yielded as views rather than copies.

    Raises
    ------
    ValueError
        If ``chunksize`` is smaller than 1. Without this check the start marker
        would never advance and the generator would yield empty chunks forever.
    """
    if chunksize < 1:
        raise ValueError('chunksize must be a positive integer, got %r' % (chunksize,))

    n_rows = X.shape[0]
    for start in range(0, n_rows, chunksize):
        stop = min(start + chunksize, n_rows)
        # Contiguous slices are cheaper than fancy-indexing with a range of row numbers
        yield X[start:stop], y[start:stop]

import sklearn

# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and 1.3 removed
# the old spelling; pick whichever name the installed version accepts
_sk_version=tuple(int(part) for part in sklearn.__version__.split('.')[:2])
log_loss_name='log_loss' if _sk_version>=(1,1) else 'log'

score=0 # Sum over all models of the best validation log loss
for label in tqdm(labels.columns):
    # label as a target feature for the current model
    y_train=labels[label].values
    # partial_fit needs the complete class list; it is constant for the whole label
    classes=np.unique(y_train)

    # Directory that stores the test predictions of the best epoch for this label.
    # A dedicated name avoids overwriting the data directory held in `path`.
    model_dir='./best_model/'+label
    os.makedirs(model_dir,exist_ok=True)

    # train test split to evaluate model performance on the cv data set to decide early stopping
    X_tr,X_cv,y_tr,y_cv=model_selection.train_test_split(X_train,y_train,test_size=0.2,stratify=y_train,random_state=44)

    # model to be trained
    # NOTE: eta0 is only read by the 'constant', 'invscaling' and 'adaptive' schedules,
    # and validation_fraction only when early_stopping=True, so with these settings both
    # are inert. Early stopping is implemented by hand below because partial_fit is used.
    clf=linear_model.SGDClassifier(loss=log_loss_name,
                                 eta0=0.001,
                                 validation_fraction=0.2,
                                 early_stopping=False,
                                 random_state=44, # reproducible per-batch shuffling
                                 n_jobs=-1) # Estimator

    improvement=[10e10] # Track cv score while fitting the model (sentinel: any first epoch improves on it)
    epoch=0
    cnt=0 # Consecutive epochs without a sufficient improvement
    patience=10 # Number of epochs to wait without improvement in cv loss
    max_epochs=1000
    tol=0.001 # Minimum decrease of the cv loss that counts as an improvement

    # Train until patience runs out (early stopping) or the max epoch number is reached
    while cnt<patience and epoch<max_epochs:

        min_loss=np.min(improvement) # Best cv loss seen before this epoch
        # One epoch = one pass over the training split in mini-batches
        for x_chunk,y_chunk in iter_minibatches(10000,X_tr,y_tr):
            clf.partial_fit(x_chunk,y_chunk,classes=classes)

        # labels=classes keeps log_loss well defined even if the cv split misses a class
        curr_loss=metrics.log_loss(y_cv,clf.predict_proba(X_cv),labels=classes)
        improvement.append(curr_loss)

        if (min_loss-curr_loss)>tol:
            # Best model so far: persist its positive-class probabilities for the test set
            np.save(model_dir+'/best_prediction.npy',clf.predict_proba(X_test)[:,1])
            cnt=0
        else:
            # Model did not improve
            cnt+=1

        epoch+=1
    score+=np.min(improvement)

# Average over the number of models actually trained (one per label column),
# instead of a hard-coded count that is only right for one particular label encoding
n_models=len(labels.columns)
print('score:',score/n_models if n_models else float('nan'))

## 6. Create submission file in the Submissions folder

In [None]:
# Load the submission template; its first column is kept as the row index and the
# remaining columns define the names and order of the prediction columns
submission_format=pd.read_csv('../../Data/Original/SubmissionFormat.csv')
index=submission_format.iloc[:,0].tolist()
submission_format=submission_format.iloc[:,1:]

# Gather the saved best-epoch test predictions, one array per template column
path='best_model/'
sub_dict={col:np.load(path+col+'/best_prediction.npy') for col in submission_format.columns}

# Assemble the frame in template column order and write it out with the saved index
sub_df=pd.DataFrame(sub_dict,columns=submission_format.columns,index=index)
sub_df.to_csv('../../Data/Submissions/logistic_104_models_target_encoding_loo_8.csv',index=True)

In [None]:
############
### Here make submission file for 9 models if required ###
### Get the code for that from the last cell ###
#############

## 7. Things that didn't work!

In [None]:
'''
1) Use condensed text feature

1.1 To load condensed text feature

    train_text=pd.read_csv(path+'train_condensed_text.csv')
    test_text=pd.read_csv(path+'test_condensed_text.csv')

2) Use original numeric data

2.1 To load the original numeric data from the train and test csv files in the extracted folder

    train_num=pd.read_csv(path+'train.csv').select_dtypes(exclude='object')
    train_num.fillna(train_num.median(),inplace=True)

    test_num=pd.read_csv(path+'test.csv').select_dtypes(exclude='object')
    test_num.fillna(test_num.median(),inplace=True)
    

2.2 to scale original numeric features normalization and standardization

    # Normalize train and test data sets
    train_num=preprocessing.normalize(train_num)
    test_num=preprocessing.normalize(test_num)

    # scale and shift the data set with standard scaler
    std=preprocessing.StandardScaler()
    train_num=std.fit_transform(train_num)
    test_num=std.transform(test_num)
    

3) load binned numeric data

3.1 train_num=pd.read_csv(path+'train_binned_numeric_data.csv')
    test_num=pd.read_csv(path+'test_binned_numeric_data.csv')

4) Vectorize single column text vector

4.1 vectorizer=TfidfVectorizer(ngram_range=(1,4),min_df=10)
    train_csr.append(vectorizer.fit_transform(train_text.values.ravel()))
    test_csr.append(vectorizer.transform(test_text.values.ravel()))


5) Use entropy based features
    
5.1 To load the entropy features, read them with:

    path='../../Data/Extracted/'
    train_entropy=pd.read_csv(path+'train_feature_label_entropy.csv')
    test_entropy=pd.read_csv(path+'test_feature_label_entropy.csv')
    
5.2 Then add following in the prediction cell

    # Create new X_train (currently list) for a current label
    X_train=train_csr.copy()
    
    # Append numpy array of the entropy based features for the current label
    X_train.append(np.array(train_entropy[list(filter(lambda val:val.startswith(label),train_entropy.columns))]))
    
    # Convert X_train into a sparse matrix after hstacking all elements of the list
    X_train=scipy.sparse.hstack(X_train).tocsr()

5.3 and for test add

    # Create new X_test (currently list) for this label
    X_test=test_csr.copy()
    # Same as for training set, add entropy based features for the current label into our list
    X_test.append(np.array(test_entropy[list(filter(lambda val:val.startswith(label),test_entropy.columns))]))
    # Convert X_test list into sparse matrix after hstacking all elements of the list
    X_test=scipy.sparse.hstack(X_test).tocsr()
    
6) Target encoding

    path='../../Data/Extracted/'

    with open(path+'train_cat_feat_target_encoding.pickle', 'rb') as f:
        train_feat_enc=pickle.load(f)

    with open(path+'test_cat_feat_target_encoding.pickle', 'rb') as f:
        test_feat_enc=pickle.load(f)

    for label in labels.columns:
        train_csr.append(train_feat_enc[label])
        test_csr.append(test_feat_enc[label])

7) Submission for 9 models

    submission_format=pd.read_csv('../../Data/Original/SubmissionFormat.csv')
    # Save index for submission
    index=(submission_format[submission_format.columns[0]].tolist())
    submission_format=submission_format[submission_format.columns[1:]]

    path='best_model'
    best_predictions=[]
    for label in os.listdir(path):
        best_predictions.append(np.load(path+'/'+label+'/best_prediction.npy'))

    best_predictions=np.hstack(best_predictions)

    logistic_submission=pd.DataFrame(best_predictions,columns=submission_format.columns,index=index)

    logistic_submission.to_csv('../../Data/Submissions/logistic_model_more_drop_feats_104_7.csv',index=True)
    
8) Adding target encoded features

    # Initialize train and test lists
    X_train=train_csr.copy()
    X_test=test_csr.copy()
    
    # Append target encoded features into X_train and X_test
    X_train.append(np.array(train_feat_enc[label]))
    X_test.append(np.array(test_feat_enc[label]))
    
    # Create sparse matrix after hstack
    X_train=scipy.sparse.hstack(X_train).tocsr()
    X_test=scipy.sparse.hstack(X_test).tocsr()


'''