In [1]:
import numpy as np
import pandas as pd
import scipy
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import gc
import os
import shutil

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

## 1. Read data

In [2]:
path='../../Data/Extracted/'

############
### Here load condensed text data if required ###
### Get the code for that from the last cell ###
#############

train_text=pd.read_csv(path+'train_multi_column_text.csv')
test_text=pd.read_csv(path+'test_multi_column_text.csv')

train_num=pd.read_csv(path+'train.csv').select_dtypes(exclude='object')
train_num.fillna(train_num.median(),inplace=True)

test_num=pd.read_csv(path+'test.csv').select_dtypes(exclude='object')
test_num.fillna(test_num.median(),inplace=True)

############
### Here load binned numeric data if required ###
### Get the code for that from the last cell ###
#############

labels=pd.read_csv(path+'labels.csv')
labels=pd.get_dummies(labels,prefix_sep='__')

print(train_num.shape, train_text.shape, test_num.shape, test_text.shape)

  interactivity=interactivity, compiler=compiler, result=result)


(400277, 2) (400277, 7) (50064, 2) (50064, 7)


## 2. Scaling numeric data sets

In [3]:
scaler=preprocessing.MinMaxScaler()
train_num=scaler.fit_transform(train_num)
test_num=scaler.transform(test_num)

############
### Here tfidf vectorize single column text vector if required ###
### Get the code for that from the last cell ###
############

## 3. Vectorize train and test text by TfidfVectorizer

In [4]:
train_csr=[train_num]
test_csr=[test_num]

############
### Here tfidf vectorize single column text vector if required ###
### Get the code for that from the last cell ###
############


for feat in tqdm(train_text.columns):
    vectorizer=TfidfVectorizer(ngram_range=(1,4),min_df=10)
    train_csr.append(vectorizer.fit_transform(train_text[feat].values.ravel()))
    test_csr.append(vectorizer.transform(test_text[feat].values.ravel()))

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




## 4. ECC (Ensemble Classifier Chains)

In [5]:
from sklearn import utils, linear_model, model_selection, metrics
from sklearn.feature_selection import SelectKBest, chi2
import os

###### Clean files from the previous session #####
#shutil.rmtree('./best_model')

########### Data Sets ###############
# Sparse data set ready for model
X_train=scipy.sparse.hstack(train_csr).tocsr()
X_test=scipy.sparse.hstack(test_csr).tocsr()

############ iterator ##################
#https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/37753#
def iter_minibatches(chunksize, X,y):
    # Provide chunks one by one
    chunk_start_marker = 0
    while chunk_start_marker < X.shape[0]:
        chunkrows = range(chunk_start_marker, min(chunk_start_marker + chunksize,X.shape[0]))
        # you need to implement "getrows" (based on what your data source is - RAM/Disc/...)
        X_chunk, y_chunk = X[chunkrows],y[chunkrows]
        yield X_chunk, y_chunk # get next portion of data to train on
        chunk_start_marker += chunksize # update starting point


########################### ECC Implementation ###########################

columns=np.array(labels.columns)
for cc_num in tqdm(range(40,100)):
    
    # Random subset of data set
    X_train_sample,labels_train_sample=utils.resample(X_train,labels,n_samples=int(np.floor((X_train.shape[0])*0.2)),random_state=cc_num)


    # random sequence of columns
    np.random.shuffle(columns)
    
    # Create new copy of X_train, X_test, and labels for new chain
    X_te=X_test.copy()
    #X_train_copy=X_train.copy()
    #labels_tr=labels.copy()
    
    for ind,label in enumerate(columns):
        
        min_samples=2
        for i in labels_train_sample[label].unique():
            min_samples=min(labels_train_sample[label][labels_train_sample[label]==i].shape[0], min_samples)
        
        if min_samples>1:
            # Split with stratify
            X_tr,X_cv,y_tr,y_cv=model_selection.train_test_split(X_train_sample,labels_train_sample[label].values,stratify=labels_train_sample[label],test_size=0.2,random_state=44)
        else:
            # Split without stratify
            X_tr,X_cv,y_tr,y_cv=model_selection.train_test_split(X_train_sample,labels_train_sample[label].values,test_size=0.2,random_state=44)
        
        ############
        ### Here do feature selection and transformation if required ###
        ### Get the code for that from the last cell ###
        ############
        
        # Path to store best model while training and early stopping
        path='./best_model/'+str(cc_num)+'/'+label
        if not os.path.exists(path):
            os.makedirs(path)

        # model to be train
        clf=linear_model.SGDClassifier(loss='log',
                                     eta0=0.001,
                                     validation_fraction=0.2,
                                     early_stopping=False,
                                     n_jobs=-1) # Estimator

        improvement=[10e10] # Track cv score while fitting the model
        epoch=0
        cnt=0
        patience=10 # Number of epochs to wait without improvement in cv loss
        max_epochs=1000
        tol=0.001

        # Train while we hit our patience level and trigger early stopping or we reach max epoch number
        while cnt<patience and epoch<max_epochs:

            # Use batcheterator for mini-batch SGD
            batcheterator=iter_minibatches(10000,X_tr,y_tr)
            min_loss=np.min(improvement) # Note min_loss for each epoch
            for x_chunk,y_chunk in batcheterator:
                # Note: classes argument must get number of classes from the entire data set
                clf.partial_fit(x_chunk,y_chunk,classes=np.unique(labels[label]))
            improvement.append(metrics.log_loss(y_cv,clf.predict_proba(X_cv),labels=np.unique(labels[label])))
            curr_loss=improvement[-1]

            if (min_loss-curr_loss)>tol:
                # If current loss is less than the current minimum loss then this is our best model so far
                #best_train_prediction=clf.predict(X_tr)
                #best_cv_prediction=clf.predict(X_cv)
                best_test_prediction=np.array(list(map(lambda val:val[1],clf.predict_proba(X_te))))
                
                np.save(path+'/best_prediction.npy',best_test_prediction)
                
                cnt=0
            else:
                # Model did not improve
                cnt+=1

            epoch+=1
        
        ########## Augment X_train_sample, and X_te #############
        best_train_prediction=clf.predict(X_train_sample)
        X_train_sample=scipy.sparse.hstack([X_train_sample,best_train_prediction.reshape(-1,1)]).tocsr()
        
        X_te=scipy.sparse.hstack([X_te,best_test_prediction.reshape(-1,1)]).tocsr()

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))




## 5. Create final submission data frame and save it as csv file into Submissions folder

In [6]:
# Create submission data frame according to appropriate submission format
path='../../Data/'
submission_format=pd.read_csv(path+'Original/SubmissionFormat.csv')

sub_columns=submission_format.columns[1:]
sub_index=submission_format[submission_format.columns[0]].values

del submission_format
gc.collect()

# Create final submssions dictionary
path='best_model/'
#final_preds={}
final_label_pred=[]
for label in tqdm(sub_columns):
    cc_preds=[]
    for cc_num in os.listdir(path):
        cc_preds.append(np.load(path+cc_num+'/'+label+'/best_prediction.npy'))
    preds_stack=np.vstack(cc_preds).T
    
    quartiles=np.quantile(preds_stack,[0.25,0.75],axis=1).T
    
    label_preds=[]
    for ind,arr in enumerate(preds_stack):
        label_preds.append(np.mean(arr[(arr>=quartiles[ind][0]) & (arr<=quartiles[ind][1])]))
        
    final_label_pred.append(label_preds)

# Create submission data frame
sub_df=pd.DataFrame(dict(zip(sub_columns,final_label_pred)),index=sub_index)

# Save submission file into submissions folder
path='../../Data/'
sub_df.to_csv(path+'Submissions/ECC_8.csv',index=True)

HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))




In [7]:
sub_df

Unnamed: 0,Function__Aides Compensation,Function__Career & Academic Counseling,Function__Communications,Function__Curriculum Development,Function__Data Processing & Information Services,Function__Development & Fundraising,Function__Enrichment,Function__Extended Time & Tutoring,Function__Facilities & Maintenance,Function__Facilities Planning,...,Student_Type__Special Education,Student_Type__Unspecified,Use__Business Services,Use__ISPD,Use__Instruction,Use__Leadership,Use__NO_LABEL,Use__O&M,Use__Pupil Services & Enrichment,Use__Untracked Budget Set-Aside
180042,0.008811,0.002826,0.000515,0.001319,0.006074,0.000504,0.003750,0.002233,0.023443,0.000672,...,0.004920,0.862879,0.004507,0.006751,0.795404,0.003975,0.068580,0.024490,0.003900,0.007438
28872,0.004690,0.016844,0.001919,0.010011,0.019245,0.001010,0.068332,0.040127,0.026494,0.001691,...,0.014906,0.916989,0.013810,0.040100,0.130022,0.076273,0.034470,0.033688,0.143141,0.002421
186915,0.083634,0.005885,0.001524,0.006020,0.009422,0.001311,0.003506,0.011389,0.009045,0.002225,...,0.029813,0.327082,0.003111,0.016972,0.801747,0.011667,0.038330,0.002983,0.010519,0.005808
412396,0.083421,0.005866,0.001526,0.006008,0.009400,0.001312,0.003494,0.011412,0.009034,0.002226,...,0.029903,0.323090,0.003106,0.016949,0.804030,0.011644,0.038078,0.002968,0.010519,0.005803
427740,0.003114,0.028040,0.002619,0.003487,0.010973,0.001274,0.012281,0.002069,0.013272,0.001721,...,0.007967,0.980733,0.023548,0.023888,0.008006,0.799655,0.013729,0.013196,0.017389,0.002190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169063,0.026735,0.011008,0.000292,0.000718,0.006718,0.000290,0.001667,0.001057,0.019586,0.000254,...,0.001768,0.018747,0.001215,0.003050,0.213402,0.006920,0.637095,0.000783,0.006519,0.000309
433255,0.026733,0.011008,0.000292,0.000718,0.006718,0.000290,0.001667,0.001057,0.019586,0.000254,...,0.001768,0.018747,0.001215,0.003050,0.213393,0.006920,0.637098,0.000783,0.006519,0.000309
232204,0.026765,0.011007,0.000292,0.000717,0.006715,0.000290,0.001663,0.001055,0.019581,0.000254,...,0.001766,0.018800,0.001215,0.003049,0.213620,0.006923,0.636663,0.000784,0.006514,0.000309
171685,0.026733,0.011008,0.000292,0.000718,0.006718,0.000290,0.001667,0.001057,0.019586,0.000254,...,0.001768,0.018747,0.001215,0.003050,0.213391,0.006920,0.637099,0.000783,0.006519,0.000309


In [None]:
'''
1) Feature scaling normalize and standardize

    # Nomalize train and test data sets
    train_num=preprocessing.normalize(train_num)
    test_num=preprocessing.normalize(test_num)

    # scale and shift the data set with standard scaler
    std=preprocessing.StandardScaler()
    train_num=std.fit_transform(train_num)
    test_num=std.transform(test_num)

2) Use this code for train and cv data set split for each chain

    # Random subset of data set
    X_tr_sample,y_tr_sample=utils.resample(X_train,labels,n_samples=int(np.floor((X_train.shape[0])*0.4)),random_state=cc_num)
    
    # train test split to evaluate model performance on cv data set to decide early stopping
    train_ind,cv_ind=model_selection.train_test_split(range(X_tr_sample.shape[0]),test_size=0.2,random_state=44)
    
    # Create train and cv sets
    X_tr,y_tr_df=X_tr_sample[train_ind,:],y_tr_sample.iloc[train_ind]
    X_cv,y_cv_df=X_tr_sample[cv_ind,:],y_tr_sample.iloc[cv_ind]
    
3) Feature selection

    # Select 5000 features for current classifier
    feat_select=SelectKBest(score_func=chi2,k=5000)

    # transform train, cv, te set only for this classifier
    X_tr=feat_select.fit_transform(X_tr,y_tr)
    X_cv=feat_select.transform(X_cv)
    X_te_temp=feat_select.transform(X_te)


'''