In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

import os
import sys

# Add the project's `src` folder to the import path. The folder is located
# relative to the current working directory, so the notebook has to be started
# from the directory that contains `box-plots-sklearn-master/`.
# NOTE(review): the `data`, `features` and `models` imports below presumably
# resolve through this entry - confirm.
src_dir = os.path.join(os.getcwd(), 'box-plots-sklearn-master/', 'src')
sys.path.append(src_dir)

from data.multilabel import multilabel_sample_dataframe, multilabel_train_test_split
from features.SparseInteractions import SparseInteractions
from models.metrics import multi_multi_log_loss

In [2]:
# Load the training data; the first CSV column becomes the row index.
df = pd.read_csv('dataset/TrainingData.csv', index_col=0)
df.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,...,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,...,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,...,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,...,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,...,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,...,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


In [3]:
# Column dtypes and non-null counts, to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400277 entries, 134338 to 415831
Data columns (total 25 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Function                400277 non-null  object 
 1   Use                     400277 non-null  object 
 2   Sharing                 400277 non-null  object 
 3   Reporting               400277 non-null  object 
 4   Student_Type            400277 non-null  object 
 5   Position_Type           400277 non-null  object 
 6   Object_Type             400277 non-null  object 
 7   Pre_K                   400277 non-null  object 
 8   Operating_Status        400277 non-null  object 
 9   Object_Description      375493 non-null  object 
 10  Text_2                  88217 non-null   object 
 11  SubFund_Description     306855 non-null  object 
 12  Job_Title_Description   292743 non-null  object 
 13  Text_3                  109152 non-null  object 
 14  Text_4         

In [4]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,FTE,Total
count,126071.0,395722.0
mean,0.426794,13105.86
std,0.573576,368225.4
min,-0.087551,-87466310.0
25%,0.000792,73.7977
50%,0.130927,461.23
75%,1.0,3652.662
max,46.8,129700000.0


In [5]:
# Column roles: the first nine columns are the targets, every other column is an input
labels = list(df.columns[:9])
feature_labels = [column for column in df.columns if column not in labels]

# Work on a sample of the full dataset to keep training time manageable
sample_size = 10000

sampling = multilabel_sample_dataframe(
    df,
    pd.get_dummies(df[labels]),
    min_count=25,
    size=sample_size,
    seed=43,
)

# One-hot encode the targets of the sampled rows
dummy_labels = pd.get_dummies(sampling[labels])

# Split the sample into training and test partitions (test fraction 0.2)
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    sampling[feature_labels],
    dummy_labels,
    size=0.2,
    min_count=3,
    seed=43,
)

In [6]:
# Report the (features)/(targets) shapes of each partition
for template, features, targets in [
    ('Training shape: {}/{}', X_train, y_train),
    ('Test shape: {}/{}', X_test, y_test),
]:
    print(template.format(features.shape, targets.shape))

Training shape: (8000, 16)/(8000, 104)
Test shape: (2000, 16)/(2000, 104)


In [7]:
numeric_columns = ['FTE', 'Total']

def combine_text_columns(dataframe, to_drop=None):
    """Join the remaining columns of each row into one space-separated string.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input rows. The frame is not modified.
    to_drop : iterable of column names, optional
        Columns to leave out of the combined text. Names that are not present
        in ``dataframe`` are ignored. Defaults to ``numeric_columns + labels``,
        looked up when the function is called so that it always reflects the
        current notebook state.

    Returns
    -------
    pandas.Series
        One string per row, indexed like ``dataframe``. Missing values
        contribute an empty string, so consecutive separators can appear.
    """
    if to_drop is None:
        to_drop = numeric_columns + labels

    unwanted = set(to_drop)
    present = [column for column in dataframe.columns if column in unwanted]

    # drop() and fillna() both return new objects, so the caller's frame is untouched
    textdata = dataframe.drop(columns=present).fillna('')

    # Cast after filling so NaN becomes '' rather than 'nan'; the cast keeps
    # str.join from raising TypeError if a non-string column slips through.
    return textdata.astype(str).apply(lambda row: ' '.join(row), axis=1)

In [8]:
from sklearn.preprocessing import FunctionTransformer

def select_numeric_columns(dataframe):
    """Return only the columns of `dataframe` listed in `numeric_columns`."""
    return dataframe[numeric_columns]


# Wrap the column selectors as transformers so they can be used as pipeline steps.
# A named function is used instead of a lambda because pickle cannot serialize
# lambdas, which would prevent saving any pipeline containing this transformer.
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_num_data = FunctionTransformer(select_numeric_columns, validate=False)

In [9]:
# Smoke check: combined text for the first five sampled rows.
get_text_data.fit_transform(sampling.head())

38     OTHER PURCHASED SERVICES  SCHOOL-WIDE SCHOOL P...
70     Extra Duty Pay/Overtime For Support Personnel ...
198    Supplemental *  Operation and Maintenance of P...
209    REPAIR AND MAINTENANCE SERVICES  PUPIL TRANSPO...
614     GENERAL EDUCATION LOCAL EDUCATIONAL AIDE,70 H...
dtype: object

In [10]:
# Smoke check: numeric columns for the first five sampled rows (NaNs are still present here).
get_num_data.fit_transform(sampling.head())

Unnamed: 0,FTE,Total
38,,653.46
70,,2153.53
198,,-8291.86
209,,618.29
614,0.71,21747.666875


In [11]:
from sklearn.metrics import make_scorer

# Wrap the project's log-loss metric as a scorer callable: scorer(estimator, X, y).
# NOTE(review): per the scikit-learn docs, make_scorer with default arguments
# scores the output of estimator.predict (hard labels, not probabilities) and
# treats higher values as better. A log-loss metric normally needs
# predict_proba output and lower-is-better - confirm this is intended, and
# check that the argument order make_scorer uses, (y_true, y_pred), matches
# the signature of multi_multi_log_loss.
log_loss_scorer = make_scorer(multi_multi_log_loss)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

from sklearn.feature_selection import chi2, SelectKBest

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler

from sklearn.pipeline import Pipeline, FeatureUnion

# Token pattern: a run of ASCII letters/digits that is followed by whitespace.
# The lookahead requires at least one whitespace character after the token, so
# a final token with nothing after it is not matched.
token_aplhanumeric = '[A-Za-z0-9]+(?=\\s+)'

In [13]:
%%time

chi_k = 300

pl = Pipeline([
    ('Union', FeatureUnion(
                transformer_list = [
                    ('numeric_features', Pipeline([
                        ('selector', get_num_data),
                        ('imp', SimpleImputer())
                    ])),
                    ('text_features', Pipeline([
                        ('selector', get_text_data),
                        ('vectorizer', HashingVectorizer(token_pattern = token_aplhanumeric,
                                                         norm=None, binary=False, alternate_sign=False,
                                                         ngram_range = (1,2))),
                        ('dim_red', SelectKBest(chi2, chi_k))
                    ]))
                ]
    )),
    ('int', SparseInteractions(degree=2)),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])

pl.fit(X_train, y_train)

print('Logloss score of tranied pipeline: ', log_loss_scorer(pl, X_test, y_test.values))

Logloss score of tranied pipeline:  2.7731086422810307
CPU times: user 8min 53s, sys: 6min 59s, total: 15min 53s
Wall time: 5min 15s


In [15]:
# Load the holdout set (first CSV column as index) and run it through the
# fitted pipeline's predict_proba.
holdout = pd.read_csv('dataset/TestData.csv', index_col=0)

predict = pl.predict_proba(holdout)