In [1]:
import csv
import pandas as pa
import numpy as np 
import scipy
pa.options.mode.chained_assignment = None
import sys  
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Python 2-only workaround: reload(sys) makes sys.setdefaultencoding available
# again so the process-wide default string encoding can be set to UTF-8.
reload(sys)  
sys.setdefaultencoding('utf8')

# Shared tokenizer instance; checkDesWords and preprocess_data call w.tokenize.
from nltk.tokenize import WordPunctTokenizer
w = WordPunctTokenizer()

# NOTE: this rebinds the name `stopwords` from the imported nltk corpus object
# to the list returned by .words('english'); later code uses it as that list.
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# Snowball stemmer passed to stem_tokens by preprocess_data.
from nltk import stem
english_stemmer = stem.SnowballStemmer('english')


#### Using Pandas to Load Training and Test Data

In [2]:
# Both inputs are tab-separated files whose first row holds the column names.
train = pa.read_csv("./data/train_data.csv", header=0, sep='\t')
test = pa.read_csv("./data/xtest_data.csv", header=0, sep='\t')

#### There are many NaN values in the dataset, so instead of leaving them blank I filled them with the string "0".

In [3]:
# Replace every missing value with the string "0" (not the number 0) in both frames.
train, test = train.fillna("0"), test.fillna("0")

#### Creating a binary label for the label column which we have to predict and then dropping it from the training data frame.

In [4]:
# Binary target: "ELECTRICAL" maps to 1; every other type maps to NaN and is
# then filled with 0.
ytrain = train["type"].map({"ELECTRICAL":1}).fillna(0)

In [5]:
# The label must not remain among the training features.
train = train.drop("type", axis=1)

#### Feature 1: By eyeballing the data set I noticed that the type ELECTRICAL is directly associated with the license type ELECTRICAL CONTRACTOR LICENSE, so I created a feature that is 1 for all rows containing this value and 0 for the rest.

In [6]:
# f1 is 1 for rows whose license type is the electrical-contractor license and
# 0 otherwise. The column is built with one vectorized assignment instead of
# chained indexing (frame["f1"][mask] = 1), which can end up writing to a
# temporary copy and leave the frame unchanged.
train["f1"] = (train["licensetype"] == "ELECTRICAL CONTRACTOR LICENSE").astype(int)

test["f1"] = (test["licensetype"] == "ELECTRICAL CONTRACTOR LICENSE").astype(int)


#### Features 2 and 3: These features need a preprocessing step whose code lives in extract_electrical_words.py. That script extracts the unique words related to the 'type' ELECTRICAL from the columns 'licensetype', 'businessname', 'description' and 'legaldescription', i.e. the words in these columns that occur only with ELECTRICAL and with no other type. The code below loads those words into separate lists. There are no unique words for 'legaldescription' and 'licensetype', so nothing is loaded for them.

#### Then we create features for 'description' and 'businessname' and assign a value 1 if the row contains any of those electrical words otherwise it is given a 0 value.

In [7]:
# Read the electrical-only word lists, one stripped entry per line. The files
# are opened in `with` blocks so the handles are closed right after reading
# (the previous bare open() calls never closed them).
with open("./data/ele_w_desc.txt", 'r') as desc_words_file:
    descWords = [line.strip() for line in desc_words_file]
# descPattern = '|'.join(descWords).lower()
with open("./data/ele_w_bus.txt", 'r') as bus_words_file:
    busWords = [line.strip() for line in bus_words_file]
# busPattern = '|'.join(busWords).lower()

In [8]:
def checkDesWords(tempstr, cList):
    """Tell whether any word of `cList` occurs among the tokens of `tempstr`.

    The text is split with the module-level tokenizer `w`. Tokens that are not
    purely alphabetic, or that appear in `stopwords`, are dropped; the rest
    are lower-cased before the comparison.

    Args:
        tempstr: text to scan.
        cList: iterable of words to look for; they are compared verbatim
            against the lower-cased tokens.

    Returns:
        True when at least one word of `cList` is among the tokens, otherwise
        False (previously the function fell off the end and returned None).
    """
    # NOTE(review): the stop-word test runs on the token before lower-casing,
    # so a capitalized token is only filtered if `stopwords` contains that
    # exact spelling -- confirm this is intended.
    # A set makes each membership test constant-time.
    tokens = set(word.lower() for word in w.tokenize(tempstr)
                 if word.isalpha() and word not in stopwords)
    return any(word in tokens for word in cList)
    
# The same three columns are derived on the training and the test frame.
for frame in (train, test):
    # f2 / f3: 1 when the description / business name contains one of the
    # loaded electrical-only words, otherwise 0.
    frame["f2"] = frame["description"].map(lambda text: 1 if checkDesWords(text, descWords) else 0)
    frame["f3"] = frame["businessname"].map(lambda text: 1 if checkDesWords(text, busWords) else 0)

    # job_value is text: trim it, lower-case it, remove '$' and ',' and
    # convert what is left to float.
    job_value_text = frame["job_value"].str.strip().str.lower()
    frame["job_value"] = job_value_text.str.replace('$', '').str.replace(',','').astype(float)

#### N-gram extraction and the Jaccard and Dice distance metrics. We use them to extract more features.

In [9]:

#######################
## N-Gram Extraction ##
#######################
def getUnigram(words):
    """
        Unigrams are the words themselves, so the input list is handed back
        unchanged (same object, no copy).

        Input: a list of words, e.g., ['I', 'am', 'Denny']
        Output: that same list
        Fails an assertion when `words` is not exactly of type list.
    """
    assert type(words) is list
    return words
    
def getBigram(words, join_string, skip=0):
    """
       Build bigrams by joining each word with the words that follow it.

       Input: a list of words, e.g., ['I', 'am', 'Denny'], or a tuple whose
              second item is that list
       Output: a list of bigrams, e.g., ['I_am', 'am_Denny'] with '_' as
               join_string. With skip > 0 each word is also paired with the
               words up to skip+1 positions ahead.
       Inputs with fewer than two words fall back to getUnigram.
    """
    # A tuple is unpacked to its second element, which must be the word list.
    if type(words) == tuple:
        words = words[1]
    assert type(words) == list
    n_words = len(words)
    if n_words <= 1:
        # set it as unigram
        return getUnigram(words)
    return [
        join_string.join([words[start], words[start + offset]])
        for start in range(n_words - 1)
        for offset in range(1, skip + 2)
        if start + offset < n_words
    ]
    
def getTrigram(words, join_string, skip=0):
    """
       Build trigrams by joining each word with two later words.

       Input: a list of words, e.g., ['I', 'am', 'Denny']
       Output: a list of trigrams, e.g., ['I_am_Denny'] with '_' as
               join_string. With skip > 0 the gaps between the three words
               may each be up to skip+1 positions.
       Inputs with fewer than three words fall back to getBigram.
    """
    assert type(words) == list
    n_words = len(words)
    if n_words <= 2:
        # set it as bigram
        return getBigram(words, join_string, skip)
    trigrams = []
    for first in range(n_words - 2):
        for gap_one in range(1, skip + 2):
            second = first + gap_one
            for gap_two in range(1, skip + 2):
                third = second + gap_two
                if second < n_words and third < n_words:
                    trigrams.append(join_string.join([words[first], words[second], words[third]]))
    return trigrams
    

#####################
## Distance metric ##
#####################
def JaccardCoef(A, B):
    """Jaccard coefficient of two collections, compared as sets.

    Returns the size of the intersection divided by the size of the union,
    computed through try_divide (so two empty inputs give its default value).
    """
    set_a = set(A)
    set_b = set(B)
    return try_divide(len(set_a & set_b), len(set_a | set_b))

def DiceDist(A, B):
    """Dice score of two collections, compared as sets.

    Returns twice the size of the intersection divided by the sum of the two
    set sizes, computed through try_divide (so two empty inputs give its
    default value).
    """
    set_a = set(A)
    set_b = set(B)
    shared = len(set_a & set_b)
    total_size = len(set_a) + len(set_b)
    return try_divide(2*shared, total_size)

def compute_dist(A, B, dist="jaccard_coef"):
    """Compute the selected set-similarity measure between A and B.

    Args:
        A, B: collections of hashable items (converted to sets by the metric).
        dist: "jaccard_coef" to use JaccardCoef, or "dice_dist" to use DiceDist.

    Returns:
        The value returned by the selected metric function.

    Raises:
        ValueError: if `dist` is not one of the two supported names.
            Previously an unknown name surfaced as an UnboundLocalError on
            the return statement.
    """
    if dist == "jaccard_coef":
        return JaccardCoef(A, B)
    if dist == "dice_dist":
        return DiceDist(A, B)
    raise ValueError("unknown dist %r; expected 'jaccard_coef' or 'dice_dist'" % (dist,))

def try_divide(x, y, val=0.0):
    """
    Divide x by y as a float; when y equals zero return `val` instead.
    """
    return float(x) / y if y != 0.0 else val

#### Methods for Tokenizing and Stemming data and also removal of stop words

In [10]:

def stem_tokens(tokens, stemmer):
    """Return a new list holding stemmer.stem(token) for every token, in order."""
    return [stemmer.stem(token) for token in tokens]

def preprocess_data(line):
    """Turn a text into a list of cleaned, stemmed tokens.

    Steps, in order: tokenize with `w`, keep purely alphabetic tokens and
    lower-case them, stem them with `english_stemmer`, then drop every stem
    that appears in `stopwords`.
    """
    lowered = [token.lower() for token in w.tokenize(line) if token.isalpha()]
    return [stemmed for stemmed in stem_tokens(lowered, english_stemmer)
            if stemmed not in stopwords]

def preprocess(line):
    """Return the tokens produced by preprocess_data joined with single spaces."""
    cleaned_tokens = preprocess_data(line)
    return " ".join(cleaned_tokens)

def get_position_list(target, obs):
    """
        List the 1-based positions of the items of obs that also occur in
        target. Returns [0] when obs is empty or nothing matches.
    """
    if len(obs) == 0:
        return [0]
    matches = [position for position, item in enumerate(obs, start=1) if item in target]
    return matches if len(matches) != 0 else [0]

#### The feature engineering below is applied to the description and legaldescription columns. The function extracts counting features from their unigrams, bigrams and trigrams: the n-gram count, the count and ratio of unique n-grams, and the count and ratio of digits in each of the two columns.
#### Missing indicator: a binary flag indicating whether the description (or legal description) is empty.


In [11]:
def extract_count_feat(df):
    """Add n-gram counting, overlap and position features to `df` in place.

    For the text columns "description" and "legaldescription" this adds:
      * <col>_unigram / _bigram / _trigram: lists of n-grams per row;
      * count_of_*, count_of_unique_*, ratio_of_unique_*: n-gram counts;
      * count_of_digit_in_*, ratio_of_digit_in_*: digit-token counts;
      * <col>_missing: 1 when the row has no unigram at all, else 0;
      * count_of_A_<gram>_in_B, ratio_of_A_<gram>_in_B: how many n-grams of
        one column also occur in the other;
      * pos_of_A_<gram>_in_B_{min,mean,median,max,std} and the normalized_
        variants (divided by the n-gram count of A).

    Args:
        df: DataFrame with string columns "description" and
            "legaldescription". It is modified in place.

    Returns:
        None.
    """
    join_str = "_"
    feat_names = ["description", "legaldescription"]
    grams = ["unigram", "bigram", "trigram"]

    def ratios(numerators, denominators):
        # Element-wise try_divide, returned as a list so the column
        # assignment behaves the same on Python 2 and Python 3.
        return [try_divide(num, den) for num, den in zip(numerators, denominators)]

    def count_digit(tokens):
        return sum([1. for tok in tokens if tok.isdigit()])

    def count_hits(obs_grams, target_grams):
        # Build the lookup set once per row instead of once per n-gram.
        target_set = set(target_grams)
        return sum([1. for item in obs_grams if item in target_set])

    # The n-gram columns are built by iterating over the source column
    # directly. Row-wise df.apply is avoided: the old code needed a bare
    # `except:` fallback around it for the description bigrams.
    ## unigram
    print("generate unigram")
    for feat_name in feat_names:
        df[feat_name + "_unigram"] = [preprocess_data(text) for text in df[feat_name]]
    ## bigram
    print("generate bigram")
    for feat_name in feat_names:
        df[feat_name + "_bigram"] = [getBigram(tokens, join_str) for tokens in df[feat_name + "_unigram"]]
    ## trigram
    print("generate trigram")
    for feat_name in feat_names:
        df[feat_name + "_trigram"] = [getTrigram(tokens, join_str) for tokens in df[feat_name + "_unigram"]]

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")
    for feat_name in feat_names:
        for gram in grams:
            gram_lists = df[feat_name + "_" + gram]
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## word count
            df[count_col] = [len(items) for items in gram_lists]
            df[unique_col] = [len(set(items)) for items in gram_lists]
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = ratios(df[unique_col], df[count_col])

        ## digit count
        # NOTE(review): preprocess_data keeps only alphabetic tokens, so no
        # unigram can satisfy isdigit() and this count looks to be always 0.
        # Left as is; counting digits in the raw text would change the
        # feature definition -- confirm what was intended.
        digit_col = "count_of_digit_in_%s" % feat_name
        df[digit_col] = [count_digit(tokens) for tokens in df[feat_name + "_unigram"]]
        df["ratio_of_digit_in_%s" % feat_name] = ratios(df[digit_col], df["count_of_%s_unigram" % (feat_name)])

    ## description missing indicator
    # The unigram columns hold lists, so "missing" means an empty list. The
    # previous comparison against "" was never true and produced all zeros.
    df["description_missing"] = [int(len(tokens) == 0) for tokens in df["description_unigram"]]
    df["legaldescription_missing"] = [int(len(tokens) == 0) for tokens in df["legaldescription_unigram"]]

    ##############################
    ## intersect word count ##
    ##############################
    #### Count & Ratio of a's n-gram in b's n-gram
    print("generate intersect word counting features")
    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name != obs_name:
                    hits_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                    df[hits_col] = [
                        count_hits(obs_grams, target_grams)
                        for obs_grams, target_grams in zip(df[obs_name + "_" + gram], df[target_name + "_" + gram])
                    ]
                    df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = ratios(df[hits_col], df["count_of_%s_%s" % (obs_name, gram)])

    ######################################
    ## intersect word position feat ##
    ######################################
    #### Statistics of positions of a's n-gram in b's n-gram: for the
    #### intersecting n-grams the positions are recorded and summarized by
    #### min, mean, median, max and standard deviation.
    #### The normalized variants divide each statistic by the n-gram count of a.
    position_stats = [("min", np.min), ("mean", np.mean), ("median", np.median),
                      ("max", np.max), ("std", np.std)]
    print("generate intersect word position features")
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name != obs_name:
                    pos = [
                        get_position_list(target_grams, obs=obs_grams)
                        for target_grams, obs_grams in zip(df[target_name + "_" + gram], df[obs_name + "_" + gram])
                    ]
                    prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                    ## stats feat on pos
                    for stat_name, stat_func in position_stats:
                        df["%s_%s" % (prefix, stat_name)] = [stat_func(positions) for positions in pos]
                    ## stats feat on normalized_pos
                    for stat_name, _unused in position_stats:
                        df["normalized_%s_%s" % (prefix, stat_name)] = ratios(df["%s_%s" % (prefix, stat_name)], df["count_of_%s_%s" % (obs_name, gram)])

# Jaccard coefficient
# JaccardCoef(A,B) = |A∩B|/|A∪B|

# and Dice distance
# DiceDist(A,B) = 2|A∩B|/(|A|+|B|)

# are used as distance metrics, where A and B denote two sets respectively. 
# For each distance metric, two types of features are computed. 
def extract_basic_distance_feat(df):
    """Add Jaccard and Dice features between the two text columns, in place.

    For every metric ("jaccard_coef", "dice_dist") and every n-gram size
    (unigram, bigram, trigram) a column named
    "<metric>_of_<gram>_between_<target>_<obs>" is added, holding
    compute_dist of the two n-gram lists of each row. The n-gram columns
    written by extract_count_feat must already exist. Returns None.
    """
    ## jaccard coef/dice dist of n-gram
    print("generate jaccard coef and dice dist for n-gram")
    feat_names = ["description", "legaldescription"]
    # Every unordered pair of text columns (a single pair for two columns).
    name_pairs = [
        (feat_names[first], feat_names[second])
        for first in range(len(feat_names)-1)
        for second in range(first+1, len(feat_names))
    ]
    for dist in ["jaccard_coef", "dice_dist"]:
        for gram in ["unigram", "bigram", "trigram"]:
            for target_name, obs_name in name_pairs:
                target_col = target_name+"_"+gram
                obs_col = obs_name+"_"+gram
                row_values = df.apply(lambda row: compute_dist(row[target_col], row[obs_col], dist), axis=1)
                df["%s_of_%s_between_%s_%s"%(dist,gram,target_name,obs_name)] = list(row_values)

# The below function created TF-IDF matrix for the columns description and legal description. This may also be called 
# Bag of Words feature extraction.
                        
def vectorize(train, tfv_query=None):
    """Build the final sparse feature matrix for one DataFrame.

    The cleaned "description" and "legaldescription" texts are turned into
    TF-IDF matrices and stacked horizontally with all columns of the frame.

    Args:
        train: DataFrame holding the raw text columns plus the engineered
            features. NOTE: it is modified in place -- the non-numeric
            columns listed below are overwritten with integer codes.
        tfv_query: an already fitted TfidfVectorizer to reuse; when None a
            new one is created and fitted on both text columns of `train`.

    Returns:
        (matrix, tfv_query): the sparse.hstack of [TF-IDF(description),
        TF-IDF(legaldescription), frame values] and the vectorizer used.
    """
    # TF-IDF inputs are prepared first, because the label encoding further
    # down replaces the text columns with integers.
    desc_data = list(train['description'].apply(preprocess))
    legdesc_data = list(train['legaldescription'].apply(preprocess))

    if tfv_query is None:
        tfv_query = TfidfVectorizer(min_df=3,  max_features=None,   
                strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
                ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
                stop_words = stopwords)
        # One shared vocabulary is learned from both text columns.
        tfv_query.fit(desc_data + legdesc_data)

    # XGBoost does not (yet) handle categorical features automatically, so
    # these columns are replaced by integer codes.
    # See http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
    # NOTE(review): the encoder is refit on whichever frame is passed in, so
    # the codes from a training call and a test call are not guaranteed to
    # agree -- confirm this is acceptable.
    nonnumeric_columns = ['licensetype', 'businessname',  'subtype', 'description_unigram', 'legaldescription_unigram', 'legaldescription', 'description',
                'description_bigram', 'legaldescription_bigram', 'description_trigram', 'legaldescription_trigram']
    le = LabelEncoder()
    for feature in nonnumeric_columns:
        train[feature] = le.fit_transform(train[feature])

    # Stack the TF-IDF blocks next to the remaining frame columns.
    tfidf_desc = tfv_query.transform(desc_data)
    tfidf_legdesc = tfv_query.transform(legdesc_data)
    csr_train = sparse.csr_matrix(train.values)
    return sparse.hstack([tfidf_desc, tfidf_legdesc, csr_train]), tfv_query



#### Extract Training and test features using the above methods. Xtrain is our final training matrix and ytrain is the label being predicted. Xtest is our final test matrix.

In [12]:
# Both frames get the same engineered columns (added in place).
for frame in (train, test):
    extract_count_feat(frame)
    extract_basic_distance_feat(frame)

# The TF-IDF vectorizer is fitted on the training frame and reused for the test frame.
Xtrain, tfv_query = vectorize(train)
Xtest, _ = vectorize(test, tfv_query)

xgb_model = XGBClassifier()

#### We use the xgboost (https://github.com/dmlc/xgboost) library created by the Distributed (Deep) Machine Learning Community. It is an optimized general-purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. It implements machine learning algorithms under the Gradient Boosting framework, including Generalized Linear Model (GLM) and Gradient Boosted Decision Trees (GBDT).

In [13]:

# GridSearchCV from sklearn is used to find the best parameters for the model.
# The search takes a long time, so the grid below holds only the best
# combination found while testing, which gave an accuracy of 99.4079%:
# tree size (max_depth) = 6, number of estimators = 500, learning_rate = 0.25.
#
# This cell is live code and is slow to run; skip it if you only need the
# final model.

xgb_model = XGBClassifier() 
clf = GridSearchCV(
    xgb_model,
    {
        'max_depth': [6],
        # tried with 50, 100, 1000 as well but best 2 parameters are 200 and 500
        'n_estimators': [500],
        'learning_rate': [0.25],
    },
    verbose=1,
)
clf.fit(Xtrain, ytrain)
print(clf.best_score_)
print(clf.best_params_)


In [14]:
# Another way of doing cross-validation: xgboost's predefined cv function,
# run here as a 5-fold cross-validation (nfold=5). The results are printed
# in the next cell.
# NOTE(review): num_boost_round and early_stopping_rounds are both 10, which
# leaves early stopping little room to trigger -- confirm these values.

dm = xgb.DMatrix(Xtrain, label=ytrain)
params = {
    'max_depth': 6,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}
cv = xgb.cv(params, dm, num_boost_round=10, nfold=5, early_stopping_rounds=10)

In [15]:
# Show the cross-validation results returned by xgb.cv.
print(cv)

In [16]:
# Final model: train with the best parameters found above, then predict the
# labels of the test matrix.

xgb_ = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators = 500)
xgb_ = xgb_.fit(Xtrain, ytrain)
ytest_pred = xgb_.predict(Xtest)


In [17]:
# Write the predicted test labels to disk, one value per line.
np.savetxt(fname="./data/ytest_pred.csv", X=ytest_pred, delimiter=",")