# Step 1 Import packages and download nltk dictionaries

In [2]:
import pandas as pd
import numpy as np
# import libraries for text processing
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
# The stop-word corpus is read later through stopwords.words('english');
# without this download a fresh environment fails with LookupError there.
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import WordNetLemmatizer
# import libraries for building model
try:
    # Location of train_test_split since scikit-learn 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the legacy module.
    from sklearn.cross_validation import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shikhatyagi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shikhatyagi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shikhatyagi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




## List of  helper functions

### 1) Function to read and subsample data 
read_and_subsample_data(infile,separator,subsample,restSample,videoSample,musicSample,bookSample)


In [3]:
def read_and_subsample_data(infile,separator,subsample,restSample,videoSample,musicSample,bookSample):
    """Read a delimited file and optionally sub-sample each label class.

    Parameters
    ----------
    infile : path or file-like object handed to ``pd.read_csv``.
    separator : field delimiter, passed as ``sep``.
    subsample : when truthy, sample every class by its own fraction;
        otherwise the complete frame is returned.
    restSample, videoSample, musicSample, bookSample : fraction of rows to
        keep for the "rest", "videos", "music" and "books" labels
        (passed to ``DataFrame.sample(frac=...)``).

    Returns
    -------
    DataFrame with missing values replaced by ''. When sub-sampling, only
    rows labelled rest/books/music/videos are kept; the per-class samples
    are concatenated in that order and each keeps its own 0-based index.
    """
    train = pd.read_csv(infile,sep=separator).fillna('')
    if not subsample:
        return train

    # One (label, fraction) pair per class. The earlier version stored the
    # "videos" sample under the music variable, which silently dropped all
    # music rows and concatenated the video rows twice.
    class_fractions = [
        ("rest", restSample),
        ("books", bookSample),
        ("music", musicSample),
        ("videos", videoSample),
    ]
    sampled_parts = [
        train[train["label"]==label].sample(frac=fraction).reset_index(drop=True)
        for label, fraction in class_fractions
    ]
    return pd.concat(sampled_parts)

### 2) Function to keep only keys from  "additional attribute" column

In [4]:
def additional_attributes_key_filter(x):
    """Join the even-position items of ``x`` into one string.

    Every kept item is followed by a single space, so a non-empty result
    always ends with a trailing space; an empty ``x`` yields ''.
    """
    return "".join(
        token + " "
        for position, token in enumerate(x)
        if position % 2 == 0
    )

### 3) Function to preprocess text data
1. Remove stop words
2. Remove words of length < 3
3. Lemmatize text
4. Create a dictionary with each word appearing in the text as key and its number of occurrences as value

In [5]:
def text_pre_processing(x):
    """Turn an iterable of tokens into a Counter of lemmatized tokens.

    A token is kept when its lower-cased form is not in the module-level
    ``stop_words`` and its whitespace-stripped form has at least 3
    characters. Kept tokens are stripped and passed through the module-level
    ``wordnet_lemmatizer`` before being counted.
    """
    kept_tokens = [
        wordnet_lemmatizer.lemmatize(token.strip())
        for token in x
        if token.lower() not in stop_words and len(token.strip()) >= 3
    ]
    return Counter(kept_tokens)

### 4) Function to get combined key value pair across all columns


In [6]:
def combined_key_value_pair(attributeKeys):
    """Count in how many row containers each key occurs.

    ``attributeKeys`` is an iterable of iterables (e.g. one dict per row).
    Each key yielded by a row adds 1 to that key's total; the values stored
    inside the row are not used.
    """
    occurrences = {}
    for row_keys in attributeKeys:
        for name in row_keys:
            occurrences[name] = occurrences.get(name, 0) + 1
    return occurrences
                

### 5) Function to filter infrequent keys from attribute dictionary

In [7]:
def combined_key_value_pair_filter(dict1,value_count_threshold):
    """Return the entries of ``dict1`` whose value is strictly greater than
    ``value_count_threshold``; the input mapping is left unchanged."""
    return {
        name: count
        for name, count in dict1.items()
        if count > value_count_threshold
    }
    

### 6) Function to filter infrequent keys from  additionalAttributes and breadcrumbs

In [8]:
def filterKeys(x,dictionary):
    """Map every key of ``x`` that is also contained in ``dictionary`` to 1.

    The result is a presence indicator: the values held by ``x`` are
    discarded and keys missing from ``dictionary`` are left out.
    """
    return {name: 1 for name in x if name in dictionary}

## Step 2 Read training data

In [9]:
# Show full cell contents when frames are displayed.
# NOTE(review): newer pandas releases expect None rather than -1 here — confirm
# against the installed version before changing it.
pd.set_option('display.max_colwidth', -1)

path = "/Users/shikhatyagi/Downloads/"
trainFile = "train_data.csv"
# subsample=False, so the four sampling fractions are ignored and every row is kept
train = read_and_subsample_data(path+trainFile,",",False,0.3,1,1,1)


# Module-level objects used by text_pre_processing
wordnet_lemmatizer = WordNetLemmatizer()
# get stop word list and remove it from sentences 
stop_words = set(stopwords.words('english'))

# Work on an independent copy: the following cells add columns to finalData,
# and doing that on a slice of `train` raises SettingWithCopyWarning.
finalData = train[["additionalAttributes","breadcrumbs","label"]].copy()
print(finalData.shape)

(603201, 3)


## Step 3 Feature Engineering

### Get bag of words from additionalAttributes 

In [10]:
# Split every raw attribute string with the pattern ";|=" (presumably treated as a regex: split on ';' or '=')
finalData["additionalAttributes_split"] = finalData["additionalAttributes"].str.split(";|=")
# Keep the even-position tokens joined by spaces (presumably the attribute names of key=value pairs — TODO confirm)
finalData["additionalAttributes_split1"] = finalData["additionalAttributes_split"].apply(additional_attributes_key_filter)
# Replace punctuation, digits, whitespace runs and apostrophes with a space, then split on single spaces.
# NOTE(review): the two bare `$` alternatives are regex end-of-string anchors, not a literal dollar sign — confirm intent.
finalData["additionalAttributes_split2"] = finalData["additionalAttributes_split1"].str.replace(':|\\?|=|-|\\[|\\]|\\(|\\)|\\.|>|[0-9]|"|;|$|$|\\*|/|&|,|\\s+|\\\'s|\\\''," ").str.split(" ")
# Drop stop words and short tokens, lemmatize, and count tokens per row (one Counter per row)
finalData["additionalAttributes_split3"] = finalData["additionalAttributes_split2"].apply(text_pre_processing)

additonalAttributeKeys =finalData["additionalAttributes_split3"].tolist()
# Create the consolidated dictionary: for every token, the number of rows it appears in
additioanl_attribute_key_pair = combined_key_value_pair(additonalAttributeKeys)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [11]:
#Filter infrequent keys from consolidated dictionary 
# Keep only the keys counted in more than 450 rows of the consolidated dictionary
additioanl_attribute_key_pair_filtered = combined_key_value_pair_filter(
    additioanl_attribute_key_pair,
    450,
)
# Reduce each row's token dictionary to the surviving keys (each stored as 1)
finalData["additionalAttributes_split4"] = finalData["additionalAttributes_split3"].apply(
    lambda row_tokens: filterKeys(row_tokens, additioanl_attribute_key_pair_filtered)
)

aATList = list(finalData["additionalAttributes_split4"])
print("List conversion complete")

# One column per surviving key; a key missing from a row becomes 0
aATDF = pd.DataFrame(aATList).fillna(0)

List conversion complete


In [20]:
# Preview the first rows of the additionalAttributes feature matrix
aATDF.head()

Unnamed: 0,ASIN,Actors,Age,Amazon,April,Artist,Arts,Aspect,Assembled,Audience,...,format,label,manufacturer_part_number,minute,page,rank,screen_format,size,tape,weight
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Get bag of words from breadcrumbs 

In [21]:
# Replace punctuation, digits, whitespace runs and apostrophes with a space, then split on single spaces.
# NOTE(review): the two bare `$` alternatives are regex end-of-string anchors, not a literal dollar sign — confirm intent.
finalData["breadcrumbs_split"] = finalData["breadcrumbs"].str.replace(':|\\?|=|-|\\[|\\]|\\(|\\)|\\.|>|[0-9]|"|;|$|$|\\*|/|&|,|\\s+|\\\'s|\\\''," ").str.split(" ")
# Drop stop words and short tokens, lemmatize, and count tokens per row (one Counter per row)
finalData["breadcrumbs_split1"] = finalData["breadcrumbs_split"].apply(text_pre_processing)
breadcrumbsKeys =finalData["breadcrumbs_split1"].tolist()
# Consolidated dictionary: for every token, the number of rows it appears in
breadcrumbs_key_pair = combined_key_value_pair(breadcrumbsKeys)
# Infrequent keys are filtered out of this consolidated dictionary in the next cell

In [22]:
# Keep only the keys counted in more than 2550 rows of the consolidated dictionary
breadcrumbs_key_pair_filtered = combined_key_value_pair_filter(
    breadcrumbs_key_pair,
    2550,
)
# Reduce each row's token dictionary to the surviving keys (each stored as 1)
finalData["breadcrumbs_split2"] = finalData["breadcrumbs_split1"].apply(
    lambda row_tokens: filterKeys(row_tokens, breadcrumbs_key_pair_filtered)
)

# Convert the breadcrumbs_split2 column to a list of dictionaries
brCList = list(finalData["breadcrumbs_split2"])
# One column per surviving key; a key missing from a row becomes 0
brCDF = pd.DataFrame(brCList).fillna(0)

In [24]:
# (rows, columns) of the breadcrumbs feature matrix
brCDF.shape

(603201, 219)

### Combine breadcrumbs and additionalAttributes bag of words

In [25]:
# Keep the additionalAttributes columns whose names are not already present
# among the breadcrumbs columns (for a shared name the breadcrumbs column wins).
# Walking aATDF.columns instead of taking a set difference keeps the column
# order stable across kernel restarts; the iteration order of a set of strings
# changes with Python's hash randomisation, which made the feature matrix
# layout (and therefore the trained model) non-reproducible.
different_attribute_list = [column for column in aATDF.columns if column not in brCDF.columns]
different_attribute_df = aATDF[different_attribute_list]
final_train_df = pd.concat([different_attribute_df,brCDF],axis=1)
print(final_train_df.shape)

(603201, 420)


### Encode categorical class label to numeric

In [26]:
# Import label encoder to encode categorical label to numeric values
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(finalData["label"].tolist())
le1=le.transform(finalData["label"].tolist()) 

### Train and test split

In [27]:
# Hold out 20% of the rows for validation; random_state=0 fixes the shuffle
x_train, x_valid, y_train, y_valid = train_test_split(final_train_df, le1, test_size=0.20, random_state=0)
# Report the size of both partitions
print('Train samples: {} Validation samples: {}'.format(len(x_train), len(x_valid)))

Train samples: 482560 Validation samples: 120641


## Step 4 - Model Training using XGBOOST multi class classifier

### 1) XGBOOST parameter initialization

In [40]:
# Set xgboost parameters
params = {}
params['objective'] = "multi:softprob"
params['eta'] = 0.008
params['silent'] = True
params['max_depth'] = 4
params['subsample'] = 0.9
params['colsample_bytree'] = 0.9
params['colsample_bylevel']=0.9
params['min_child_weight']=5
params['n_estimators']=500
params['learning_rate']=0.01
params['n_jobs']=-1
params['nthread'] = 24
params['num_class'] = 4

### 2) Fit model using above parameters

In [41]:
# Initialize XGBOOST model
model = XGBClassifier(**params)

# Train XGBOOST model
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
       colsample_bytree=0.9, eta=0.008, gamma=0, learning_rate=0.01,
       max_delta_step=0, max_depth=4, min_child_weight=5, missing=None,
       n_estimators=500, n_jobs=-1, nthread=24, num_class=4,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.9)

### 3) Cross validation using GridSearchCV - not used due to longer processing time
#clf = GridSearchCV(model, param_grid= params,scoring='accuracy',verbose=2, refit=True,cv =5)
#clf.fit(x_train,y_train)

## Step 5 - Check accuracy

In [None]:
#print('Best score for data1:', clf.best_score_)

In [44]:
# Score predictions against the known labels (true labels first, predictions
# second); the second line uses the held-out validation partition.
print("Accuracy score on training data", accuracy_score(y_train, model.predict(x_train)))
print("Accuracy score on test data", accuracy_score(y_valid, model.predict(x_valid)))

Accuracy score on training data 0.999747181698
Accuracy score on test data 0.999792773601


## Step 6 -  Generate predictions on evaluation data

### Read evaluation data

In [81]:
path = "/Users/shikhatyagi/Downloads/"
evaluation_file = "evaluation.csv"
# subsample=False, so the four sampling fractions are ignored and every row is kept
test = read_and_subsample_data(path+evaluation_file,",",False,0.3,1,1,1)
# fillna() returns a new frame, so test_data is independent of `test`.
# Filling a column slice in place raised SettingWithCopyWarning here and
# again when the following cells added columns to test_data.
test_data = test[["additionalAttributes","breadcrumbs","id"]].fillna('')
print(test_data.shape)

(442041, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


### Feature Engineering similar to previous steps

In [82]:
# Column labels of the training feature matrix; used below to restrict and order the evaluation features
train_data_column = final_train_df.columns

### Bag of words creation for additionalAttributes

In [83]:
# Same text pipeline as applied to the training additionalAttributes column
test_data["additionalAttributes_split"] = test_data["additionalAttributes"].str.split(";|=")
test_data["additionalAttributes_split1"] = test_data["additionalAttributes_split"].apply(additional_attributes_key_filter)
test_data["additionalAttributes_split2"] = test_data["additionalAttributes_split1"].str.replace(':|\\?|=|-|\\[|\\]|\\(|\\)|\\.|>|[0-9]|"|;|$|$|\\*|/|&|,|\\s+|\\\'s|\\\''," ").str.split(" ")
test_data["additionalAttributes_split3"] = test_data["additionalAttributes_split2"].apply(text_pre_processing)
additional_attribute_keys = test_data["additionalAttributes_split3"].tolist()

# Keep only tokens that are training features and mark them as present (1).
# The training rows were encoded by filterKeys as 0/1 indicators; copying the
# raw token counts here gave the model values it was never trained on.
list_new = [filterKeys(ele, train_data_column) for ele in additional_attribute_keys]

test_df1 = pd.DataFrame(list_new)
test_df1.fillna(0,inplace=True)

print(test_df1.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Bag of words creation for breadcrumbs

In [95]:
# Same text pipeline as applied to the training breadcrumbs column
test_data["breadcrumbs_split"] = test_data["breadcrumbs"].str.replace(':|\\?|=|-|\\[|\\]|\\(|\\)|\\.|>|[0-9]|"|;|$|$|\\*|/|&|,|\\s+|\\\'s|\\\''," ").str.split(" ")
test_data["breadcrumbs_split1"] = test_data["breadcrumbs_split"].apply(text_pre_processing)
breadcrumbs_keys =test_data["breadcrumbs_split1"].tolist()

# Keep only tokens that are training features and mark them as present (1).
# The training rows were encoded by filterKeys as 0/1 indicators; copying the
# raw token counts here gave the model values it was never trained on.
list_new1 = [filterKeys(ele, train_data_column) for ele in breadcrumbs_keys]

test_df2 = pd.DataFrame(list_new1)
test_df2.fillna(0,inplace=True)

### Keep unique columns in the same order as in the training data

In [106]:
# Keep the additionalAttributes columns whose names are not already present
# among the breadcrumbs columns (for a shared name the breadcrumbs column wins).
different_attribute_list = [column for column in test_df1.columns if column not in test_df2.columns]
different_attribute_df = test_df1[different_attribute_list]
final_test_df = pd.concat([different_attribute_df,test_df2],axis=1)
# Align with the training layout: same columns, same order. A training feature
# that no evaluation row contains has no column yet; it is added as all zeros
# (the token is absent everywhere) instead of causing a KeyError at predict time.
final_test_df = final_test_df.reindex(columns=train_data_column, fill_value=0)
print(final_test_df.shape)

(442041, 420)


### Get predictions on evaluation data using the trained model

In [107]:
# Predict with the evaluation features arranged in the training column order
validation_pred = model.predict(final_test_df[train_data_column])

# Build the submission frame: row ids plus the predicted labels decoded back
# to their original class names, then write it to CSV without the index
final_validation = pd.DataFrame({"id": test_data["id"]})
final_validation["class"] = le.inverse_transform(validation_pred)
final_validation.to_csv(
    "/Users/shikhatyagi/Downloads/submissions.csv",
    sep=",",
    index=False,
)