In [2]:
#loading the .csv file into an SFrame
import graphlab
import pandas
# Read the raw training samples: a UTF-16 encoded CSV whose fields are quoted with '"'.
products_pd = pandas.read_csv(
    'training-samples.csv',
    quotechar='"',
    encoding='utf-16',
)


# Normalise the labels before handing the data over to GraphLab.

# 'plus size' is folded into the 'womens' class.
products_pd.loc[products_pd['gender'] == 'plus size', 'gender'] = 'womens'

# Missing cells become the literal string 'None', so rows without a gender
# can be filtered out by value below.
# NOTE: `pd` here is the filled DataFrame, not an alias for the pandas module.
pd = products_pd.fillna(value='None')

# Convert the pandas DataFrame into an SFrame.
products = graphlab.SFrame(data=pd)

# Drop every row whose gender label is excluded from the task; 'None' marks
# rows whose gender was missing in the CSV. Filters run in the original order.
for excluded_gender in ('kids', 'None', 'boys', 'baby girls', 'baby boys'):
    products = products[products['gender'] != excluded_gender]

In [3]:
#function to remove punctuation from all text
def remove_punc(str):
    """Return ``str`` with every ASCII punctuation character removed.

    The characters dropped are exactly those in ``string.punctuation``; all
    other characters (letters, digits, whitespace, non-ASCII text) are kept
    in their original order.

    The previous implementation used ``str.translate(None, chars)``, which is
    only valid for Python 2 byte strings and raises ``TypeError`` for Python 2
    ``unicode`` objects and for every Python 3 ``str``. Filtering character by
    character behaves the same on all three.

    Parameters
    ----------
    str : text string
        The text to clean. (The parameter name shadows the built-in ``str``;
        it is kept so existing callers are unaffected.)

    Returns
    -------
    text string
        ``str`` without punctuation; an empty input yields an empty string.
    """
    import string
    # Set membership is O(1) per character.
    punctuation = frozenset(string.punctuation)
    return ''.join(ch for ch in str if ch not in punctuation)

#function to remove numbers from all text
def remove_numbers(str):
    """Return ``str`` with every ASCII digit (``0``-``9``) removed.

    The characters dropped are exactly those in ``string.digits``; all other
    characters are kept in their original order.

    The previous implementation used ``str.translate(None, chars)``, which is
    only valid for Python 2 byte strings and raises ``TypeError`` for Python 2
    ``unicode`` objects and for every Python 3 ``str``. Filtering character by
    character behaves the same on all three.

    Parameters
    ----------
    str : text string
        The text to clean. (The parameter name shadows the built-in ``str``;
        it is kept so existing callers are unaffected.)

    Returns
    -------
    text string
        ``str`` without digits; an empty input yields an empty string.
    """
    import string
    # Set membership is O(1) per character.
    digits = frozenset(string.digits)
    return ''.join(ch for ch in str if ch not in digits)
    
# Text columns that are stripped of punctuation and digits during pre-processing.
features = [
    'category',
    'sub_category',
    'sub_sub_category',
    'description',
    'gender',
    'title',
]

In [4]:
#pre-processing of the training data

# Strip punctuation first, then digits, from every text column in `features`.
for f in features:
    products[f] = products[f].apply(remove_punc)
    products[f] = products[f].apply(remove_numbers)

# Build the bag-of-words column from 'description' followed by 'title'.
# A single space is inserted between the two columns: concatenating them
# directly fused the last word of the description with the first word of the
# title into one bogus token and lost both real words from the counts.
# NOTE: any other data fed to the model must build 'wordcount' the same way,
# otherwise its features will not line up with the training features.
products['wordcount'] = graphlab.text_analytics.count_words(
    products['description'] + ' ' + products['title'])

# Drop stop words from the counts.
products['wordcount'] = products['wordcount'].dict_trim_by_keys(
    graphlab.text_analytics.stopwords(), exclude=True)

# Keep only the words that occur at least twice in a row's text.
products['wordcount'] = products['wordcount'].dict_trim_by_values(2)


In [5]:
# Split the cleaned training set into a training part and a held-out part.
# The first frame is built with fraction 0.8; the seed is fixed so the split
# is reproducible. `test_data` is used as the validation set for the model,
# not as the final test set.

train_data, test_data = products.random_split(fraction=0.8, seed=0)

In [6]:
# Train a logistic classifier that predicts 'gender' from the word counts and
# the three category columns. `test_data` (the held-out part of the training
# split) serves as the validation set; both L1 and L2 penalties are set to 30
# and training is capped at 50 iterations.

logistic_classifier_model = graphlab.logistic_classifier.create(
    train_data,
    target='gender',
    features=['wordcount', 'category', 'sub_category', 'sub_sub_category'],
    validation_set=test_data,
    l1_penalty=30,
    l2_penalty=30,
    max_iterations=50)

In [7]:
# Load the test set from a JSON-lines file and clean its labels.

test = graphlab.SFrame.read_json('test_dataset.jsonl', orient='lines')

# Drop the rows labelled 'baby boys', 'baby girls' or 'boys', in that order.
# NOTE(review): the training-set cleaning additionally removes 'kids' and
# 'None' and remaps 'plus size' to 'womens'; none of that is done here.
# Confirm the test file cannot contain those labels.
for excluded_gender in ('baby boys', 'baby girls', 'boys'):
    test = test[test['gender'] != excluded_gender]

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [8]:
#pre-processing test dataset

# Strip punctuation first, then digits, from every text column in `features`.
for f in features:
    test[f] = test[f].apply(remove_punc)
    test[f] = test[f].apply(remove_numbers)

# Build the bag-of-words column from 'description' followed by 'title'.
# A single space is inserted between the two columns: concatenating them
# directly fused the last word of the description with the first word of the
# title into one bogus token and lost both real words from the counts.
# NOTE: the data the model was trained on must build 'wordcount' the same
# way, otherwise these features will not line up with the model's features.
test['wordcount'] = graphlab.text_analytics.count_words(
    test['description'] + ' ' + test['title'])

# Drop stop words from the counts.
test['wordcount'] = test['wordcount'].dict_trim_by_keys(
    graphlab.text_analytics.stopwords(), exclude=True)

# Keep only the words that occur at least twice in a row's text.
test['wordcount'] = test['wordcount'].dict_trim_by_values(2)


In [9]:
# Classify the test set and evaluate the model against its 'gender' labels.

# Predictions for every test row.
predictions = logistic_classifier_model.classify(dataset=test)

# Evaluation results of the model on the same test rows.
results = logistic_classifier_model.evaluate(dataset=test)

In [10]:
# Send GraphLab Canvas visualisations to the 'ipynb' target.
graphlab.canvas.set_target('ipynb')

# Open the model's 'Evaluation' view.
# NOTE(review): the original comment said this evaluates the model on the
# test set, but neither `test` nor `results` is passed here -- confirm which
# data this view actually reflects.
logistic_classifier_model.show(view='Evaluation')