# Stackline - DS Assignment

### Import functions

In [230]:
import math
import os
import urllib
import urllib.request

import h5py
import numpy as np
import pandas as pd
from keras import applications
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Convolution2D
from keras.layers import ZeroPadding2D
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

## Exploratory Data Analysis

#### Reading Data

In [218]:
# Labelled products used to fit the classifiers.
df_train = pd.read_excel('Training_Data_Assessment.xlsx')
# Unlabelled products that the notebook must categorise.
df_test = pd.read_excel('Data To Classify_Assessment.xlsx')
# The category sheet is read with header=None, so its single column is named here.
categories = pd.read_excel('Categories_Assessment.xlsx',header=None,names=['CategoryName'])

In [219]:
# Preview the first rows of the labelled training data.
df_train.head()

Unnamed: 0,ASIN,CategoryName,BrandName,Title,ImageUrl
0,B014FCC4NO,Headphones,01 Audio,"Bluetooth Headphones, Wireless Earbuds Earphon...",http://ecx.images-amazon.com/images/I/31KpP1yO...
1,B00RE20CVO,Cables,1byone,1Byone Lightning to USB Cable 3.28ft (1M) for...,http://ecx.images-amazon.com/images/I/31cldYZD...
2,B00HEZV6AC,Security & Surveillance,1byone,1byone? 7 Inch Colorful LCD Screen Video Doorb...,http://ecx.images-amazon.com/images/I/41by3Sjc...
3,B00ZWOU5R2,Streaming Media,1byone,1byone 5GHz Wireless HDMI Streaming Media Play...,https://images-na.ssl-images-amazon.com/images...
4,B00RFLXE0A,Television Accessories,1byone,1byone Shiny Antenna Super Thin Amplified HDTV...,http://ecx.images-amazon.com/images/I/314oPMta...


In [220]:
# Per-column summary; used below to spot duplicate ASINs (count vs. unique).
df_train.describe()

Unnamed: 0,ASIN,CategoryName,BrandName,Title,ImageUrl
count,6034,6034,6034,6034,6034
unique,6031,63,1480,6009,5774
top,B019ZY1ZWS,Keyboards,Samsung,Quze Adjustable Wooden Laptop Desk Notebook Co...,http://ecx.images-amazon.com/images/I/41KHRFUc...
freq,2,100,155,3,22


In [221]:
# Preview the first rows of the data to classify (no CategoryName column).
df_test.head()

Unnamed: 0,ASIN,BrandName,Title,ImageUrl
0,B005DIRI6I,Portta,Portta Digital Coaxial Toslink to Analog (L/R)...,http://ecx.images-amazon.com/images/I/01KGAAk9...
1,B000OYR9S8,Savage,Savage SV-107X12-56 Seamless Background Paper ...,http://ecx.images-amazon.com/images/I/01OWR5or...
2,B00WT9UV3Q,Avtech,AVTech AVS228 8CH HD-SDI DVR,http://ecx.images-amazon.com/images/I/11%2BWuk...
3,B010F69FRC,Dahua,Dahua NVR4416-P / EX-NVRDR-P Dual Core CPU - 1...,http://ecx.images-amazon.com/images/I/111McGzd...
4,B00U4S0FE4,HP,HP KVM Console G3 Switch 0x1x8 - 8 Ports - USB...,http://ecx.images-amazon.com/images/I/112CVCFg...


In [222]:
# Per-column summary of the data to classify; compare count per column to spot missing values.
df_test.describe()

Unnamed: 0,ASIN,BrandName,Title,ImageUrl
count,57030,55911,57030,57030
unique,57030,6800,56564,54638
top,B00I1CPACM,HP,Misfit Shine - Activity and Sleep Monitor,http://g-ecx.images-amazon.com/images/G/01/x-s...
freq,1,1355,9,50


In [223]:
# Preview the list of allowed category names.
categories.head()

Unnamed: 0,CategoryName
0,Headphones
1,Cables
2,Security & Surveillance
3,Streaming Media
4,Television Accessories


In [224]:
# Series of category names; passed to image_download to decide which class folders are valid.
unique_categories = categories['CategoryName']

**Observations:**

1. Training data: 6031 unique ASIN in 6034 rows. Need to remove duplicates.

2. Testing data: 57030 unique ASIN in 57030 rows. No duplicates found.

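3. No null values observed in the training data. In the test data, `BrandName` has 1,119 missing values (count 55,911 of 57,030 rows); this is harmless here because `BrandName` is not used as a feature.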

#### Removing duplicate ASINs

In [225]:
# Drop rows whose ASIN repeats (pandas' default keeps the first occurrence).
# The original index labels are kept, so they are no longer contiguous afterwards.
# NOTE(review): inplace=True mutates df_train, so the head()/describe() outputs above become stale.
df_train.drop_duplicates('ASIN',inplace=True)

#### Training and Validation Split

In [226]:
# 80/20 stratified split of the labelled data. The "test" names below are the
# validation split; the unlabelled data to classify lives in df_test.
split_features = df_train[['Title','ImageUrl']]
split_labels = df_train[['CategoryName']]
x_df_train, x_df_test, y_df_train, y_df_test = train_test_split(
    split_features,
    split_labels,
    train_size=.8,
    stratify=df_train['CategoryName'],
    random_state=30,
)

## Image Downloading

**Aim**

* To download training, validation and testing images from image url
* The images are stored as
        /train
            /category1
                train_image1.jpg
                train_image2.jpg
            /category2
                train_image1.jpg
                train_image2.jpg
        /validation
            /category1
                validation_image1.jpg
                validation_image2.jpg
            /category2
                validation_image1.jpg
                validation_image2.jpg
        /test
            /New folder
                test_image1.jpg
                test_image2.jpg


#### Path variables

In [None]:
# Image folders live next to the notebook: <cwd>/train, <cwd>/validation, <cwd>/test.
current_path = os.getcwd()
train_path, validation_path, test_path = (
    os.path.join(current_path, split_name)
    for split_name in ('train', 'validation', 'test')
)

#### Function to download training and validation images

In [None]:
def image_download(x_df,y_df,unique_categories,path,category_column='CategoryName',imageurl_column='ImageUrl'):
    """Download each row's image into ``path/<category>/<index label>.jpg``.

    Parameters
    ----------
    x_df : DataFrame holding the image URL column; its index labels name the files.
    y_df : DataFrame holding the category column, indexed like ``x_df``.
    unique_categories : iterable of valid category names (e.g. a Series); rows
        whose category is not among its values are skipped.
    path : root directory under which one sub-folder per category is created.
    category_column : name of the category column in ``y_df``.
    imageurl_column : name of the URL column in ``x_df``.

    Returns
    -------
    None. Files that already exist are not downloaded again. A failed download
    is reported on stdout and does not stop the loop.
    """
    # Build the lookup from the *values*: `name in series` would test the index instead.
    known_categories = set(unique_categories)
    for i in x_df.index:
        category = y_df.loc[i, category_column]
        if category not in known_categories:
            continue
        class_path = os.path.join(path, category)
        os.makedirs(class_path, exist_ok=True)
        fullfilename = os.path.join(class_path, '%s.jpg' % i)
        if os.path.exists(fullfilename):
            continue
        url = x_df.loc[i, imageurl_column]
        try:
            urllib.request.urlretrieve(url, fullfilename)
        except Exception as error:
            # Best effort: report and move on, but let KeyboardInterrupt through.
            print('Did not download:')
            print(i)
            print(url)
            print(error)
            # Remove any truncated file so a later run retries it instead of
            # treating the partial download as complete.
            try:
                os.remove(fullfilename)
            except OSError:
                pass
    return

#### Download Training Images

In [None]:
# Fetch the training-split images into train/<category>/<index label>.jpg.
image_download(x_df=x_df_train,y_df=y_df_train,unique_categories=unique_categories,path=train_path)

#### Download Validation Images

In [None]:
# Fetch the validation-split images (x_df_test/y_df_test) into validation/<category>/<index label>.jpg.
image_download(x_df=x_df_test,y_df=y_df_test,unique_categories=unique_categories,path=validation_path)

#### Download Test Images

In [None]:
# Fetch every image of the data to classify into test/New folder/<index label>.jpg.
# The single sub-folder exists because flow_from_directory expects one level of
# class directories even when there are no labels.
class_path = os.path.join(test_path,'New folder')
os.makedirs(class_path, exist_ok=True)
for i in df_test.index:
    fullfilename = os.path.join(class_path, '%s.jpg' % i)
    if os.path.exists(fullfilename):
        continue  # already downloaded by an earlier run
    url = df_test.loc[i, 'ImageUrl']
    try:
        urllib.request.urlretrieve(url, fullfilename)
    except Exception as error:
        # Best effort: report and move on, but let KeyboardInterrupt through.
        print('Did not download:')
        print(i)
        print(url)
        print(error)
        # Remove any truncated file so a later run retries this image.
        try:
            os.remove(fullfilename)
        except OSError:
            pass

## Text Classification

**Aim**

* To build a pipeline for text features
* To build classifiers for the text features

**Steps in pipeline**

* Convert to lowercase
* Remove stop words
* Convert a collection of text documents to a matrix of token counts
* Transform a count matrix to a normalized tf or tf-idf representation. Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency. This is a common term weighting scheme in information retrieval, that has also found good use in document classification.

**Classifiers used**

* Multinomial Naive Bayes
* Linear SVM with SGD
* Random Forests
* Logistic Regression

### Multinomial NB

In [227]:
# Title text -> token counts (lower-cased, English stop words removed) -> TF-IDF
# -> multinomial Naive Bayes.
nb_steps = [
    ('vect', CountVectorizer(stop_words='english', lowercase=True)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf_multinomial_nb = Pipeline(nb_steps)
text_clf_multinomial_nb.fit(x_df_train['Title'], y_df_train['CategoryName'])
# Mean accuracy on the validation split; shown as the cell output.
text_clf_multinomial_nb.score(x_df_test['Title'], y_df_test['CategoryName'])

0.84672742336371165

### SGD Classifier

In [228]:
# Title text -> token counts -> TF-IDF -> linear model trained with SGD.
# random_state is fixed (same seed as the train/validation split) because SGD
# shuffles the data; without it the score below changes from run to run.
text_clf_sgd = Pipeline([
    ('vect', CountVectorizer(stop_words='english',lowercase=True)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=30)),
])
text_clf_sgd.fit(x_df_train['Title'], y_df_train['CategoryName'])
# Mean accuracy on the validation split; shown as the cell output.
text_clf_sgd.score(x_df_test['Title'],y_df_test['CategoryName'])

0.90306545153272577

### Random Forest Classifier

In [231]:
# Title text -> token counts -> TF-IDF -> random forest.
# random_state is fixed (same seed as the train/validation split) because the
# forest is built from random bootstrap samples and feature subsets; without
# it the score below changes from run to run.
text_clf_rf = Pipeline([
    ('vect', CountVectorizer(stop_words='english',lowercase=True)),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=30)),
])
text_clf_rf.fit(x_df_train['Title'], y_df_train['CategoryName'])
# Mean accuracy on the validation split; shown as the cell output.
text_clf_rf.score(x_df_test['Title'],y_df_test['CategoryName'])

0.84424192212096105

### Logistic Regression Classifier

In [232]:
# Title text -> token counts (lower-cased, English stop words removed) -> TF-IDF
# -> logistic regression.
text_clf_logis = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', lowercase=True)),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])
text_clf_logis.fit(x_df_train['Title'], y_df_train['CategoryName'])
# Mean accuracy on the validation split; shown as the cell output.
text_clf_logis.score(x_df_test['Title'], y_df_test['CategoryName'])

0.87903893951946976

**Note**

* Due to time and computational constraints, I am not going to run a grid search with cross-validation for the above models.
* I believe that a validation accuracy above 80% is sufficient, as the final prediction combines all classifiers by majority vote.

## Image Classification

**Aim**

* To train a classifier over a pre-trained VGG16 model

#### Define Variables

In [6]:
# Size every image is resized to before being fed to VGG16.
img_width, img_height = 150, 150

# File the trained top-classifier weights are saved to.
top_model_weights_path = 'bottleneck_fc_model1.h5'
# Image folders, relative to the working directory (one sub-folder per category).
train_data_dir = 'train'
validation_data_dir = 'validation'
# NOTE(review): the training cell further down passes epochs=30 literally, so
# this value does not appear to be used anywhere in the notebook.
epochs = 50
batch_size = 32

# Shared generator; rescale multiplies each pixel value by 1/255.
datagen = ImageDataGenerator(rescale=1. / 255)

#### Building VGG16 network

In [7]:
# VGG16 with ImageNet weights and include_top=False; used below to compute bottleneck features.
# NOTE(review): the name `model` is rebound to the full classifier in a later cell.
model = applications.VGG16(include_top=False, weights='imagenet')

Running VGG16 is expensive because I am working on a CPU, so I want to run it only once. This prevents me from using data augmentation, which could improve my model by reducing overfitting.

#### Generating Training Feature Set

In [216]:
# Training generator. shuffle=False keeps the batches in the same order as
# train_generator.classes, so features and labels stay aligned.
train_generator = datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode= 'categorical',
    shuffle=False)

# Training variables
nb_train_samples = len(train_generator.filenames)
num_classes = len(train_generator.class_indices)
# Number of batches needed to see every image once (the last batch may be partial).
predict_size_train = int(math.ceil(nb_train_samples / batch_size))

# Creating the training feature set: run the VGG16 base once and cache the result on disk.
bottleneck_features_train = model.predict_generator(
    train_generator, predict_size_train)
np.save('bottleneck_features_train',bottleneck_features_train)

# Training labels, one-hot encoded. The width comes from the number of class
# folders actually found rather than a hard-coded 63.
train_labels = to_categorical(train_generator.classes, num_classes)

Found 4824 images belonging to 63 classes.


#### Generating Validation Feature Set

In [8]:
# Validation generator. shuffle=False keeps the batches in the same order as
# generator.classes, so features and labels stay aligned.
generator = datagen.flow_from_directory(
    validation_data_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode= 'categorical',
    shuffle=False)

# Validation variables
nb_validation_samples = len(generator.filenames)
# Number of batches needed to see every image once (the last batch may be partial).
predict_size_validation = int(math.ceil(nb_validation_samples / batch_size))

# Creating the validation feature set: run the VGG16 base once and cache the result on disk.
bottleneck_features_validation = model.predict_generator(
    generator, predict_size_validation)
np.save('bottleneck_features_validation',bottleneck_features_validation)

# Validation labels, one-hot encoded. The width comes from the class folders
# found under the validation directory rather than a hard-coded 63. If a
# category folder is missing here, the label width no longer matches the
# classifier output and training fails visibly instead of using shifted labels.
num_validation_classes = len(generator.class_indices)
validation_labels = to_categorical(generator.classes, num_validation_classes)

Found 1207 images belonging to 63 classes.


#### Loading Training and Validation feature set

In [38]:
# Load the bottleneck features cached by the two cells above; np.save appended the
# .npy extension to the names given there.
train_data = np.load('bottleneck_features_train.npy')
validation_data = np.load('bottleneck_features_validation.npy')

#### Training top classifier model

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
from keras import optimizers
# RMSprop optimizer with an explicit learning rate; used when compiling the top classifier.
rmsprop = optimizers.RMSprop(lr=0.001)

In [40]:
# Small fully connected classifier trained on the cached VGG16 bottleneck features.
# The architecture must stay identical to the one rebuilt in the final-model cell,
# which loads the weights saved here.
top_model = Sequential([
    Flatten(input_shape=train_data.shape[1:]),
    Dense(256, activation='relu'),
    Dropout(0.6),
    Dense(63, activation='softmax'),
])

top_model.compile(
    optimizer=rmsprop,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

top_model.fit(
    train_data,
    train_labels,
    epochs=30,
    batch_size=batch_size,
    validation_data=(validation_data, validation_labels),
)
# Persist only the weights; the architecture is re-declared wherever they are loaded.
top_model.save_weights(top_model_weights_path)

Train on 4824 samples, validate on 1207 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


#### Making Final Image Recognition Model

In [8]:
# VGG16 convolutional base with a fixed input size, so its output shape is known
# and the dense head can be attached. The size comes from the same constants the
# generators use, so the two cannot drift apart.
model_vgg = applications.VGG16(weights='imagenet', include_top=False,
                               input_shape=(img_width, img_height, 3))

# Same architecture as the top classifier trained above; required for load_weights.
# The literal 63 is kept so this cell can run without the expensive training cells.
top_model = Sequential()
top_model.add(Flatten(input_shape=model_vgg.output_shape[1:]))
top_model.add(Dense(256, activation='relu'))
top_model.add(Dropout(0.6))
top_model.add(Dense(63, activation='softmax'))

# Load from the path the training cell saved to, instead of repeating the file name.
top_model.load_weights(top_model_weights_path)

# End-to-end model: image in, category probabilities out.
# NOTE: this rebinds `model`, which previously held the head-less VGG16 base.
model = Model(inputs = model_vgg.input, outputs = top_model(model_vgg.output))

#### Predict Category based on Test Images

In [None]:
# NOTE(review): reset_index(inplace=True) moves the current index into a column and is
# not idempotent; re-running this cell changes df_test again.
df_test.reset_index(inplace=True)
# Accumulators that the result-collection cell below appends to. Re-running this
# cell empties them.
filename=[]
vgg16_predictions=[]

In [212]:
final_data_dir='test'
test_datagen = ImageDataGenerator(rescale=1./255)
# shuffle=False is essential. flow_from_directory shuffles by default, while
# test_generator.filenames is always in directory order. With shuffling on, row k
# of `predict` would not belong to filenames[k] and every image would be paired
# with another image's prediction.
test_generator = test_datagen.flow_from_directory(
        final_data_dir,
        target_size=(img_width, img_height),
        class_mode=None,
        batch_size=1,
        shuffle=False)

filenames = test_generator.filenames
nb_samples = len(filenames)

# batch_size=1, so one step per image covers the whole directory exactly once.
predict = model.predict_generator(test_generator,steps = nb_samples)

Found 2100 images belonging to 1 classes.


#### Make image classification results into a data frame

In [213]:
# Append this run's file names and predicted class indices to the accumulators.
# Appending rather than rebinding means results from several prediction runs add up.
filename.extend(filenames)
vgg16_predictions.extend(np.argmax(row) for row in predict)

In [214]:
# One row per predicted image: the file stem (name without folder and extension)
# and the predicted class index. The stem is derived with os.path instead of the
# slice f[11:-4], which only worked for a folder name of exactly 10 characters
# and a 4-character extension.
ir_res = pd.DataFrame(
    {'filename': [os.path.splitext(os.path.basename(f))[0] for f in filename],
     'category': vgg16_predictions,
    })
ir_res.to_csv('ir_res.csv')
len(ir_res)

57030

# Final Product Classification

#### Getting category name from index

In [238]:
# Translate predicted class indices back to category names.
x=pd.read_csv('ir_res.csv')
# class_indices maps category name -> class index; invert it once so each
# prediction is a single dictionary lookup instead of a scan over all categories.
index=train_generator.class_indices
y=list(x['category'])
index_to_name = {class_id: name for name, class_id in index.items()}
# As before, an index with no matching category contributes nothing to the list.
categories_p = [index_to_name[i] for i in y if i in index_to_name]

#### Collecting all predictions from all classifiers

In [243]:
# Index df_test by ASIN so the image predictions can be joined on it below.
# NOTE(review): not idempotent; a second run fails because 'ASIN' is no longer a column.
df_test.set_index('ASIN',inplace=True)

In [233]:
# Title-based predictions for the data to classify, one array per text model:
# a = Multinomial NB, b = SGD, c = Random Forest, d = Logistic Regression.
test_titles = df_test['Title']
a, b, c, d = (
    text_model.predict(test_titles)
    for text_model in (text_clf_multinomial_nb, text_clf_sgd, text_clf_rf, text_clf_logis)
)

In [244]:
# Image-model predictions keyed by the image file stem.
# NOTE(review): the join below only matches if those file stems are ASINs, i.e. if
# the test images were saved under df_test's ASIN index labels. Verify, otherwise
# 'CNN Category' ends up all-NaN.
ir_res_new = pd.DataFrame(
    {'ASIN': x['filename'],
     'CNN Category': categories_p,
    })
ir_res_new.set_index('ASIN',inplace=True)

# Left join keeps every df_test row; rows without an image prediction get NaN.
# axis is passed by keyword: pandas 2.0 made it keyword-only in DataFrame.drop, so
# the former positional `1` raises TypeError there.
predictions = df_test.join(ir_res_new).drop(['Title','ImageUrl','BrandName'], axis=1)
# The text predictions are plain arrays in df_test row order, assigned positionally.
predictions['Multinomial NB Category'] = a
predictions['SVC Category'] = b
predictions['RF Category'] = c
predictions['LogisticR Category'] = d

#### Take the majority vote of all 5 classifiers

In [246]:
# Majority vote across the four text models and the image model.
# .values gives plain positional access; integer [] on a Series with a string
# (ASIN) index relied on a deprecated positional fallback.
cnn_votes = predictions['CNN Category'].values
final_pred = []
for row_votes in zip(a, b, c, d, cnn_votes):
    votes = list(row_votes)
    # max over the list (not a set) makes ties deterministic: the earliest entry
    # in the order [NB, SGD, RF, LogReg, CNN] wins. Iterating a set of strings
    # depends on hash randomisation, so tied rows could change between runs.
    final_pred.append(max(votes, key=votes.count))
df_test['Category'] = final_pred

In [247]:
# Final answer: df_test with the voted 'Category' column.
df_test.to_csv('final_prediction.csv')
# Per-classifier predictions, kept for inspection.
predictions.to_csv('predictions_allClassifiers.csv')

# Future work

Due to time and computational constraints, I was not able to improve upon my model. Some methods by which we can get a more robust model are given below:

**Image Classification**

*  Fine-tuning the last convolutional block of VGG16 along with my top classifier could improve validation accuracy
* Running VGG only once for training prevents me from using data augmentation. Looking for ways to use data augmentation can really help
* More training data. We have approximately 60 samples per category. In my opinion, more training data will lead to better image categorization
* Use more aggressive dropout
* Use of L1 and L2 regularization (also known as "weight decay")

**Text Classification**
* Grid Search CV of classifiers
* Use text preprocessing steps including stemming, lemmatization, and object standardization
* Compare with other classifiers available on scikit-learn

**Combined Classifier**

For the final model, I can create a neural network with two branches: one uses a convolutional neural network to process the images, while the other processes the bag-of-words text. The two are combined for the final classification. The VGG convolutional network provides the computer vision processing of the images, while simple fully-connected neural networks process the text. The last layer of neurons is simply the combination from both parts of the model, and is used to produce the final classification.

# References

https://blog.dataweave.com/implementing-a-machine-learning-based-ecommerce-product-classification-system-f846d894148b

https://blog.insightdatascience.com/classifying-e-commerce-products-based-on-images-and-text-14b3f98f899e

http://cbonnett.github.io/Insight.html

https://techblog.commercetools.com/boosting-product-categorization-with-machine-learning-ad4dbd30b0e8

http://cs229.stanford.edu/proj2011/LinShankar-Applying%20Machine%20Learning%20to%20Product%20Categorization.pdf

https://github.com/georgetown-analytics/product-classifier

https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html

https://www.analyticsvidhya.com/blog/2017/01/ultimate-guide-to-understand-implement-natural-language-processing-codes-in-python/