In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# import os
# print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from __future__ import print_function, division
from builtins import range

In [4]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from gensim.models import KeyedVectors

In [5]:
from nltk.corpus import stopwords


In [6]:
class GloveVectorizer:
  def __init__(self):
    # load in pre-trained word vectors
    print('Loading word vectors...')
    word2vec = {}
    embedding = []
    idx2word = []
    with open('glove840b300dtxt/glove.840B.300d.txt',encoding='utf-8') as f:
      # is just a space-separated text file in the format:
      # word vec[0] vec[1] vec[2] ...
      for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
    print('Found %s word vectors.' % len(word2vec))

    # save for later
    self.word2vec = word2vec
    self.embedding = np.array(embedding)
    self.word2idx = {v:k for k,v in enumerate(idx2word)}
    self.V, self.D = self.embedding.shape

  def fit(self, data):
    pass

  def transform(self, data):
    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.lower().split()
      vecs = []
      for word in tokens:
        if word in self.word2vec:
          vec = self.word2vec[word]
          vecs.append(vec)
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)

In [7]:
class Word2VecVectorizerEN:
  def __init__(self):
    print("Loading in word vectors...")
    self.word_vectors = KeyedVectors.load_word2vec_format(
      'GoogleNews-vectors-negative300.bin',
      binary=True
    )
    print("Finished loading in word vectors")

  def fit(self, data):
    pass

  def transform(self, data):
    # determine the dimensionality of vectors
    v = self.word_vectors.get_vector('king')
    self.D = v.shape[0]

    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.split()
      vecs = []
      m = 0
      for word in tokens:
        try:
          # throws KeyError if word not found
          vec = self.word_vectors.get_vector(word)
          vecs.append(vec)
          m += 1
        except KeyError:
          pass
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X


  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)


In [8]:
class Word2VecVectorizerID:
  def __init__(self):
    print("Loading in word vectors...")
    self.word_vectors = KeyedVectors.load_word2vec_format(
      'id_sahdan.bin',
      binary=True
    )
    print("Finished loading in word vectors")

  def fit(self, data):
    pass

  def transform(self, data):
    # determine the dimensionality of vectors
    v = self.word_vectors.get_vector('yang')
    self.D = v.shape[0]

    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.split()
      vecs = []
      m = 0
      for word in tokens:
        try:
          # throws KeyError if word not found
          vec = self.word_vectors.get_vector(word)
          vecs.append(vec)
          m += 1
        except KeyError:
          pass
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X


  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)


In [9]:
ignore_words = set(stopwords.words('english')).union(set(stopwords.words('indonesian')))
print(len(ignore_words))

936


In [10]:
df_train=pd.read_csv('train.csv')
df_test=pd.read_csv('test.csv')
df_cat=pd.read_json('categories.json')

In [11]:
def remove_stop_words(text):
    stopwords = list(ignore_words)
    querywords = text.split()
    resultwords  = [word for word in querywords if word.lower() not in stopwords]
    result = ' '.join(resultwords)
    return result


In [12]:
# Remove stop words
df_train.loc[:,'title']=df_train.title.apply(remove_stop_words)
df_test.loc[:,'title']=df_test.title.apply(remove_stop_words)

In [13]:
df_train.title.head()



0                 nyx sex bomb pallete natural palette
1    etude house precious mineral cushion pearl aur...
2                             milani rose powder blush
3                  etude house baby sweet sugar powder
4            bedak revlon color stay aqua mineral make
Name: title, dtype: object

In [14]:
train,test=train_test_split(df_train,random_state=2019,stratify=df_train.Category,test_size=0.2)

In [15]:
df_train['Group']=df_train.image_path.map(lambda x: x[:7])
df_test['Group']=df_test.image_path.map(lambda x: x[:7])

In [16]:
np.random.seed(2019)

In [17]:
groups=['beauty_','mobile_','fashion']

In [18]:
# some linear models
from sklearn.linear_model import LogisticRegression, BayesianRidge

# SCM for classification
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

import warnings
warnings.filterwarnings('ignore')
np.random.seed(2019)


In [19]:
from sklearn.metrics import accuracy_score

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
# let's build pipeline for parameter search in n_grams and and various models
lr=LogisticRegression()
nb=MultinomialNB()
svc=SVC()
# let's check first three and see which one is best
gnb=GaussianNB()
bnb=BernoulliNB()
br=BayesianRidge()
rf=RandomForestClassifier(n_estimators=150)

In [75]:
vectorizerEN = Word2VecVectorizerEN()
param={'model':[lr,rf]} # Can't use nb because negative values
for group in groups:
    # English vectorization
    print('Results for group: ', group)
    vectorizerEN = Word2VecVectorizerEN()
    X_train_EN = vectorizerEN.fit_transform(df_train[df_train.Group==group].title)
    
    # Indonesian vectorization
    vectorizerID = Word2VecVectorizerID()
    X_train_ID = vectorizerID.fit_transform(df_train[df_train.Group==group].title)
    y_train = df_train[df_train.Group==group].Category.values
   
    # Combination of two models
    df_train_combined=pd.DataFrame(X_train_EN)
    df_train_combined=df_train_combined.join(pd.DataFrame(X_train_ID),rsuffix='ID')
    X_train_com=df_train_combined.values
    
    print ("Results for English only:")
    pipe=Pipeline([('model',lr)])
    grid_search=GridSearchCV(pipe,param_grid=param)
    grid_search.fit(X_train_EN,y_train)
    print('Best parameters: ',grid_search.best_params_)
    print('Best scores: ',grid_search.best_score_)
    
    print ("Results for English and Indonesian combined only:")
    pipe=Pipeline([('model',lr)])
    grid_search=GridSearchCV(pipe,param_grid=param)
    grid_search.fit(X_train_com,y_train)
    print('Best parameters: ',grid_search.best_params_)
    print('Best scores: ',grid_search.best_score_)
    

Loading in word vectors...
Finished loading in word vectors
Results for group:  beauty_
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 2503 / 286583
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 242433 / 286583
Results for English only:




Best parameters:  {'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)}
Best scores:  0.73061905277
Results for English and Indonesian combined only:




Best parameters:  {'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)}
Best scores:  0.729844408077
Results for group:  mobile_
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 985 / 160330
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 83465 / 160330
Results for English only:




Best parameters:  {'model': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)}
Best scores:  0.741869893345
Results for English and Indonesian combined only:




Best parameters:  {'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)}
Best scores:  0.738027817626
Results for group:  fashion
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 303 / 219702
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 201239 / 219702
Results for English only:




Best parameters:  {'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)}
Best scores:  0.551537992371
Results for English and Indonesian combined only:




Best parameters:  {'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)}
Best scores:  0.551510682652


In [69]:
pipe.fit(X_train_EN,y_train)



Pipeline(memory=None,
     steps=[('model', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [None]:
# Just donwloaded new corpus for Indonesian . From this link http://vectors.nlpl.eu/repository/


In [22]:
# Just donwloaded new corpus for Indonesian . From this link http://vectors.nlpl.eu/repository/

class Word2VecVectorizerID2:
  def __init__(self):
    print("Loading in word vectors...")
    self.word_vectors = KeyedVectors.load_word2vec_format(
      'model.txt',
      binary=False,
      unicode_errors='replace'  
        
    )
    print("Finished loading in word vectors")

  def fit(self, data):
    pass

  def transform(self, data):
    # determine the dimensionality of vectors
    v = self.word_vectors.get_vector('yang')
    self.D = v.shape[0]

    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.split()
      vecs = []
      m = 0
      for word in tokens:
        try:
          # throws KeyError if word not found
          vec = self.word_vectors.get_vector(word)
          vecs.append(vec)
          m += 1
        except KeyError:
          pass
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X


  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)


In [84]:
vectorizerID2 = Word2VecVectorizerID2()
X_train_ID2 = vectorizerID2.fit_transform(df_train[df_train.Group==group].title)
    

Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 2 / 219702


In [89]:
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [87]:
# Now let's do it with new Indonesian corpus
param={'model':[lr,rf]} # Can't use nb because negative values
for group in groups:
    # English vectorization
    print('Results for group: ', group)
    vectorizerEN = Word2VecVectorizerEN()
    X_train_EN = vectorizerEN.fit_transform(df_train[df_train.Group==group].title)
    
    # Indonesian vectorization using another corpus
    vectorizerID2 = Word2VecVectorizerID2()
    X_train_ID = vectorizerID2.fit_transform(df_train[df_train.Group==group].title)
    y_train = df_train[df_train.Group==group].Category.values
   
    # Combination of two models
    df_train_combined=pd.DataFrame(X_train_EN)
    df_train_combined=df_train_combined.join(pd.DataFrame(X_train_ID),rsuffix='ID')
    X_train_com=df_train_combined.values
    
    print ("Results for English and Indonesian combined only:")
    pipe=Pipeline([('model',lr)])
    grid_search=GridSearchCV(pipe,param_grid=param)
    grid_search.fit(X_train_com,y_train)
    print('Best parameters: ',grid_search.best_params_)
    print('Best scores: ',grid_search.best_score_)

Results for group:  beauty_
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 2503 / 286583
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 5 / 286583
Results for English and Indonesian combined only:




Best parameters:  {'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)}
Best scores:  0.735926415733
Results for group:  mobile_
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 985 / 160330
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 58 / 160330
Results for English and Indonesian combined only:




Best parameters:  {'model': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)}
Best scores:  0.765951475083
Results for group:  fashion
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 303 / 219702
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 2 / 219702
Results for English and Indonesian combined only:




Best parameters:  {'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)}
Best scores:  0.573563281172


In [88]:
# Now let's do it only with new Indonesian corpus
param={'model':[lr,rf]} # Can't use nb because negative values
for group in groups:

    
    # Indonesian vectorization using another corpus
    vectorizerID2 = Word2VecVectorizerID2()
    X_train_ID = vectorizerID2.fit_transform(df_train[df_train.Group==group].title)
    y_train = df_train[df_train.Group==group].Category.values

    
    print ("Results for new Indonesian corpus only:")
    pipe=Pipeline([('model',lr)])
    grid_search=GridSearchCV(pipe,param_grid=param)
    grid_search.fit(X_train_ID,y_train)
    print('Best parameters: ',grid_search.best_params_)
    print('Best scores: ',grid_search.best_score_)

Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 5 / 286583
Results for new Indonesian corpus only:




Best parameters:  {'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)}
Best scores:  0.707166859165
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 58 / 160330
Results for new Indonesian corpus only:




Best parameters:  {'model': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)}
Best scores:  0.73011289216
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 2 / 219702
Results for new Indonesian corpus only:




Best parameters:  {'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)}
Best scores:  0.553613531056


In [23]:
groups_rf=['beauty_','fashion']

In [24]:
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
# Let's try some optimization on random forrest for beauty and fashion
# Now let's do it with new Indonesian corpus
param={'model__n_estimators':[i for i in range(40,220,40)]} # Can't use nb because negative values
groups_rf=['beauty_','fashion']
for group in groups_rf:
    # English vectorization
    print('Results for group: ', group)
    vectorizerEN = Word2VecVectorizerEN()
    X_train_EN = vectorizerEN.fit_transform(df_train[df_train.Group==group].title)
    
    # Indonesian vectorization using another corpus
    vectorizerID2 = Word2VecVectorizerID2()
    X_train_ID = vectorizerID2.fit_transform(df_train[df_train.Group==group].title)
    y_train = df_train[df_train.Group==group].Category.values
   
    # Combination of two models
    df_train_combined=pd.DataFrame(X_train_EN)
    df_train_combined=df_train_combined.join(pd.DataFrame(X_train_ID),rsuffix='ID')
    X_train_com=df_train_combined.values
    
    print ("Results for English and Indonesian combined only for rf optimization:")
    pipe=Pipeline([('model',rf)])
    grid_search=GridSearchCV(pipe,param_grid=param)
    grid_search.fit(X_train_com,y_train)
    print('Best parameters: ',grid_search.best_params_)
    print('Best scores: ',grid_search.best_score_)

Results for group:  beauty_
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 2503 / 286583
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 5 / 286583
Results for English and Indonesian combined only for rf optimization:
Best parameters:  {'model__n_estimators': 200}
Best scores:  0.73679876336
Results for group:  fashion
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 303 / 219702
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 2 / 219702
Results for English and Indonesian combined only for rf optimization:
Best parameters:  {'model__n_estimators': 160}
Best scores:  0.575456755059


In [26]:
# Let's try some optimization on random forrest for beauty and fashion
# Now let's do it with new Indonesian corpus
param={'model__n_estimators':[i for i in range(220,300,20)]} # Can't use nb because negative values
groups_rf=['beauty_','fashion']
for group in ['beauty_']:
    # English vectorization
    print('Results for group: ', group)
    vectorizerEN = Word2VecVectorizerEN()
    X_train_EN = vectorizerEN.fit_transform(df_train[df_train.Group==group].title)
    
    # Indonesian vectorization using another corpus
    vectorizerID2 = Word2VecVectorizerID2()
    X_train_ID = vectorizerID2.fit_transform(df_train[df_train.Group==group].title)
    y_train = df_train[df_train.Group==group].Category.values
   
    # Combination of two models
    df_train_combined=pd.DataFrame(X_train_EN)
    df_train_combined=df_train_combined.join(pd.DataFrame(X_train_ID),rsuffix='ID')
    X_train_com=df_train_combined.values
    
    print ("Results for English and Indonesian combined only for rf optimization:")
    pipe=Pipeline([('model',rf)])
    grid_search=GridSearchCV(pipe,param_grid=param)
    grid_search.fit(X_train_com,y_train)
    print('Best parameters: ',grid_search.best_params_)
    print('Best scores: ',grid_search.best_score_)

Results for group:  beauty_
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 2503 / 286583
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 5 / 286583
Results for English and Indonesian combined only for rf optimization:
Best parameters:  {'model__n_estimators': 260}
Best scores:  0.737653664035


In [29]:
groups

['beauty_', 'mobile_', 'fashion']

In [39]:
train,test=train_test_split(df_train,random_state=2019,stratify=df_train.Category,test_size=0.2)

In [40]:
# Best configuration for 'beauty_' is RandomForrest(n_estimators=260) CV score : 0.737653664035
# Best configuration for 'mobile_' is LogisticRegression() CV score : 0.765951475083
# Best configuration for 'fashion' is RandomForrest(n_estimators=160) CV score : 0.575456755059. It seems to be good
pipe_beauty=Pipeline([('model',RandomForestClassifier(n_estimators=260))])
pipe_mobile=Pipeline([('model',LogisticRegression())])
pipe_fashion=Pipeline([('model',RandomForestClassifier(n_estimators=160))])
groups=['beauty_','mobile_','fashion']
pipes=[pipe_beauty,pipe_mobile,pipe_fashion]
predictions_val={}
predictions_proba_val={}
predictions={}
predictions_proba={}
classes={}
classes_val={}
for i in range(3):
    # English vectorization
    print('Results for group: ', groups[i])
    vectorizerEN = Word2VecVectorizerEN()
    X_train_EN = vectorizerEN.fit_transform(df_train[df_train.Group==groups[i]].title)
    X_test_EN=vectorizerEN.transform(df_test[df_test.Group==groups[i]].title)
    
    # Indonesian vectorization using another corpus
    vectorizerID2 = Word2VecVectorizerID2()
    X_train_ID = vectorizerID2.fit_transform(df_train[df_train.Group==groups[i]].title)
    X_test_ID = vectorizerID2.transform(df_test[df_test.Group==groups[i]].title)
    y_train = df_train[df_train.Group==groups[i]].Category.values
   
    # Combination of two models
    df_train_combined=pd.DataFrame(X_train_EN)
    df_train_combined=df_train_combined.join(pd.DataFrame(X_train_ID),rsuffix='ID')
    X_train_com=df_train_combined.values
    
    df_test_combined=pd.DataFrame(X_test_EN)
    df_test_combined=df_test_combined.join(pd.DataFrame(X_test_ID),rsuffix='ID')
    X_test_com=df_test_combined.values
    
    
    
    model=pipes[i]
    model.fit(X_train_com,y_train)
    classes[groups[i]]=model.classes_
    predictions[groups[i]]=model.predict(X_test_com)
    predictions_proba[groups[i]]=model.predict_proba(X_test_com)
    
    print("Generating predictions for validation set")
    
    # We need validation set and probabilities for validation set for ensembling
    vectorizerEN = Word2VecVectorizerEN()
    X_train_EN = vectorizerEN.fit_transform(train[train.Group==groups[i]].title)
    X_test_EN=vectorizerEN.transform(test[test.Group==groups[i]].title)
    
    # Indonesian vectorization using another corpus
    vectorizerID2 = Word2VecVectorizerID2()
    X_train_ID = vectorizerID2.fit_transform(train[train.Group==groups[i]].title)
    X_test_ID = vectorizerID2.transform(test[test.Group==groups[i]].title)
    y_train = train[train.Group==groups[i]].Category.values
    y_test = test[test.Group==groups[i]].Category.values
   
    # Combination of two models
    df_train_combined=pd.DataFrame(X_train_EN)
    df_train_combined=df_train_combined.join(pd.DataFrame(X_train_ID),rsuffix='ID')
    X_train_com=df_train_combined.values
    
    df_test_combined=pd.DataFrame(X_test_EN)
    df_test_combined=df_test_combined.join(pd.DataFrame(X_test_ID),rsuffix='ID')
    X_test_com=df_test_combined.values
    
    model=pipes[i]
    model.fit(X_train_com,y_train)
    classes_val[groups[i]]=model.classes_
    predictions_val[groups[i]]=model.predict(X_test_com)
    predictions_proba_val[groups[i]]=model.predict_proba(X_test_com)
    

Results for group:  beauty_
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 2503 / 286583
Numer of samples with no words found: 100 / 76545
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 5 / 286583
Numer of samples with no words found: 0 / 76545
Generating predictions for validation set
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 2015 / 229266
Numer of samples with no words found: 488 / 57317
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 4 / 229266
Numer of samples with no words found: 1 / 57317
Results for group:  mobile_
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 985 / 160330
Numer of samples with no words found: 235 / 40417
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 58 / 

In [43]:
for group in groups:
    print('Accuracy on test data of:',group,accuracy_score(test[test.Group==group].Category,predictions_val[group]))

# Accuracy on test data of: beauty_ 0.744054992411
# Accuracy on test data of: mobile_ 0.766505535631
# Accuracy on test data of: fashion 0.579777428825

Accuracy on test data of: beauty_ 0.744054992411
Accuracy on test data of: mobile_ 0.766505535631
Accuracy on test data of: fashion 0.579777428825


In [49]:
classes={}
for i in range(3):
    classes[groups[i]]=pipes[i].classes_


In [50]:
classes

{'beauty_': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16], dtype=int64),
 'mobile_': array([31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
        48, 49, 50, 51, 52, 53, 54, 55, 56, 57], dtype=int64),
 'fashion': array([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], dtype=int64)}

In [51]:
# Save all data both validation set and training set
# This data will be used for modeling ensembling and for final ensemble result
for i in range(3):
    # for ensembling
    df=pd.DataFrame(predictions_proba[groups[i]])
    df.columns=classes[groups[i]]
    df['pred']=df.idxmax(axis=1)
    df['itemid']=df_test[df_test.Group==groups[i]].itemid.values
    df.to_csv(groups[i]+'_test_proba_w2v_all_data.csv',index=False)
    # for ensemble modeling
    df=pd.DataFrame(predictions_proba_val[groups[i]])
    df.columns=classes[groups[i]]
    df['pred']=df.idxmax(axis=1)
    df['itemid']=test[test.Group==groups[i]].itemid.values
    df.to_csv(groups[i]+'_test_proba_w2v_val_data.csv',index=False)


In [None]:
# Now we can combine results

In [36]:
# We need to ensemble our best results for training. let's get best results from nlp with count vectorization with optimized C 
# parameters as well as min_df
# This are optimization results for C parameter in logistic regression

# params={'model__C':[0.2,0.4,0.8,1,2,4,8]}
# {'beauty_': {'model__C': 0.8},
#  'mobile_': {'model__C': 1},
#  'fashion': {'model__C': 0.4}}

# with the corresponding results
# {'beauty_': 0.78344144628257784,
#  'mobile_': 0.8236574564959771,
#  'fashion': 0.6449463364011252}

# Also did some optimization for min_df the best results for mobile was 2 otherwise 1.

# Let's also check if removing stop words helps to improve model. For the same configuration of optimized parameters

# Submit these results and see if it is really improved in leaderboard




Unnamed: 0,itemid,title,Category,image_path
539829,1058517290,xiaomi mi a1 garansi resmi 1 tam,34,mobile_image/3912586cdf51144db783b2bac1ee4d3f.jpg
473036,286463264,blus sexy model shoulder,26,fashion_image/cb790137111e2525cd895c3375fe84c0
252009,1537965166,best sale jafra royal jelly radiance foundatio...,1,beauty_image/5adfe3e65ad2f51a1e67db85a3de8e18.jpg
355772,1465709373,dress midi bodycon casual elegan warna polos p...,22,fashion_image/cbedaa2404fb377dc6c271a1081f7baa
553954,1195506745,promo discon samsung galaxy s7 flat sein garan...,35,mobile_image/bb327fec3989b18f714deb3c07203fb8.jpg
620741,1833821649,samsung galaxy a6 plus 2018 new garansi nasional,35,mobile_image/8959112348969556df29f24d06ecff8f.jpg
486345,129360583,kemeja hitam wanita katun adem lengan pendek b...,27,fashion_image/7a4194bf7b509f29bfc5dcd9cc0355d9
647884,291297752,xiaomi redmi mi5,34,mobile_image/8733bba77a727e4771d51b5a8b48c8a5.jpg
67883,1633487774,jd59 bedak tabur revlon 43gram,3,beauty_image/62367de657a2c83ff5e97a3dde7b1902.jpg
347206,1756097750,gaun midi bodycon wanita model lengan warna hi...,22,fashion_image/a15e616c290fb142b1a8b217951f765c


In [None]:
for i in range(3):
    df=pd.DataFrame(predictions_proba[groups[i]])
    df.columns=classes[groups[i]]
    df['pred']=df.idxmax(axis=1)
    df['itemid']=df_test[df_test.Group==groups[i]].itemid.values
    df.to_csv(groups[i]+'_test_proba_nlp_all_data.csv',index=False)

In [72]:
y_pred=pipe.predict(X_train_EN)

In [74]:
accuracy_score(y_train,y_pred)

0.70672370657017336

In [None]:
vectorizerEN = Word2VecVectorizerEN()

for group in groups:
    # English vectorization
    print('Results for group: ', group)
    vectorizerEN = Word2VecVectorizerEN()
    X_train_EN = vectorizerEN.fit_transform(train[train.Group==group].title)
    X_test_EN = vectorizerEN.transform(train[train.Group==group].title)
    
    # Indonesian vectorization
    vectorizerID = Word2VecVectorizerID()
    X_train_ID = vectorizerID.fit_transform(X_beauty_train)
    y_train = train[train.Group==group].Category.values
    X_test_ID = vectorizerID.transform(X_beauty_test)
    y_test = test[test.Group==group].Category.values
    
    # Combination of two models
    df_train_combined=pd.DataFrame(X_train_EN)
    df_train_combined=df_train_combined.join(pd.DataFrame(X_train_ID),rsuffix='ID')
    df_test_combined=pd.DataFrame(X_test_EN)
    df_test_combined=df_test_combined.join(pd.DataFrame(X_test_ID),rsuffix='ID')
    X_train_com=df_train_combined.values
    X_test_com=df_test_combined.values
    
    



    

In [81]:
#vectorizer = GloveVectorizer()
vectorizerEN = Word2VecVectorizerEN()
X_train_EN = vectorizerEN.fit_transform(X_beauty_train)
#Ytrain = y_beauty_train

X_test_EN = vectorizerEN.transform(X_beauty_test)
#Ytest = y_beauty_test

Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 1639 / 200608
Numer of samples with no words found: 680 / 85975


In [82]:
X_train_EN.shape

(200608, 300)

In [83]:
#vectorizer = GloveVectorizer()
vectorizerID = Word2VecVectorizerID()
X_train_ID = vectorizerID.fit_transform(X_beauty_train)
y_train = y_beauty_train

X_test_ID = vectorizerID.transform(X_beauty_test)
y_test = y_beauty_test

Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 164061 / 200608
Numer of samples with no words found: 70373 / 85975


In [84]:
X_train_ID.shape

(200608, 300)

In [96]:
df_train_combined=pd.DataFrame(X_train_EN)

In [99]:
df_train_combined=df_train_combined.join(pd.DataFrame(X_train_ID),rsuffix='ID')

In [100]:
df_test_combined=pd.DataFrame(X_test_EN)
df_test_combined=df_test_combined.join(pd.DataFrame(X_test_ID),rsuffix='ID')

In [103]:
df_train_combined.shape,df_test_combined.shape

((200608, 600), (85975, 600))

In [105]:
model = RandomForestClassifier(n_estimators=200)
model.fit(df_train_combined.values,y_train.values)
print("train score", model.score(df_train_combined.values,y_train.values))
print("test score:", model.score(df_test_combined.values,y_test.values))

train score 0.898877412665
test score: 0.730386740331


# Let's do the same for fashion 

In [106]:
#vectorizer = GloveVectorizer()
vectorizerID = Word2VecVectorizerID()
X_train_ID = vectorizerID.fit_transform(X_fashion_train)
y_train = y_fashion_train

X_test_ID = vectorizerID.transform(X_fashion_test)
y_test = y_fashion_test

Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 122720 / 153791
Numer of samples with no words found: 52577 / 65911


In [107]:
#vectorizer = GloveVectorizer()
vectorizerEN = Word2VecVectorizerEN()
X_train_EN = vectorizerEN.fit_transform(X_fashion_train)
#Ytrain = y_beauty_train

X_test_EN = vectorizerEN.transform(X_fashion_test)
#Ytest = y_beauty_test

Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 144 / 153791
Numer of samples with no words found: 68 / 65911


In [108]:
X_train_EN.shape

(153791, 300)

In [109]:
X_train_ID.shape

(153791, 300)

In [110]:
df_train_combined=pd.DataFrame(X_train_EN)

In [111]:
df_train_combined=df_train_combined.join(pd.DataFrame(X_train_ID),rsuffix='ID')

In [112]:
df_test_combined=pd.DataFrame(X_test_EN)
df_test_combined=df_test_combined.join(pd.DataFrame(X_test_ID),rsuffix='ID')

In [113]:
df_train_combined.shape,df_test_combined.shape

((153791, 600), (65911, 600))

In [114]:
model = RandomForestClassifier(n_estimators=200)
model.fit(df_train_combined.values,y_train.values)
print("train score", model.score(df_train_combined.values,y_train.values))
print("test score:", model.score(df_test_combined.values,y_test.values))

train score 0.928266283463
test score: 0.560968578841


In [15]:
model = RandomForestClassifier(n_estimators=200)

In [16]:
model.fit(Xtrain,Ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
print("train score", model.score(Xtrain,Ytrain))
print("test score:", model.score(Xtest,Ytest))

train score 0.896150701866
test score: 0.731177667927


In [18]:
predictions = model.predict(Xtest)

In [19]:
from sklearn.metrics import classification_report,accuracy_score

In [20]:
print(classification_report(predictions,Ytest))

              precision    recall  f1-score   support

           0       0.30      0.60      0.40       585
           1       0.59      0.76      0.66      6726
           2       0.79      0.85      0.82      3282
           3       0.88      0.80      0.84     26695
           4       0.74      0.61      0.67     15794
           5       0.75      0.75      0.75     16427
           6       0.33      0.43      0.37       480
           7       0.70      0.74      0.72      3265
           8       0.51      0.59      0.54      1574
           9       0.37      0.72      0.49      1268
          10       0.44      0.59      0.50       246
          11       0.38      0.55      0.45       849
          12       0.85      0.73      0.79      7672
          13       0.43      0.63      0.51       588
          14       0.14      0.40      0.21       295
          15       0.12      0.43      0.19        47
          16       0.09      0.34      0.14       182

   micro avg       0.73   

In [21]:
print(accuracy_score(predictions,Ytest))

0.731177667927


In [None]:
#from sklearn.linear_model import LogisticRegression

In [None]:
#lr = LogisticRegression()

In [None]:
#lr.fit(Xtrain,Ytrain)

In [None]:
#predictions_lr = lr.predict(Xtest)

In [None]:
#print(accuracy_score(predictions_lr,Ytest))