# Data

In [1]:
import pandas as pd
import sys
sys.path.append("..")

In [5]:
# Category table: maps each category_id to its name.
cats = pd.read_csv('example_categories_dataset.csv',sep=',')

In [32]:
# Labelled training items (title, description, price and the target category_id).
train = pd.read_csv('example_train_dataset.csv',sep=',')

In [8]:
# Items to classify; presumably the same columns as train without the label — TODO confirm.
test = pd.read_csv('example_test_dataset.csv',sep=',')

In [46]:
# Peek at the first three training rows.
train.head(3)

Unnamed: 0,item_id,title,description,price,category_id
0,0,Skateboard,Evolve Bamboo GTX. This is a very (!!!) good s...,139.0,5
1,1,Good telephone😃,iPhone 10X best condition you can find ✌️ 🤟 🤘,599.0,36
2,2,Barbie Doll #365,Discover the best selection of Barbie Dolls at...,26.5,24


In [None]:
# Peek at the first three test rows; the layout looks similar to the train set.
test.head(3)

In [49]:
# Peek at the first three category rows.
cats.head(3)

Unnamed: 0,category_id,name
0,24,Goods for kids|Age before 3 years old|Barbie d...
1,5,Sports|Skateboards and roller-skating
2,19,Education|Advanced Mathematics|Calculus


# Text preprocessing

In [50]:
import nltk
# Fetch the NLTK stopword corpus; when it is already present locally this only reports that it is up to date.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruslan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
# Build one free-text document per item by joining title and description.
# A missing title/description is loaded by pandas as NaN (a float); without the
# fillna('') the concatenation would yield NaN for that item and text_prepare()
# would fail on .lower(). Filling with '' keeps every document a str.
X_train = train['title'].fillna('').values+' '+train['description'].fillna('').values
y_train = train['category_id'].values
X_test = test['title'].fillna('').values+' '+test['description'].fillna('').values

In [53]:
import re

In [54]:
# Separator symbols that are replaced by a space so the words around them stay apart.
# Raw string: '\[', '\]' and '\|' are regex escapes, not valid Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase Latin/Cyrillic letters, space, '#', '+' and '_' is dropped.
# 'ё' is listed explicitly because it lies outside the 'а-я' code point range.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-zа-яё #+_]')
# Stopword set used by text_prepare() to drop words before stemming.
STOPWORDS = set(stopwords.words('english')) # can be 'russian'

from nltk.stem.snowball import SnowballStemmer
# Stemmer applied to every word that survives the stopword filter;
# keep its language in sync with STOPWORDS.
stemmer = SnowballStemmer('english') # can be 'russian'

def text_prepare(text):
    """
        Normalize one raw item text before vectorization.

        text: a string

        return: the lowercased text in which separator symbols became spaces,
                disallowed symbols were removed, stopwords were dropped and the
                remaining words were stemmed and re-joined with single spaces
    """
    lowered = text.lower()
    # separator symbols (REPLACE_BY_SPACE_RE) turn into spaces
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    # symbols matched by BAD_SYMBOLS_RE are deleted outright
    cleaned = BAD_SYMBOLS_RE.sub('', spaced)
    # stopwords are filtered on the cleaned words, i.e. before stemming
    kept_words = (word for word in cleaned.split() if word not in STOPWORDS)
    return ' '.join(stemmer.stem(word) for word in kept_words)

In [55]:
# Preprocess the raw train/test documents and report the elapsed seconds.
# NOTE: X_train / X_test are overwritten with their preprocessed versions, so
# re-running this cell alone would preprocess already-preprocessed text.
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

X_train = [text_prepare(x) for x in X_train]
X_test = [text_prepare(x) for x in X_test]

end = time.perf_counter()
print(end-start)

1.1799769401550293


In [56]:
# Sanity check: first three preprocessed training documents.
X_train[:3]

['skateboard evolv bamboo gtx good skateboard model #1',
 'good telephon iphon 10x best condit find',
 'barbi doll #365 discov best select barbi doll offici barbi websit shop latest fashionista']

In [None]:
# Sanity check: first three preprocessed test documents.
X_test[:3]

# TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the preprocessed documents into TF-IDF features:
#   - min_df=5 / max_df=0.9 ignore terms found in fewer than 5 documents or in
#     more than 90% of the documents
#   - ngram_range=(1,2) uses unigrams and bigrams
#   - token_pattern keeps every whitespace-delimited token (so '#1' or 'c++' survive);
#     it is a raw string because '\S' is a regex escape, not a valid Python string escape
# The vectorizer is fitted on the train set only and then applied to both sets.

tfidf_vectorizer = TfidfVectorizer(min_df=5,max_df=0.9,ngram_range=(1,2),token_pattern=r'(\S+)')

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Cross-validation and model selection

For optimization purposes (hardware limitations), CV is conducted on a training sample of N_TRAIN_INSTANCES instances.

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# Start the clock after the imports so only the cross-validation itself is timed.
# perf_counter() is the monotonic clock intended for measuring elapsed intervals.
start = time.perf_counter()

# Candidate classifiers compared on the same TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=200,max_depth=3,random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0)
]

# Number of cross-validation folds per model.
CV = 3
# One (model_name, fold_idx, accuracy) record per model and fold; the frame is
# built once from these records after the loop.
entries = []

for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model,X_train_tfidf,y_train,scoring='accuracy',cv=CV)
    for fold_idx,accuracy in enumerate(accuracies):
        entries.append((model_name,fold_idx,accuracy))
cv_df = pd.DataFrame(entries,columns=['model_name','fold_idx','accuracy'])

end = time.perf_counter()
print(end-start)

1976.016314983368


In [17]:
# Per-fold accuracy of every candidate model.
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.451884
1,RandomForestClassifier,1,0.466451
2,RandomForestClassifier,2,0.46662
3,LinearSVC,0,0.886707
4,LinearSVC,1,0.886528
5,LinearSVC,2,0.888472
6,MultinomialNB,0,0.865094
7,MultinomialNB,1,0.864129
8,MultinomialNB,2,0.865598
9,LogisticRegression,0,0.877295


In [18]:
# Mean accuracy over the folds, one value per model.
cv_df.groupby('model_name')['accuracy'].mean()

model_name
LinearSVC                 0.887236
LogisticRegression        0.877516
MultinomialNB             0.864940
RandomForestClassifier    0.461651
Name: accuracy, dtype: float64

We select **LinearSVC** as it has the highest mean accuracy

# Parameters tuning - Grid Search

In [19]:
from sklearn.model_selection import GridSearchCV

# Start the clock after the import so only the search itself is timed.
# perf_counter() is the monotonic clock intended for measuring elapsed intervals.
start = time.perf_counter()

# Candidate values for the LinearSVC regularization parameter C.
parameters_svm = {
                  'C':[0.01,0.05,0.1,0.2,0.3,0.4,0.5,1,3,5]
                 }
clf = LinearSVC()
# NOTE(review): neither cv nor scoring is passed, so GridSearchCV falls back to
# its library defaults, while the model comparison used cv=CV and
# scoring='accuracy' explicitly — confirm this is intended.
gs_clf = GridSearchCV(clf,param_grid=parameters_svm,n_jobs=-1)
gs_clf = gs_clf.fit(X_train_tfidf, y_train)

end = time.perf_counter()
print(end-start)

467.786500453949


In [20]:
# Report the best score found by the grid search and the parameters that achieved it.
print(gs_clf.best_score_)
print(gs_clf.best_params_)
# Keep the winning C for the final model fit.
C_best = gs_clf.best_params_['C']

0.8893889282701112
{'C': 0.4}


# Multi-class classification

In [22]:
# Fit the final LinearSVC on the whole training set with the tuned C.
clf = LinearSVC(C=C_best).fit(X_train_tfidf, y_train)

In [23]:
# Predict a category_id for every test document.
y_test_predicted_labels_tfidf = clf.predict(X_test_tfidf)

In [24]:
# Attach the predictions to the test frame (adds or overwrites the 'predicted_cat' column in place).
test['predicted_cat'] = y_test_predicted_labels_tfidf

In [None]:
# Check that the test frame now carries the predictions.
test.head()

In [28]:
# Save item_id -> predicted category pairs.
# NOTE(review): to_csv is called without index=False, so the frame's index is
# written as an extra unnamed first column — confirm consumers expect that.
test[['item_id','predicted_cat']].to_csv('classification_result.csv')