In [None]:
import warnings
warnings.filterwarnings("ignore")

from IPython.display import display_html
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import random
import requests
import json
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' figure output.
%config InlineBackend.figure_format = 'retina'
# Apply the "seaborn-muted" matplotlib style sheet to subsequent plots.
plt.style.use("seaborn-muted")

# Fraction of the labelled data sampled for training when the data is split.
train_fraction = 0.7
# NOTE(review): not referenced in the visible cells; presumably the hold-out share - confirm.
test_fraction = 0.3
# Upper bound on the TF-IDF vocabulary size handed to the vectorizer.
max_features = 3000
# NOTE(review): not referenced in the visible cells; purpose unclear - confirm or remove.
fast = True

In [None]:
# Load the labelled training data (path is relative to the notebook's working directory).
df = pd.read_csv("nlp-dataset/train.csv")

In [None]:
# Show the first five rows transposed, so each record reads top-to-bottom.
df.head(n=5).transpose()

In [None]:
# Distinct values found in the `category` column.
df["category"].unique()

In [None]:
# Number of rows per category.
df["category"].value_counts()

In [None]:
# Load the test data that is scored and written to the submission file at the end of the notebook.
test = pd.read_csv("nlp-dataset/test.csv")

In [None]:
# Peek at the first five rows of the test data.
test.head(n=5)

In [None]:
# Count the missing values in each column of the training data.
df.isnull().sum()

In [None]:
# Replace every missing value with an empty string, modifying `df` in place,
# so that string concatenation on these columns does not propagate NaN.
df.fillna("", inplace=True)

In [None]:
# Join title and description with a single space into one text field per row.
df["full_text"] = df["title"] + " " + df["description"]

In [None]:
# Fit the label encoder on the category names and store the resulting
# integer ids alongside them; `encoder` is kept for mapping ids back to names.
encoder = LabelEncoder()
df["category_id"] = encoder.fit_transform(df["category"])

In [None]:
# Draw the training rows first and keep their ORIGINAL index labels, so the
# hold-out set can be built from the rows that were not sampled. Resetting the
# index before the membership test would compare the new labels 0..n-1 against
# the labels of `df`, making the hold-out set simply the tail of `df` and
# letting sampled training rows leak into it.
# A fixed random_state makes the split reproducible across kernel restarts.
train_rows = df.sample(frac=train_fraction, random_state=42)
test_df = df[~df.index.isin(train_rows.index)].reset_index(drop=True)
train_df = train_rows.reset_index(drop=True)

In [None]:
# Column holding the text that is vectorised.
text_column = "full_text"
# Word-level TF-IDF over 1- to 3-grams with English stop words removed and the
# vocabulary capped at `max_features` terms.
vec = TfidfVectorizer(
    analyzer="word",
    token_pattern=r"(?u)\b\w+\b",
    stop_words="english",
    ngram_range=(1, 3),
    max_features=max_features,
)
# Learn the vocabulary and IDF weights from the training split only.
vec.fit(train_df[text_column])
# Sanity check: the corpus was large enough to fill the whole vocabulary budget.
assert len(vec.vocabulary_) == max_features

In [None]:
# `vec.vocabulary_` maps term -> column index and its iteration order is not
# the column order, so sort the terms by their index before using them as
# column labels; otherwise the labels do not match the TF-IDF columns.
feature_names = sorted(vec.vocabulary_, key=vec.vocabulary_.get)
# Dense TF-IDF matrix of the training texts, one labelled column per term.
train_dataset = pd.DataFrame(vec.transform(train_df[text_column]).toarray(), columns=feature_names)

In [None]:
# `vec.vocabulary_` maps term -> column index and its iteration order is not
# the column order, so sort the terms by their index before using them as
# column labels; otherwise the labels do not match the TF-IDF columns.
feature_names = sorted(vec.vocabulary_, key=vec.vocabulary_.get)
# Dense TF-IDF matrix of the hold-out texts, using the vocabulary fitted on the training split.
test_dataset = pd.DataFrame(vec.transform(test_df[text_column]).toarray(), columns=feature_names)

In [None]:
def model_score(actual, predicted):
    """Return the fraction of positions where ``predicted`` equals ``actual``.

    Both arguments are compared element-wise; the number of matches is
    divided by ``actual.size``.
    """
    hits = np.sum(predicted == actual)
    return (1.0 * hits) / actual.size

Bag-of-Words Model

---

In [None]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalAveragePooling1D
from sklearn.preprocessing import MinMaxScaler
from keras.layers.advanced_activations import PReLU
from keras.optimizers import SGD

In [None]:
def OneHotEncoding(df, category=55):
    """Build a one-hot label matrix from ``df['category_id']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an integer ``category_id`` column.
    category : int, optional
        Number of classes, i.e. the width of the returned matrix (default 55).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), category)``. Row ``i`` (by position)
        holds a single 1 in the column given by that row's ``category_id``.

    Raises
    ------
    IndexError
        If a ``category_id`` is not a valid column index for ``category``.
    """
    # Read the labels from the frame that was passed in (not a notebook
    # global), by position, so any index on `df` is handled correctly.
    class_ids = np.asarray(df['category_id'], dtype=int)
    vector = np.zeros([df.shape[0], category])
    # One vectorised assignment: set (row i, class_ids[i]) to 1 for every row.
    vector[np.arange(class_ids.size), class_ids] = 1
    return vector

In [None]:
# Losses considered: binary_crossentropy, categorical_crossentropy.
# The model below is compiled with categorical_crossentropy, so its output
# layer uses softmax (one probability distribution over the classes per row).
def get_bow_model(input_shape, output_dim):
    """Build and compile a one-hidden-layer classifier over bag-of-words features.

    Architecture: Dense(512) -> PReLU -> BatchNormalization -> Dropout(0.5)
    -> Dense(output_dim) -> softmax.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first Dense layer.
    output_dim : int
        Number of output units (one per class).

    Returns
    -------
    The compiled Sequential model (categorical_crossentropy loss, Adadelta
    optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(Dense(512, input_shape=input_shape))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(output_dim))
    # Softmax rather than sigmoid: the targets are one-hot (exactly one class
    # per sample), which is what categorical_crossentropy assumes.
    model.add(Activation('softmax'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer="Adadelta",
        metrics=['accuracy'])
    return model

In [None]:
# Take the class count from the fitted label encoder instead of a hard-coded
# number, so the label matrix and the output layer always match the classes
# that `encoder.inverse_transform` can map back to category names.
n_classes = len(encoder.classes_)
label = OneHotEncoding(train_df, n_classes)
model = get_bow_model((train_dataset.shape[1],), n_classes)
# NOTE(review): `nb_epoch` is the older Keras spelling; newer releases call it
# `epochs` - confirm against the installed Keras version before renaming.
model.fit(train_dataset.values,
          label,
          batch_size=32, nb_epoch=20, verbose=3)
# Class scores for the hold-out rows, one column per class.
pred = model.predict(test_dataset.values)

In [None]:
# Predicted class id = column with the highest score in each row of `pred`.
prediction = np.argmax(pred, axis=1)
# Last expression of the cell: share of hold-out rows predicted correctly.
model_score(test_df["category_id"].values, prediction)

In [None]:
# From here on `test_df` refers to the frame loaded from test.csv (it is the
# same object as `test`, not a copy), replacing the hold-out split of that name.
test_df = test
# Fill missing values with empty strings in place; this also modifies `test`.
test_df.fillna("", inplace=True)
# Same title + " " + description text field that was built for the training data.
test_df["full_text"] = test_df.title + " " + test_df.description

In [None]:
# `vec.vocabulary_` maps term -> column index and its iteration order is not
# the column order, so sort the terms by their index before using them as
# column labels; otherwise the labels do not match the TF-IDF columns.
feature_names = sorted(vec.vocabulary_, key=vec.vocabulary_.get)
# Vectorise the submission texts with the vocabulary fitted on the training split.
test_dataset = pd.DataFrame(vec.transform(test_df[text_column]).toarray(), columns=feature_names)
# Score every row and keep the highest-scoring class id per row.
pred = model.predict(test_dataset.values)
prediction = pred.argmax(axis=1)

In [None]:
# Attach the predicted class ids and the category names they decode to.
test_df["category_id"] = prediction
test_df["category"] = encoder.inverse_transform(prediction)
# Keep only the two columns written to the output file, then save without the index.
test_df = test_df.loc[:, ["id", "category"]]
test_df.to_csv("output.csv", index=False)