# Setup

In [None]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import seaborn as sns
color = sns.color_palette()
import matplotlib.pyplot as plt 
%matplotlib inline

from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))
from nltk import word_tokenize, ngrams

from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import itertools
import xgboost as xgb

# Pre-processing

## Importing...

In [None]:
# One frame per topic. Every per-topic CSV lives in the same directory and is
# loaded with the same two text columns; "physics" stays disabled, as before.
_TOPICS = ('cooking', 'crypto', 'robotics', 'biology', 'travel', 'diy')

df = {
    topic: pd.read_csv('../dataset/processed/{}.csv'.format(topic),
                       usecols=['title', 'content'])
    for topic in _TOPICS
}

## Generating new .csv file with title+content and class columns...

In [None]:
# Merge every per-topic frame into a single pipe-separated file with two
# columns: the concatenated "title content" text and the topic it came from.
# newline='' is what pandas asks for when to_csv() receives an already open
# handle (it prevents a second newline translation on Windows), and the
# explicit encoding keeps the file independent of the platform default.
with open('../dataset/processed/data.csv', 'w', newline='', encoding='utf-8') as f:
    f.write('title_content|label\n')
    for _class, frame in df.items():
        # Read the two fields by column label: integer keys on a label-indexed
        # row (x[0], x[1]) only work through a positional fallback that pandas
        # has deprecated.
        frame['title_content'] = frame[['title', 'content']].apply(
            lambda row: '{} {}'.format(row['title'], row['content']), axis=1)
        frame['label'] = _class
        frame.to_csv(f, sep='|', columns=['title_content', 'label'], header=False, index=False)

# Data Analysis

## Exploration

In [None]:
# Load the merged, pipe-separated file written by the previous step.
dataset = pd.read_csv('../dataset/processed/data.csv', sep='|')

# A notebook cell only renders its final expression, so the random sample is
# printed explicitly; describe() stays last and is rendered as the cell output.
# Other quick looks that can be swapped in: dataset.head(), dataset.tail(),
# dataset.shape, dataset.info().
print(dataset.sample(5))
dataset.describe()

## Labels distribution

### Absolute numbers & Percentage

In [None]:
# Number of rows per label (value_counts orders them by frequency).
labels = dataset['label'].value_counts()
print(labels.describe())

# The same counts in label order: first absolute, then as a percentage of
# all rows.
labels_by_name = labels.sort_index()
print(labels_by_name)
print(labels_by_name / labels.sum() * 100)

### Histogram

In [None]:
# Bar chart with one bar per label, showing how many rows carry that label.
fig = plt.figure(figsize=(20, 10))
# The column is passed as `x=`: seaborn's only positional argument is `data`,
# so a bare Series is not read as the variable to count on current releases.
ax1 = sns.countplot(x=dataset['label'].sort_values())
plt.ylabel('Observations', fontsize=12)
plt.xlabel('Labels', fontsize=12)
plt.xticks(rotation='vertical')
plt.title('Labels frequency histogram')
plt.show()

## Word distribution

### Statistics of the number of words (size) of title_content text field

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


# Word count of every row, and how many rows share each word count.
dataset['size'] = dataset['title_content'].apply(_count_words)
sizes = dataset['size'].value_counts()

In [None]:
# Summary statistics of the per-row word count. They are printed explicitly:
# a notebook cell only renders a trailing expression, and this cell ends with
# print calls, so a bare describe() here would never be shown.
print(dataset['size'].describe())
print('The top 20 most frequent size of title_content, and their respective frequency:')
print(sizes.nlargest(20))

### Histogram

In [None]:
# Bar chart of how many rows have each word count.
fig = plt.figure(figsize=(20, 10))
# x and y are passed by keyword: current seaborn releases accept only `data`
# positionally and reject the two-positional-argument form.
ax1 = sns.barplot(x=sizes.index, y=sizes.values, alpha=0.8)
# There is one bar per distinct word count, so tick labels are hidden.
ax1.set_xticklabels([])
plt.title('Number of words frequency histogram')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Number of words', fontsize=12)
plt.show()

## Data example

### Robotics texts with more than 200 words

In [None]:
# Rows labelled 'robotics' whose text has more than 200 whitespace-separated
# words.
is_robotics = dataset.label == 'robotics'
word_counts = dataset.title_content.apply(lambda text: len(str(text).split()))
filtered_data = dataset[is_robotics & (word_counts > 200)]
filtered_data.describe()

### Cell content example

In [None]:
# Index label of the row to display; every lookup below goes through this one
# variable, so changing the example means changing only this line.
line = 61041
text = dataset.loc[line, 'title_content']
print('TEXT: {0}'.format(text))
print('LABEL: {0}'.format(dataset.loc[line, 'label']))
print('LENGTH: {0} words.'.format(len(text.split())))

# Sampling

In [None]:
# Collect the rows of each label (labels are visited in the order of the
# `labels` counts) and stack the groups back into one frame. No rows are
# dropped at the moment: the per-label undersampling step is disabled.
per_label = []
for label in labels.index:
    group = dataset[dataset.label == label]
    # To balance the classes, undersample every group to the rarest label:
    # group = resample(group, replace=False, n_samples=labels.min(), random_state=711)
    per_label.append(group)
ds = pd.concat(per_label)

# Label distribution of the frame that feeds the vectorizer.
fig = plt.figure(figsize=(20, 10))
# Passed as `x=` because seaborn's only positional argument is `data`.
ax1 = sns.countplot(x=ds['label'].sort_values())
plt.ylabel('Observations', fontsize=12)
plt.xlabel('Labels', fontsize=12)
plt.title('Labels Histogram')
plt.show()

# Bag-of-Words

In [None]:
# TF-IDF over single words. Terms present in more than 70% of the documents
# or in fewer than 2 documents are dropped, and term frequency is scaled
# sub-linearly.
# stop_words is given as a list: scikit-learn documents 'english', a list or
# None for this parameter, and releases that validate parameters reject a set.
# Sorting keeps the list order stable between runs.
tfidf = TfidfVectorizer(analyzer='word',
                        stop_words=sorted(STOP_WORDS),
                        ngram_range=(1, 1),
                        max_df=0.7, min_df=2,
                        sublinear_tf=True)
X = tfidf.fit_transform(ds['title_content'])
print(X.shape)

# Map the label strings to integer codes; classes_[i] is the label of code i.
l_enc = LabelEncoder()
y = l_enc.fit_transform(ds['label'])
print('Encoded labels: ', list(enumerate(l_enc.classes_)))

# Dimensionality reduction

In [None]:
# Project the TF-IDF matrix onto 1000 components.
# The randomized solver draws random numbers, so it is seeded like the
# train/test splits; without a seed the reduced features (and every score
# computed from them) change from run to run.
svd = TruncatedSVD(n_components=1000, algorithm='randomized', random_state=283)
X_svd = svd.fit_transform(X)
print('Shape of svd matrix: ', X_svd.shape)

# Split Valid/Train/Test examples

In [None]:
# Two-stage hold-out, both stages seeded: 10% of all rows are set aside first
# as the validation set, then 20% of the remainder becomes the test set.
X_temp, X_valid, y_temp, y_valid = train_test_split(
    X_svd, y, test_size=0.1, random_state=283)
X_train, X_test, y_train, y_test = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=283)

# Report the shape of every split, in the same order and wording as before.
for message, part in (
        ('X_train matrix shape is: {0}', X_train),
        ('X_test matrix shape is: {0}', X_test),
        ('X_valid matrix shape is: {0}', X_valid),
        ('y_train matrix shape is: {0}', y_train),
        ('y_test matrix shape: {0}', y_test),
        ('y_valid matrix shape: {0}', y_valid),
):
    print(message.format(part.shape))

# Experiments

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and plot it as an annotated heat map.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled as true classes and columns as
        predicted classes on the plot.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        When True every row is divided by its sum before the matrix is
        printed and drawn. Rows that sum to zero stay all-zero.
    title : str, default ''
        Prefix of the plot title.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap handed to `imshow`.

    Returns
    -------
    None
        The matrix is printed and the figure is shown with `plt.show()`.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing, so that the colours, the colour bar, the cell
    # annotations and the text-colour threshold all describe the same values.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any sample would divide by zero; leave them at 0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        full_title = title + " normalized confusion matrix"
    else:
        full_title = title + ' confusion matrix, without normalization'

    print(cm)

    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(full_title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are written as-is; fractions get two decimals so the
    # annotations stay readable inside the cells.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    # Called after the axis labels exist so that they are included in the fit.
    plt.tight_layout()
    plt.show()

## Naïve-Bayes Gaussian

In [None]:
# Gaussian Naive Bayes: fit on the training split, evaluate on the test split.
nbg = GaussianNB()
nbg.fit(X_train, y_train)
y_pred = nbg.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
# The title names the variant so this plot can be told apart from the
# Bernoulli one.
plot_confusion_matrix(cm, l_enc.classes_, title='Naïve-Bayes Gaussian')

## Naïve-Bayes Bernoulli

In [None]:
# Bernoulli Naive Bayes: fit on the training split, evaluate on the test split.
# NOTE(review): BernoulliNB thresholds every feature (binarize=0.0 by
# default), so on SVD components it presumably only sees the sign of each
# value - confirm that this is intended.
nbb = BernoulliNB()
nbb.fit(X_train, y_train)
y_pred = nbb.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
# The title names the variant so this plot can be told apart from the
# Gaussian one.
plot_confusion_matrix(cm, l_enc.classes_, title='Naïve-Bayes Bernoulli')

## Random Forest

In [None]:
# Random forest: fit on the training split, evaluate on the test split.
# The forest is seeded like the data splits so the reported accuracy and
# confusion matrix can be reproduced.
rf = RandomForestClassifier(random_state=283)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
# Title fixed: this plot was labelled 'Naïve-Bayes' by copy-paste.
plot_confusion_matrix(cm, l_enc.classes_, title='Random Forest')