# Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from scipy import sparse
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm

In [None]:
class ProprecessingData:
  """
  Load the URL classification CSV and convert URLs into TF-IDF features.

  The CountVectorizer and TfidfTransformer fitted by ``tfidf_train`` are
  stored on the instance (``self.vectorizer``, ``self.tfidf_transformer``)
  so that ``tfidf_test`` can apply the same vocabulary and IDF weights.
  """

  # Default CSV location, used when no path is passed to the constructor.
  DEFAULT_PATH = '../input/url-classification-dataset-dmoz/URL Classification.csv'
  # Tokens handed to CountVectorizer as stop words (they are ignored when counting).
  STOP_WORDS = ('http', 'www', 'com', 'net',
                'org', 'jp', 'bc',
                'html', 'htm', 'index')

  def __init__(self, path_file: str = DEFAULT_PATH):
    """
    Input: path_file: path of the CSV file to read; defaults to DEFAULT_PATH.
    """
    self.path_file = path_file

  def load_data(self) -> tuple:
    """
    Read the CSV file with pandas.
    Output: tuple (X, y) where X is the "URL" column and y is the
    "category" column, both as pandas Series.
    """
    name = ["URL", "category"]
    # na_filter=False keeps empty fields as empty strings instead of NaN.
    df = pd.read_csv(self.path_file, names=name, na_filter=False)
    X = df["URL"]
    y = df["category"]

    return X, y

  def split_data(self, test_size, random_state: int = 42) -> tuple:
    """
    Split the data into a train set and a test set.
    Input: test_size: forwarded to train_test_split.
    random_state: seed forwarded to train_test_split (default 42).
    Output: tuple (X_train, X_test, y_train, y_test)
    """
    X, y = self.load_data()
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

  def tfidf_train(self, X_train: pd.Series) -> sparse.csr_matrix:
    """
    Fit the count vectorizer and the TF-IDF transformer on X_train
    and transform X_train.
    Output: TF-IDF vectors, type csr_matrix
    """
    self.vectorizer = CountVectorizer(stop_words=list(self.STOP_WORDS))
    word_count_vector = self.vectorizer.fit_transform(X_train)
    self.tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tf_idf_vector = self.tfidf_transformer.fit_transform(word_count_vector)

    return tf_idf_vector

  def tfidf_test(self, X_test: pd.Series) -> sparse.csr_matrix:
    """
    Transform X_test with the vectorizer and TF-IDF transformer that
    tfidf_train fitted (nothing is re-fitted here, so tfidf_train must
    have been called first).
    Output: TF-IDF vectors, type csr_matrix
    """
    word_count_vector_test = self.vectorizer.transform(X_test)
    tf_idf_vector_test = self.tfidf_transformer.transform(word_count_vector_test)
    return tf_idf_vector_test

In [None]:
# Load the dataset and hold out 20% of the rows as the test set.
propre_data = ProprecessingData()
X_train, X_test, y_train, y_test = propre_data.split_data(test_size=0.2)

In [None]:
# Fit the count vectorizer and TF-IDF transformer on the training URLs and transform them.
tf_idf_vector = propre_data.tfidf_train(X_train)

In [None]:
# Transform the test URLs with the vectorizer and TF-IDF transformer fitted on the training set.
tf_idf_vector_test = propre_data.tfidf_test(X_test)

# Visualize Data

In [None]:
# Bar chart of the number of training samples per class.
labels_count = np.unique(y_train, return_counts=True)
# Take the tick labels from the data itself so they can never get out of sync
# with the classes (and their sorted order) that np.unique actually returns.
labels = [str(c) for c in labels_count[0]]
fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(labels_count[0], labels_count[1], 0.5, tick_label=labels, color='green')
ax.set_xlabel('Category')
ax.set_ylabel('Number of samples')
ax.set_title('Class distribution (train set)')
plt.show()

In [None]:
# Bar chart of the number of test samples per class.
labels_count = np.unique(y_test, return_counts=True)
# Take the tick labels from the data itself so they can never get out of sync
# with the classes (and their sorted order) that np.unique actually returns.
labels = [str(c) for c in labels_count[0]]
fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(labels_count[0], labels_count[1], 0.5, tick_label=labels, color='green')
ax.set_xlabel('Category')
ax.set_ylabel('Number of samples')
ax.set_title('Class distribution (test set)')
plt.show()

Note that the data is imbalanced: the number of samples differs considerably between the class labels.

In [None]:
# Inspect the test TF-IDF features: matrix shape and container type.
print(tf_idf_vector_test.shape)
print(type(tf_idf_vector_test))

The output above shows the shape of the data after the TF-IDF conversion; its data type is csr_matrix.

# Build Model

In [None]:
class MultinomialNB:
  """
  Multinomial Naive Bayes classifier with additive smoothing.

  After ``fit``:
    _classes    sorted unique class labels
    _priors     prior probability of each class (not in log space)
    _likelihood log of the smoothed, normalised feature sums of each class,
                shape (n_classes, n_features)
  """

  def __init__(self, alpha: float=1.0):
    # alpha is the constant added to every per-class feature sum (smoothing).
    self.alpha = alpha

  def fit(self, X_train, y_train):
    """
    Fit the training data to Naive Bayes.
    Input: X_train: csr_matrix (or 2-D numpy array) of shape (m, n)
    y_train: sequence of m class labels (list, numpy array or pandas Series)
    Raises: ValueError if y_train does not hold one label per row of X_train.
    """
    m, n = X_train.shape
    # Convert once so the boolean masks below are plain numpy arrays.
    y = np.asarray(y_train)
    if y.shape[0] != m:
      raise ValueError(
          "X_train has %d rows but y_train has %d labels" % (m, y.shape[0]))
    self._classes = np.unique(y)
    n_classes = len(self._classes)
    # Init prior vector and log-likelihood matrix
    self._priors = np.zeros(n_classes)
    self._likelihood = np.zeros((n_classes, n))
    for idx, c in enumerate(self._classes):
      # Rows belonging to class c
      X_train_c = X_train[y == c]
      self._priors[idx] = X_train_c.shape[0] / m
      # Sparse sums come back as a (1, n) np.matrix; flatten to a 1-D array
      # and compute the smoothed sums only once.
      smoothed = np.asarray(X_train_c.sum(axis=0)).ravel() + self.alpha
      self._likelihood[idx, :] = np.log(smoothed / smoothed.sum())

  def cal_c_likelihood(self, c_likeli, x_test):
    """
    Multiply x_test by the log of c_likeli (reshaped to a column).
    Not called by fit, _predict or predict.
    NOTE(review): self._likelihood already stores logs, so passing one of its
    rows here would take the log twice - confirm intended use before calling.
    """
    return x_test * np.log(c_likeli)[:, np.newaxis]

  def _predict(self, x_test):
    """
    Predict the class of a single sample.
    Input: x_test: one row (1 x n csr_matrix or 1-D numpy array)
    Output: the class label with the largest log-posterior, i.e.
    log prior + sum of feature values weighted by the log-likelihood.
    """
    # '@' is always a matrix product; '*' only is for scipy.sparse matrix
    # types, and is element-wise for sparse arrays and numpy arrays.
    scores = np.asarray(x_test @ self._likelihood.T).ravel() + np.log(self._priors)
    return self._classes[np.argmax(scores)]

  def predict(self, X_test) -> list:
    """
    Predict the output for X_test.
    Input: csr_matrix (or 2-D numpy array) of shape (k, n)
    Output: list of k class labels

    All rows are scored with a single matrix product instead of a Python
    loop over rows, so no progress bar is displayed any more.
    """
    if not (sparse.issparse(X_test) or isinstance(X_test, np.ndarray)):
      # Generic iterable of rows: fall back to row-by-row prediction.
      return [self._predict(x_test) for x_test in X_test]
    # (k, n) @ (n, n_classes) -> (k, n_classes) matrix of log-posteriors
    scores = np.asarray(X_test @ self._likelihood.T) + np.log(self._priors)
    return list(self._classes[np.argmax(scores, axis=1)])

In [None]:
# Instantiate the Naive Bayes classifier with its default smoothing (alpha=1.0).
clf = MultinomialNB()

In [None]:
# Estimate the class priors and per-class log-likelihoods from the training TF-IDF matrix.
clf.fit(tf_idf_vector, y_train)

In [None]:
# Predict a class label for every row of the test TF-IDF matrix.
y_pred = clf.predict(tf_idf_vector_test)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test set.
target_names = np.unique(y_test)
# Pass labels= explicitly so each report row is tied to its name; with
# target_names alone, the names are matched positionally against the sorted
# union of the labels found in y_test and y_pred.
print(classification_report(y_test, y_pred, labels=target_names, target_names=target_names))

The averaging method is straightforward: just take the average of the system's precision and recall over the different classes.
A weighted average is an average that treats some values in the data set as more important than others.


The model's accuracy is 0.38, but the F1-score of some classes is very low, e.g. News = 0.0 and Shopping = 0.02. This shows that the model fails to predict the classes that have little data.

In [None]:
# Row-normalised confusion matrix: each row (true class) sums to 1.
class_names = np.unique(y_test)
# Fix the label order explicitly so rows/columns line up with class_names.
array = confusion_matrix(y_test, y_pred, labels=class_names)
cm = array.astype('float') / array.sum(axis=1)[:, np.newaxis]
# Label the axes with the real class names instead of placeholder characters.
df_cm = pd.DataFrame(cm,
                     index=pd.Index(class_names, name='True label'),
                     columns=pd.Index(class_names, name='Predicted label'))
plt.figure(figsize = (20,15))
sns.heatmap(df_cm, annot=True)