In [1]:
import numpy as np
import pandas as pd

In [2]:
df_l = pd.read_csv('data/ag_news_csv/classes.txt', header=None)

In [3]:
categories = df_l.iloc[:,0]

categories = categories.values.tolist()

print(categories)

['World', 'Sports', 'Business', 'Sci/Tech']


In [31]:
# load the training dataset
df = pd.read_csv('data/ag_news_csv/train.csv', header=None)

#to shuffle data into pandas data frame
df_training = df.sample(frac=1, random_state=0)

In [32]:
labels_training, texts_training = df_training.iloc[:, 0], df_training.iloc[:, 2]

labels_training = np.array(labels_training.values.tolist())
texts_training = texts_training.values.tolist()

In [34]:
print(type(labels_training))
print(type(texts_training))

print(len(labels_training))
print(len(texts_training))

<class 'numpy.ndarray'>
<class 'list'>
120000
120000


## Tokenizing text with 

You can go with CountVectorizer. CharNGramAnalyzer function is depricated.

CountVectorizer v/s CharNGramAnalyzer -- | 
https://stackoverflow.com/a/46602813/4386025

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range = (1, 1))
X_train_counts = count_vect.fit_transform(texts_training)
X_train_counts.shape

(120000, 60741)

In [58]:
#it will print number of times given word is appeared. Here, "oil" is your word.
print(count_vect.vocabulary_.get('oil'))

38505


In [10]:
#From occurrences to frequencies

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(120000, 60741)

In [39]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(120000, 60741)

In [40]:
#Training a classifier

In [41]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, labels_training)

In [42]:
print(categories)

['World', 'Sports', 'Business', 'Sci/Tech']


In [50]:
#Testing Sample

docs_new = ['crude price', 'crude price 123']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, label_ in zip(docs_new, predicted):
    print('%r => %s' % (doc, categories[label_]))
    print('%r => %s' % (doc, label_))

'crude price' => Sci/Tech
'crude price' => 3
'crude price 123' => Sci/Tech
'crude price 123' => 3


## Building a pipeline

In [44]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [45]:
text_clf.fit(texts_training, labels_training)

## Evaluation of the performance on the test set

In [46]:
# load the training dataset
df_test = pd.read_csv('data/ag_news_csv/test.csv', header=None)

#to shuffle data into pandas data frame
df_test = df_test.sample(frac=1, random_state=0)

In [47]:
labels_test, texts_test = df_test.iloc[:, 0], df_test.iloc[:, 2]

labels_test = np.array(labels_test.values.tolist())
texts_test = texts_test.values.tolist()

In [48]:
predicted = text_clf.predict(texts_test)
np.mean(predicted == labels_test)

0.8935526315789474

## performance analysis

In [49]:
from sklearn import metrics
print(metrics.classification_report(labels_test, predicted, target_names=categories))

metrics.confusion_matrix(labels_test, predicted)

              precision    recall  f1-score   support

       World       0.91      0.89      0.90      1900
      Sports       0.95      0.97      0.96      1900
    Business       0.86      0.84      0.85      1900
    Sci/Tech       0.86      0.87      0.87      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600



array([[1687,   62,  101,   50],
       [  29, 1845,   11,   15],
       [  71,   21, 1605,  203],
       [  71,   23,  152, 1654]])