In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

In [2]:
dbpedia_df = pd.read_csv('../building-features-from-text-data/datasets/dbpedia_csv/train.csv', 
                       skiprows=1, names = ['Label', 'Name', 'Text'])

In [3]:
dbpedia_df.shape



(559999, 3)

### DBPedia classes

- Company
- EducationalInstitution
- Artist
- Athlete
- OfficeHolder
- MeanOfTransportation
- Building
- NaturalPlace
- Village
- Animal
- Plant
- Album
- Film
- WrittenWork

In [4]:
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [5]:
dbpedia_df = dbpedia_df.sample(10000, replace=False)

In [6]:
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
380616,10,Stilifer akahitode,Stilifer akahitode is a species of sea snail ...
500198,13,Ip Man 2,Ip Man 2 is a 2010 Hong Kong biographical mar...
88377,3,Achuta Manasa,Achuta Manasa is an Indian kuchipudi dancer.
377145,10,Bombyx incomposita,Bombyx incomposita is a moth in the Bombycida...
29060,1,W. W. Greener,W.W. Greener is a sporting shotgun and rifle ...
198749,5,Robert E. Holmes,Robert Edward Holmes (November 14 1922 – July...
488653,13,My Giant,My Giant is a 1998 comedy drama film starring...
320765,9,Lockland Ohio,Lockland is a village in Hamilton County Ohio...
7583,1,Fresh to Order,Fresh to Order (f2o) is an Atlanta Fine fast ...
95113,3,Donald Deskey,Donald Deskey (November 23 1894 – April 29 19...


In [7]:
dbpedia_df.shape

(10000, 3)

In [8]:
X = dbpedia_df['Text']

Y = dbpedia_df['Label']

In [9]:
X.head()

37295      Malibu Comics (also known as Malibu Graphics)...
543193     Data & Knowledge Engineering (DKE) is a journ...
221689     USS Swordfish (SS-193) a Sargo-class submarin...
374962     Dioryctria is a genus of snout moths. It was ...
240744     Midsummer House is a restaurant located in Ca...
Name: Text, dtype: object

In [10]:
Y.head()

37295      1
543193    14
221689     6
374962    10
240744     7
Name: Label, dtype: int64

In [11]:
def summarize_classification(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)

In [12]:
count_vectorizer = CountVectorizer()

feature_vector = count_vectorizer.fit_transform(X)

feature_vector.shape

(10000, 47912)

In [13]:
print(feature_vector[0])

  (0, 25727)	4
  (0, 10069)	3
  (0, 3015)	1
  (0, 22973)	2
  (0, 4138)	1
  (0, 17865)	1
  (0, 44316)	2
  (0, 3249)	1
  (0, 3150)	1
  (0, 10066)	1
  (0, 6616)	1
  (0, 33404)	1
  (0, 2204)	1
  (0, 20470)	3
  (0, 41304)	5
  (0, 23891)	1
  (0, 793)	1
  (0, 3321)	3
  (0, 13674)	1
  (0, 806)	1
  (0, 5866)	1
  (0, 16106)	1
  (0, 21167)	1
  (0, 42818)	1
  (0, 24564)	1
  (0, 29944)	1
  (0, 40128)	1
  (0, 41698)	2
  (0, 29511)	1
  (0, 42877)	1
  (0, 23582)	1
  (0, 20509)	2
  (0, 26866)	1
  (0, 6248)	2
  (0, 42815)	1
  (0, 29227)	1
  (0, 25786)	1
  (0, 14980)	1
  (0, 37256)	1
  (0, 10150)	1
  (0, 18936)	1
  (0, 7837)	1
  (0, 7874)	1
  (0, 20452)	1
  (0, 2632)	1
  (0, 14758)	1


In [14]:
X_dense = feature_vector.todense()

In [15]:
X_dense.shape

(10000, 47912)

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)

In [17]:
x_train.shape, x_test.shape

((8000, 47912), (2000, 47912))

In [18]:
y_train.shape, y_test.shape

((8000,), (2000,))

In [19]:
clf = GaussianNB().fit(x_train, y_train)

TypeError: np.matrix is not supported. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html

In [21]:
y_pred = clf.predict(x_test)
y_pred

array([ 5, 10,  4, ...,  5,  1, 14])

In [22]:
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1464
accuracy_score :  0.732
precision_score :  0.7480649828741391
recall_score :  0.732
