In [12]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import pandas as pd

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow import keras
import numpy as np
from sklearn import naive_bayes

import nltk
nltk.download('words')
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import words

from nltk.stem import PorterStemmer,LancasterStemmer

stemmer = LancasterStemmer()

trans_table = {ord(c): None for c in string.punctuation + string.digits} 

def tokenize(text):
        # my text was unicode so I had to use the unicode-specific translate function. If your documents are strings, you will need to use a different `translate` function here. `Translated` here just does search-replace. See the trans_table: any matching character in the set is replaced with `None`
        tokens = [word for word in nltk.word_tokenize(text.translate(trans_table)) if re.match(r'[a-zA-Z][a-zA-Z]+', word)] #if len(word) > 1 because I only want to retain words that are at least two characters before stemming, although I can't think of any such words that are not also stopwords
        stems = [stemmer.stem(item) for item in tokens]
        return stems

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\PRADEEP\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PRADEEP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# DBPEDIA_test.xlsx
# text_classification_dataset.xlsx
# news_articles.xlsx
df = pd.read_excel("DBPEDIA_train.xlsx",nrows=50000)
X = df.loc[:,"Text"]
y = df.loc[:,"Type"]

In [14]:
len(df)

50000

In [15]:
classes = y.unique().tolist()
print(len(classes))
print(classes)

9
['Agent', 'Work', 'Place', 'Species', 'UnitOfWork', 'Event', 'SportsSeason', 'Device', 'TopicalConcept']


In [16]:
for i in range(0,len(y)):
    y[i] = classes.index(y[i]) 

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05 , random_state=1 , stratify=y)

In [18]:
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))


47500
2500
47500
2500


In [19]:
#tfidfvectorizer = TfidfVectorizer(tokenizer=tokenize,analyzer='word' , stop_words='english', min_df=0.005, max_df= 0.9)
tfidfvectorizer = TfidfVectorizer(analyzer='word' , stop_words='english',token_pattern = r'[a-zA-Z][a-zA-Z]+')

tfidfvectorizer.fit(X_train)

TfidfVectorizer(stop_words='english', token_pattern='[a-zA-Z][a-zA-Z]+')

In [20]:
print(len(tfidfvectorizer.get_feature_names()))
print(tfidfvectorizer.get_feature_names())

156365


In [21]:
features = tfidfvectorizer.get_feature_names()
print(len(tfidfvectorizer.get_feature_names()))
print(len(features))
new_voc = features

156365
156365


In [22]:
eng_words = words.words()
print(len(eng_words))
print(len(features))

236736
156365


In [23]:
features_filter = list(set(features) & set(eng_words))
len(features_filter)

27422

In [24]:
voc = features_filter
len(voc)

27422

In [25]:
vectorizer = TfidfVectorizer(vocabulary=list(set(voc)))
vectorizer._validate_vocabulary()
#print(vectorizer1.get_feature_names())
voc_filter = vectorizer.fit_transform(X_train)
voc_filter_arr = voc_filter.toarray()
#print(voc_filter[0])
#print(len(vectorizer.get_feature_names()))
#print(len(voc_filter_arr))

In [26]:
column_add = voc_filter_arr.sum(axis=0)       
len(column_add) 

27422

In [27]:
print(type(column_add))
print(sum(column_add))
print(column_add)

<class 'numpy.ndarray'>
192895.5321812419
[ 0.98591873  0.41598959  0.30695097 ...  0.18847677  0.93692081
 38.41490155]


In [28]:
column_add = column_add / len(voc_filter_arr)
print(type(column_add))
print(sum(column_add))
print(column_add)

<class 'numpy.ndarray'>
4.060958572236675
[2.07561838e-05 8.75767568e-06 6.46212565e-06 ... 3.96793202e-06
 1.97246485e-05 8.08734769e-04]


In [29]:
avg = sum(column_add) / len(column_add) 
print(str(avg))

0.00014809126147752444


In [30]:
cnt = 0
new_voc = [] 
for col in range(0,len(vectorizer.get_feature_names())):
     if column_add[col] > avg :
            cnt = cnt + 1
            new_voc.append(vectorizer.get_feature_names()[col])

print(len(vectorizer.get_feature_names()))
print(len(new_voc))

27422
4067


In [31]:
vectorizer_final_train = TfidfVectorizer(tokenizer=tokenize,vocabulary=list(set(new_voc)))#["hello","how","you","are"])
vectorizer_final_train._validate_vocabulary()
#print(vectorizer1.get_feature_names())

train_result = vectorizer_final_train.fit_transform(X_train)
train_result_arr = train_result.toarray()



vectorizer_final_test = TfidfVectorizer(tokenizer=tokenize,vocabulary=list(set(new_voc)))#["hello","how","you","are"])
vectorizer_final_test._validate_vocabulary()

test_result = vectorizer_final_test.fit_transform(X_test)
test_result_arr = test_result.toarray()

print(len(vectorizer_final_test.get_feature_names()))
print(len(vectorizer_final_train.get_feature_names()))


print(len(test_result_arr[0]))
print(len(test_result_arr))
print(len(y_test))

print(len(train_result_arr[0]))
print(len(train_result_arr))
print(len(y_train))

4067
4067
4067
2500
2500
4067
47500
47500


In [32]:
model = keras.Sequential([
    keras.layers.InputLayer(len(train_result_arr[0])),  # input layer (1)
    keras.layers.Dense(1000, activation='relu'),  # hidden layer (2)
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(len(classes), activation='softmax') # output layer (3)
])

In [33]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy']) 

In [34]:
model.fit(train_result_arr, np.asarray(y_train).astype('float32'), epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x219c742dd90>

In [26]:

test_loss, test_acc = model.evaluate(test_result_arr,  np.asarray(y_test).astype('float32'), verbose=1) 



In [35]:
 model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              4068000   
_________________________________________________________________
dense_1 (Dense)              (None, 100)               100100    
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 909       
Total params: 4,169,009
Trainable params: 4,169,009
Non-trainable params: 0
_________________________________________________________________


In [298]:
result = model.predict(test_result_arr)

In [None]:
def print_result(ind):
    print("Text : "+np.asarray(X_test)[ind]+"\n")
    np.set_printoptions(precision=10)
    np.set_printoptions(suppress=True)
    print(result[ind])
    print("\nPredicted : "+ classes[np.where(result[ind] == max(result[ind]))[0][0]])
    print("Actual    : " + classes[np.asarray(y_test)[ind]])
    print("Accuracy  : "+str(max(result[ind])*100))
    print("\n--------------------------------------------------------------------------------------------------------------\n")

for i in range(0,len(X_test)):
    print_result(i)

In [61]:
from nltk.stem import PorterStemmer,LancasterStemmer

word_stemmer = PorterStemmer()
Lanc_stemmer = LancasterStemmer()


In [75]:
#application', 'applications', 'applied', 'applies', 'apply', 'applying
strg = "runer"
print(word_stemmer.stem(strg))
print(Lanc_stemmer.stem(strg))

runer
run


In [25]:
from sklearn import metrics
classifier = naive_bayes.MultinomialNB()
import joblib
classifier.fit(train_result_arr, np.asarray(y_train).astype('float32'))
joblib.dump(classifier, "NBModel.jb") 

    
# predict the labels on validation dataset
predictions = classifier.predict(test_result_arr)

acc = metrics.accuracy_score(predictions, np.asarray(y_test).astype('float32'))

print(predictions)
print(np.asarray(y_test).astype('float32'))
print(acc)
    


[0. 0. 0. ... 0. 0. 0.]
[0. 3. 0. ... 0. 4. 0.]
0.7515


In [26]:
NBModel = joblib.load("NBModel.jb")

predictions = NBModel.predict(test_result_arr)

acc = metrics.accuracy_score(predictions, np.asarray(y_test).astype('float32'))

print(predictions)
print(np.asarray(y_test).astype('float32'))
print(acc)

[0. 0. 0. ... 0. 0. 0.]
[0. 3. 0. ... 0. 4. 0.]
0.7515


In [34]:
from autocorrect import Speller
spell = Speller(lang='en')

In [39]:
spell('s1x')

's1x'

In [40]:
from spellchecker import SpellChecker
spell = SpellChecker()

In [70]:
spell.correction("m0untaln")

'mountain'

In [None]:
#X_train, X_test, y_train, y_test

In [5]:
from sklearn import metrics
classifier = naive_bayes.MultinomialNB()
import joblib
classifier.fit(X_train,y_train)
joblib.dump(classifier, "NBModel.jb") 

ValueError: could not convert string to float: 'The Golden State Collegiate Baseball League (GSCBL) is a collegiate wood bat baseball league based out of San Jose, California. It is an 8-team league that was created in 2012 following the folding of the Pacific West Baseball League and features teams from California, Oregon and Nevada. It is run just like any professional baseball league, but only showcases the top college athletes that gives professional scouts a chance to see prospects. The GSCBL has one division with each franchise playing a 42-game schedule, but the teams are encouraged to play 8 to 10 non-league contests to get around 50 games a season.'

In [24]:
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import joblib
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())
pipe.fit(X_train,y_train)


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [25]:
pipe.predict(["HirfanlÄ± Dam is a dam in Turkey. The development was backed by the Turkish State Hydraulic Works. It was built by Wimpey Construction and was completed in 1959."])

array(['Place'], dtype='<U14')

In [26]:
joblib.dump(pipe, "NBModel.jb") 

['NBModel.jb']

In [27]:
NBModel = joblib.load("NBModel.jb")

In [31]:
NBModel.predict(["The 2008 Pennsylvania Republican primary was an election held on April 22 by the Pennsylvania Department of State in which voters chose their preference for the Republican Party's candidate for the 2008 U.S. Presidential election. Voters also chose the Pennsylvania Republican Party's candidates for various state and local offices. The selected candidates were placed on the ballot of the 2008 General Election on November 4, 2008. The Republican primary was part of a General Primary that also included the 2008 Pennsylvania Democratic primary. Polls opened at 7:00 am and closed at 8:00 pm. John McCain was the winner. He had already been declared the presumptive Republican Presidential nominee, having secured enough delegate votes in earlier primary contests to win the nomination at the 2008 Republican National Convention."])

array(['Agent'], dtype='<U14')