In [10]:
conda install nltk
import nltk
nltk.download()

In [161]:
#1. Preprocessing (reading the dataset, tokenization, stopword removal,
#   noise removal, lowercasing, stemming, lemmatization, spelling correction)
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from autocorrect import Speller

import os

#stopwords
stwords = stopwords.words('english')  # list form, kept for any code that still refers to it
english_stopwords = set(stwords)  # set form: fast membership tests inside the token loops

#stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

#corpus reader
# The corpus directory can be overridden with the SHAKESPEARE_CORPUS_DIR environment
# variable; when it is unset the original location is used.
corpus_path = os.environ.get('SHAKESPEARE_CORPUS_DIR', 'F:\\Literatura za faks\\MASTER\\NLP\\Sekspir')
# The dot before "txt" is escaped so that only names ending in ".txt" match.
corpuses = PlaintextCorpusReader(corpus_path, r'.*\.txt')

#speller
speller = Speller(lang='en')

#raw text
shakespeare_text = corpuses.raw('sentences.txt')
modern_text = corpuses.raw('modern_sentences.txt')

#sentences
shakespeare_sentences = sent_tokenize(shakespeare_text)
modern_sentences = sent_tokenize(modern_text)

#containers for the processed sentences: "_s" = stemmed, "_l" = lemmatized
processed_shakespeare_sentences_s = []
processed_modern_sentences_s = []

processed_shakespeare_sentences_l = []
processed_modern_sentences_l = []

def _preprocess_sentences(sentences, spell_correct):
    """Clean, tokenize and normalize sentences in two variants.

    Each sentence has every non-letter character replaced by a space, is
    lowercased and tokenized. Tokens are optionally spell-corrected, stopwords
    are dropped, and the remaining tokens are stemmed and lemmatized.

    Args:
        sentences: iterable of raw sentence strings.
        spell_correct: when true, every token goes through `speller` before the
            stopword test; when false, tokens are used as they are.

    Returns:
        A pair `(stemmed, lemmatized)` of lists holding one space-joined
        string per input sentence.
    """
    stemmed_sentences = []
    lemmatized_sentences = []
    for sentence in sentences:
        cleaned = re.sub('[^A-Za-z]', ' ', sentence).lower()  # remove non-text, lower all letters
        stems = []
        lemmas = []
        for word in word_tokenize(cleaned):
            token = speller(word) if spell_correct else word
            if token in english_stopwords:
                continue
            stems.append(stemmer.stem(token))
            lemmas.append(lemmatizer.lemmatize(token))
        # wrap up words into a sentence and save it
        stemmed_sentences.append(" ".join(stems))
        lemmatized_sentences.append(" ".join(lemmas))
    return stemmed_sentences, lemmatized_sentences


#processing all shakespeare sentences (with spelling correction)
processed_shakespeare_sentences_s, processed_shakespeare_sentences_l = _preprocess_sentences(
    shakespeare_sentences, spell_correct=True)

#processing all modern sentences
# NOTE(review): spelling correction is switched off for the modern corpus, exactly as
# in the original loop; confirm this asymmetry between the two classes is intended.
processed_modern_sentences_s, processed_modern_sentences_l = _preprocess_sentences(
    modern_sentences, spell_correct=False)

In [162]:
#2. Attribute extraction by using Bag of Words
from sklearn.feature_extraction.text import CountVectorizer


def _vocabulary(vectorizer):
    """Return the fitted vectorizer's feature names as a list.

    `get_feature_names_out` is preferred; `get_feature_names` is the fallback
    for scikit-learn versions that do not provide the newer method.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


# Each vocabulary keeps at most 250 terms.
matrix_s = CountVectorizer(max_features=250)
matrix_l = CountVectorizer(max_features=250)

# Shakespeare sentences first, modern sentences second; `y` below relies on this order.
processed_sentences_s = processed_shakespeare_sentences_s + processed_modern_sentences_s
processed_sentences_l = processed_shakespeare_sentences_l + processed_modern_sentences_l

X_s = matrix_s.fit_transform(processed_sentences_s).toarray()
X_l= matrix_l.fit_transform(processed_sentences_l).toarray()
y = ['shakespeare'] * len(processed_shakespeare_sentences_s) + ['modern'] * len(processed_modern_sentences_s)

#3. Naive Bayes Classification
feature_names_s = _vocabulary(matrix_s)
feature_names_l = _vocabulary(matrix_l)
# (features, label)

def generate_features_s(vector):
    """Turn one row of stemmed bag-of-words values into a feature dict.

    The value at position i of `vector` is stored under the key
    `feature_names_s[i]`.
    """
    return {feature_names_s[position]: value for position, value in enumerate(vector)}

def generate_features_l(vector):
    """Turn one row of lemmatized bag-of-words values into a feature dict.

    The value at position i of `vector` is stored under the key
    `feature_names_l[i]`.
    """
    return {feature_names_l[position]: value for position, value in enumerate(vector)}

from nltk import NaiveBayesClassifier
import random

# Pair every bag-of-words row with its label: (feature dict, label).
X_nltk_s = [(generate_features_s(x), y[ind]) for ind, x in enumerate(X_s)]
X_nltk_l = [(generate_features_l(x), y[ind]) for ind, x in enumerate(X_l)]

#randomize set
# A fixed seed makes the shuffle, and therefore the train/test split and every
# accuracy computed from it, repeatable across kernel restarts.
random.seed(42)
random.shuffle(X_nltk_s)
random.shuffle(X_nltk_l)
number_of_entries = len(X_nltk_s)
half_of_entries = number_of_entries // 2
# The first half of each shuffled list is held out for testing; the rest is used for training.
train_set_s, test_set_s = X_nltk_s[half_of_entries:], X_nltk_s[:half_of_entries]
train_set_l, test_set_l = X_nltk_l[half_of_entries:], X_nltk_l[:half_of_entries]

print(train_set_s[0])
print(train_set_l[0])
classifierNLTK_s = NaiveBayesClassifier.train(train_set_s)
classifierNLTK_l = NaiveBayesClassifier.train(train_set_l)

({'ah': 0, 'alarm': 0, 'answer': 0, 'arm': 0, 'art': 0, 'ask': 0, 'away': 0, 'ay': 0, 'back': 0, 'bear': 0, 'beauti': 0, 'best': 0, 'better': 0, 'blood': 0, 'bodi': 0, 'boy': 0, 'brave': 0, 'break': 0, 'bring': 0, 'brother': 0, 'call': 0, 'care': 0, 'clifford': 0, 'come': 0, 'could': 0, 'cousin': 0, 'cri': 0, 'crown': 0, 'day': 0, 'dead': 0, 'death': 0, 'die': 0, 'done': 0, 'draw': 0, 'duke': 0, 'earl': 0, 'eight': 0, 'either': 0, 'els': 0, 'end': 0, 'enemi': 0, 'england': 0, 'enter': 0, 'er': 0, 'even': 0, 'event': 0, 'ever': 0, 'everi': 0, 'eye': 0, 'face': 0, 'faith': 0, 'fall': 0, 'far': 0, 'farewel': 0, 'father': 0, 'fear': 0, 'fight': 0, 'find': 0, 'first': 0, 'five': 0, 'fli': 0, 'follow': 0, 'foot': 0, 'four': 0, 'franc': 0, 'friend': 0, 'full': 0, 'game': 0, 'gentl': 0, 'get': 0, 'give': 0, 'gloucest': 0, 'go': 0, 'god': 0, 'good': 0, 'got': 1, 'grace': 0, 'great': 0, 'hand': 0, 'hang': 0, 'happi': 0, 'hast': 0, 'hath': 0, 'head': 0, 'hear': 0, 'heart': 0, 'heaven': 0, 'help':

In [163]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer


def _tfidf_feature_names(vectorizer):
    """Return the fitted vectorizer's feature names as a list.

    Uses `get_feature_names_out` when the installed scikit-learn provides it
    and falls back to `get_feature_names` otherwise.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


# Each TF-IDF vocabulary keeps at most 250 terms.
tf_s = TfidfVectorizer(max_features=250)
tf_l = TfidfVectorizer(max_features=250)

text_tf_s = tf_s.fit_transform(processed_sentences_s)
text_tf_l = tf_l.fit_transform(processed_sentences_l)

feature_names_tf_s = _tfidf_feature_names(tf_s)
feature_names_tf_l = _tfidf_feature_names(tf_l)

def generate_features_tf_s(vector):
    """Turn one row of stemmed TF-IDF values into a feature dict.

    The value at position i of `vector` is stored under the key
    `feature_names_tf_s[i]`.
    """
    return {feature_names_tf_s[position]: value for position, value in enumerate(vector)}

def generate_features_tf_l(vector):
    """Turn one row of lemmatized TF-IDF values into a feature dict.

    The value at position i of `vector` is stored under the key
    `feature_names_tf_l[i]`.
    """
    return {feature_names_tf_l[position]: value for position, value in enumerate(vector)}

import random

# Densify each TF-IDF matrix once and walk it row by row. This yields the same values
# as reading text_tf[i, j] cell by cell, without one sparse lookup per cell.
data_tf_s = [(generate_features_tf_s(row), y[i]) for i, row in enumerate(text_tf_s.toarray())]
data_tf_l = [(generate_features_tf_l(row), y[i]) for i, row in enumerate(text_tf_l.toarray())]

# A fixed seed keeps the TF-IDF train/test split repeatable.
random.seed(42)
random.shuffle(data_tf_s)
random.shuffle(data_tf_l)
# Same split size as the bag-of-words experiment (half_of_entries comes from that cell).
train_set_tf_s, test_set_tf_s = data_tf_s[half_of_entries:], data_tf_s[:half_of_entries]
train_set_tf_l, test_set_tf_l = data_tf_l[half_of_entries:], data_tf_l[:half_of_entries]

# NOTE(review): NLTK's NaiveBayesClassifier appears to treat every feature value as a
# discrete category, so continuous TF-IDF weights may be a poor fit - confirm.
classifier_s = nltk.classify.NaiveBayesClassifier.train(train_set_tf_s)
classifier_l = nltk.classify.NaiveBayesClassifier.train(train_set_tf_l)


In [164]:
# Accuracy of the Naive Bayes model trained on stemmed bag-of-words counts,
# measured on the held-out half of the shuffled data (test_set_s).
print('Stemmer Bayes classificator accuracy')
nltk.classify.accuracy(classifierNLTK_s, test_set_s)

Stemmer Bayes classificator accuracy


0.836101660996598

In [165]:
# Accuracy of the Naive Bayes model trained on lemmatized bag-of-words counts,
# measured on the held-out half of the shuffled data (test_set_l).
print('Lemmatizer Bayes classificator accuracy')
nltk.classify.accuracy(classifierNLTK_l, test_set_l)

Lemmatizer Bayes classificator accuracy


0.8399039423654192

In [166]:
# Accuracy of the Naive Bayes model trained on stemmed TF-IDF features (test_set_tf_s).
# NOTE(review): NLTK's NaiveBayesClassifier appears to treat feature values as discrete
# categories, which presumably explains the near-chance score on continuous weights - confirm.
nltk.classify.accuracy(classifier_s, test_set_tf_s)


0.5207124274564738

In [167]:
# Accuracy of the Naive Bayes model trained on lemmatized TF-IDF features (test_set_tf_l).
nltk.classify.accuracy(classifier_l, test_set_tf_l)


0.5347208324994998

In [192]:
# Number of examples in the stemmed TF-IDF training split.
print(len(train_set_tf_s))

4997


In [193]:
# Train decision-tree (DT) and maximum-entropy (ME) classifiers on the same
# bag-of-words training splits used for Naive Bayes: "_s" = stemmed, "_l" = lemmatized.
classifierDT_s = nltk.classify.DecisionTreeClassifier.train(train_set_s)
classifierDT_l = nltk.classify.DecisionTreeClassifier.train(train_set_l)
# trace=0 requests the lowest level of training diagnostics from MaxentClassifier.train.
classifierME_s = nltk.classify.MaxentClassifier.train(train_set_s, trace=0)
classifierME_l = nltk.classify.MaxentClassifier.train(train_set_l, trace=0)

In [194]:
# Decision tree trained on the stemmed TF-IDF training split.
classifierDT_s_IDF = nltk.classify.DecisionTreeClassifier.train(train_set_tf_s)


In [195]:
# Decision tree on the lemmatized TF-IDF split, then maximum-entropy models on
# both TF-IDF splits ("_s" = stemmed, "_l" = lemmatized).
classifierDT_l_IDF = nltk.classify.DecisionTreeClassifier.train(train_set_tf_l)
classifierME_s_IDF = nltk.classify.MaxentClassifier.train(train_set_tf_s, trace=0)
classifierME_l_IDF = nltk.classify.MaxentClassifier.train(train_set_tf_l, trace=0)




In [196]:
# Decision tree, stemmed bag-of-words features: accuracy on the held-out split.
nltk.classify.accuracy(classifierDT_s, test_set_s)


0.8206924154492695

In [197]:
# Decision tree, lemmatized bag-of-words features: accuracy on the held-out split.
nltk.classify.accuracy(classifierDT_l, test_set_l)


0.8332999799879928

In [198]:
# Maximum entropy, stemmed bag-of-words features: accuracy on the held-out split.
nltk.classify.accuracy(classifierME_s, test_set_s)


0.7872723634180508

In [199]:
# Maximum entropy, lemmatized bag-of-words features: accuracy on the held-out split.
nltk.classify.accuracy(classifierME_l, test_set_l)


0.7808685211126676

In [200]:
# Decision tree, stemmed TF-IDF features: accuracy on the held-out split.
nltk.classify.accuracy(classifierDT_s_IDF, test_set_tf_s)


0.5461276766059636

In [201]:
# Decision tree, lemmatized TF-IDF features: accuracy on the held-out split.
nltk.classify.accuracy(classifierDT_l_IDF, test_set_tf_l)


0.5875525315189113

In [202]:
# Maximum entropy, stemmed TF-IDF features: accuracy on the held-out split.
nltk.classify.accuracy(classifierME_s_IDF, test_set_tf_s)


0.5481288773263958

In [203]:
# Maximum entropy, lemmatized TF-IDF features: accuracy on the held-out split.
nltk.classify.accuracy(classifierME_l_IDF, test_set_tf_l)


0.5539323594156494

In [223]:
# Hand-written probe sentences. The trailing comment on each line is the label the
# author expects; every sentence ends with ". " so they can be concatenated directly.
test_sentences1 = 'We will die all three: But I will prove that two kings are as good As him and me. '  # shakespeare
test_sentences2 = 'Hello, how are you and what are you doing today, and will you go out with friends tonight? '  # modern
test_sentences3 = 'One girl going to see nine dolphins. '  # modern
test_sentences4 = 'In that he went too far away. '  # modern
test_sentences5 = 'AUTHORITIES CLAIMS THAT THE KILLINGS WERE THE crime OF ANGER. '  # modern
test_sentences6 = 'I was very little when i lost my tooth. '  # modern
test_sentences7 = 'But, gracious sir, Here are your sons again, and I must lose Two of the sweet\'st companions in the world. '  # shakespeare
test_sentences8 = 'King George didn\'t fear anything and decided that he is going to see that spectacle. '  # modern
test_sentences9 = 'George six didn\'t fear anything and he looked at something. '  # modern
test_sentences10= 'Panmure stabbing: Bradford Kipa pleads guilty to murder of John Tofu Ioane. '  # modern
test_sentences11= 'Panmure stabbing: Bradford Kipa pleads guilty to murder of John Tofu Ioane which died. '  # modern

# One text block holding all probes in order, ready for sentence tokenization.
test_sentences = ''.join((
    test_sentences1,
    test_sentences2,
    test_sentences3,
    test_sentences4,
    test_sentences5,
    test_sentences6,
    test_sentences7,
    test_sentences8,
    test_sentences9,
    test_sentences10,
    test_sentences11,
))
def test_sentences_func(input_sentences):
    sentences = sent_tokenize(input_sentences)
    #remove non-text from sentences
    #tokenize sentences
    sents_features_and_values = []

    for sentence in sentences:
        sent = re.sub('[^A-Za-z]', ' ', sentence) #remove non-text
        sent = sent.lower() #lower all letters
        sent = word_tokenize(sent) #tokenize sentence
        features_and_values = {word : 0 for word in feature_names_l} #processed sentences stemmer
    
        #processing one sentence
        for word in sent:
            spelled_word = speller(word)
            if spelled_word not in english_stopwords:
                transformed_word = stemmer.stem(spelled_word) #stemming
                if transformed_word in feature_names_l:
                    features_and_values[transformed_word] = features_and_values[transformed_word] + 1
        sents_features_and_values.append(features_and_values)   
    for i in range(0, len(sentences)):
        res = classifierNLTK_l.classify(sents_features_and_values[i])
        print('Sentence number %d is' %(i + 1), res)
# Classify the hand-written probe sentences; one predicted label is printed per sentence.
test_sentences_func(test_sentences)

Sentence number 1 is shakespeare
Sentence number 2 is modern
Sentence number 3 is modern
Sentence number 4 is modern
Sentence number 5 is modern
Sentence number 6 is modern
Sentence number 7 is shakespeare
Sentence number 8 is shakespeare
Sentence number 9 is modern
Sentence number 10 is modern
Sentence number 11 is shakespeare


In [208]:
# List up to 260 of the most informative features of the Naive Bayes model
# trained on lemmatized bag-of-words counts.
classifierNLTK_l.show_most_informative_features(260)


Most Informative Features
                    king = 1              shakes : modern =     41.0 : 1.0
                    fear = 1              shakes : modern =     30.6 : 1.0
                   going = 1              modern : shakes =     29.6 : 1.0
                  talent = 1              shakes : modern =     27.9 : 1.0
               something = 1              modern : shakes =     24.1 : 1.0
                  looked = 1              modern : shakes =     21.5 : 1.0
                    mine = 1              shakes : modern =     20.8 : 1.0
                   peace = 1              shakes : modern =     20.5 : 1.0
                    fall = 1              shakes : modern =     19.8 : 1.0
                    soul = 1              shakes : modern =     18.5 : 1.0
                     set = 1              shakes : modern =     18.3 : 1.0
                 soldier = 1              shakes : modern =     17.8 : 1.0
                     son = 1              shakes : modern =     17.5 : 1.0