In [1]:
import nltk
from nltk.corpus import treebank

In [2]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [3]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [4]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

In [5]:
treebank_sents = list(treebank.tagged_sents(tagset='universal'))

In [6]:
train_tagged_words = [tup for sent in treebank_sents for tup in sent]

In [7]:
tokens = [pair[0] for pair in train_tagged_words]
V = set(tokens)

In [8]:
T = set([pair[1] for pair in train_tagged_words])
len(T)

12

In [9]:
import numpy as np
import pandas as pd

In [10]:
t = len(T)
v = len(V)

In [11]:
from collections import defaultdict

In [12]:
def create_dictionaries(training_corpus):

    # initialize the dictionaries using defaultdict
    emission_counts = defaultdict(int)
    transition_counts = defaultdict(int)
    tag_counts = defaultdict(int)

    # Initialize "prev_tag" (previous tag) with the start state, denoted by '.'
    prev_tag = '.'

    # use 'i' to track the line number in the corpus
    i = 0

    # Each item in the training corpus contains a word and its POS tag
    for word_tag in training_corpus:

        # Increment the word_tag count
        i += 1

        # Every 50,000 words, print the word count
        if i % 50000 == 0:
            print(f"word count = {i}")


        word, tag = word_tag[0],word_tag[1]

        # Increment the transition count for the previous word and tag
        transition_counts[(prev_tag, tag)] += 1

        # Increment the emission count for the tag and word
        emission_counts[(tag, word)] += 1

        # Increment the tag count
        tag_counts[tag] += 1

        # Set the previous tag to this tag (for the next iteration of the loop)
        prev_tag = tag



    return emission_counts, transition_counts, tag_counts

In [13]:
emission_counts, transition_counts, tag_counts = create_dictionaries(train_tagged_words)

word count = 50000
word count = 100000


In [14]:
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [15]:
tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)):
        tags_matrix[i, j] = (t2_given_t1(t2, t1)[0]+1)/(t2_given_t1(t2, t1)[1]+len(T)) # Add-1 Laplace smoothing for transition probability

In [16]:
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))

In [17]:
import math

In [18]:
def Viterbi(words,V,T):
    state = []
    T = list(T)
    best_probs = np.zeros((len(T), len(words)))
    best_paths = np.zeros((len(T), len(words)), dtype=int)
    for a,tag in enumerate(T):
      best_probs[a,0]=math.log(tags_df.loc['.', tag])+math.log((emission_counts[(tag,words[0])]+1)/(tag_counts[tag]+len(V))) #Add-1 Laplace smoothing to
                                                                                                                              #to calculate emission probability
    for i in range(1, len(words)):

        # For each unique POS tag that the current word can be
      for j,tagc in enumerate(T): # complete this line

            # Initialize best_prob for word i to negative infinity
          best_prob_i = float("-inf")

            # Initialize best_path for current word i to None
          best_path_i = None

            # For each POS tag that the previous word can be:
          for k,tagp in enumerate(T):

                # Calculate the probability (Applied log transformation as without log trasformation, tags after certain words turned out to be same,possibly due to long length of reviews which leads to very small values after a certain point, which are approximated as zero)
              prob = best_probs[k, i - 1]+math.log(tags_df.loc[tagp,tagc])+math.log((emission_counts[(tagc,words[i])]+1)/(tag_counts[tagc]+len(V)))

                # check if this path's probability is greater than
                # the best probability up to and before this point
              if prob > best_prob_i:

                    # Keep track of the best probability
                  best_prob_i = prob

                    # keep track of the POS tag of the previous word
                    # that is part of the best path.
                    # Save the index (integer) associated with
                    # that previous word's POS tag
                  best_path_i = k

            # Save the best probability for the
            # given current word's POS tag
            # and the position of the current word inside the corpus(words)
          best_probs[j,i] = best_prob_i

            # Save the unique integer ID of the previous POS tag
            # into best_paths matrix, for the POS tag of the current word
            # and the position of the current word inside the corpus(words).
          best_paths[j,i] = best_path_i

    m = best_paths.shape[1]

    # Initialize array z, same length as the corpus(words)
    z = [None] * m

    # Get the number of unique POS tags
    num_tags = best_probs.shape[0]

    # Initialize the best probability for the last word
    best_prob_for_last_word = float('-inf')

    # Initialize pred array, same length as corpus(words)
    pred = [None] * m


    # Go through each POS tag for the last word (last column of best_probs)
    # in order to find the row (POS tag integer ID)
    # with highest probability for the last word
    for k in range(num_tags):

        # If the probability of POS tag at row k
        # is better than the previosly best probability for the last word:
        if best_probs[k, m - 1] > best_prob_for_last_word:

            # Store the new best probability for the last word
            best_prob_for_last_word = best_probs[k, m - 1]

            # Store the unique integer ID of the POS tag
            # which is also the row number in best_probs
            z[m - 1] = k

    # Convert the last word's predicted POS tag
    # from its unique integer ID into the string representation
    # using the 'states'(Tags) list
    # store this in the 'pred' array for the last word
    pred[m - 1] = T[k]

    # Find the best POS tags by walking backward through the best_paths
    # From the last word in the corpus to the 0th word in the corpus
    for i in range(len(words) - 1, -1, -1):

        # Retrieve the unique integer ID of
        # the POS tag for the word at position 'i' in the corpus
        pos_tag_for_word_i = z[i]


        z[i - 1] = best_paths[pos_tag_for_word_i, i]


        pred[i - 1] = T[z[i - 1]]

    return pred

In [19]:
import nltk
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [21]:
categories = movie_reviews.categories()
documents = [(movie_reviews.raw(fileid), category) for category in categories for fileid in movie_reviews.fileids(category)]

In [22]:
def convert(f):
  return ([i for i in f.split()])

In [23]:
train_words=[]
for sent,_ in documents:
  train_words.append(convert(sent))

In [24]:
train_words[0]

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an',
 'accident',
 '.',
 'one',
 'of',
 'the',
 'guys',
 'dies',
 ',',
 'but',
 'his',
 'girlfriend',
 'continues',
 'to',
 'see',
 'him',
 'in',
 'her',
 'life',
 ',',
 'and',
 'has',
 'nightmares',
 '.',
 "what's",
 'the',
 'deal',
 '?',
 'watch',
 'the',
 'movie',
 'and',
 '"',
 'sorta',
 '"',
 'find',
 'out',
 '.',
 '.',
 '.',
 'critique',
 ':',
 'a',
 'mind-fuck',
 'movie',
 'for',
 'the',
 'teen',
 'generation',
 'that',
 'touches',
 'on',
 'a',
 'very',
 'cool',
 'idea',
 ',',
 'but',
 'presents',
 'it',
 'in',
 'a',
 'very',
 'bad',
 'package',
 '.',
 'which',
 'is',
 'what',
 'makes',
 'this',
 'review',
 'an',
 'even',
 'harder',
 'one',
 'to',
 'write',
 ',',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'which',
 'attempt',
 'to',
 'break',
 'the',
 'mold',
 ',',
 'mess',
 'with',
 'your',
 'head',
 'and',
 

In [25]:
len(train_words[0])

825

In [26]:
s=Viterbi(train_words[0],V,T)

In [27]:
s

['NOUN',
 '.',
 'NUM',
 'X',
 'PRT',
 'VERB',
 'PRT',
 'DET',
 'NOUN',
 'NOUN',
 '.',
 'NOUN',
 'CONJ',
 'ADV',
 'VERB',
 '.',
 'PRON',
 'VERB',
 'ADP',
 'DET',
 'NOUN',
 '.',
 'NUM',
 'ADP',
 'DET',
 'NOUN',
 'NOUN',
 '.',
 'CONJ',
 'PRON',
 'VERB',
 'VERB',
 'PRT',
 'VERB',
 'X',
 'ADP',
 'PRON',
 'NOUN',
 '.',
 'CONJ',
 'VERB',
 'X',
 '.',
 'ADP',
 'DET',
 'NOUN',
 '.',
 'VERB',
 'DET',
 'NOUN',
 'CONJ',
 'VERB',
 'X',
 'PRT',
 'VERB',
 'ADP',
 '.',
 '.',
 '.',
 'NOUN',
 '.',
 'DET',
 'ADJ',
 'NOUN',
 'ADP',
 'DET',
 'ADJ',
 'NOUN',
 'ADP',
 'NOUN',
 'ADP',
 'DET',
 'ADV',
 'ADJ',
 'NOUN',
 '.',
 'CONJ',
 'VERB',
 'PRON',
 'ADP',
 'DET',
 'ADV',
 'ADJ',
 'NOUN',
 '.',
 'DET',
 'VERB',
 'PRON',
 'VERB',
 'DET',
 'NOUN',
 'DET',
 'ADV',
 'ADJ',
 'NOUN',
 'PRT',
 'VERB',
 '.',
 'ADP',
 'DET',
 'ADJ',
 'NOUN',
 'ADP',
 'DET',
 'NOUN',
 'PRT',
 'VERB',
 'DET',
 'NOUN',
 '.',
 'NOUN',
 'ADP',
 'PRON',
 'NOUN',
 'CONJ',
 'ADJ',
 'NOUN',
 'VERB',
 'NOUN',
 'CONJ',
 'ADJ',
 'NOUN',
 '.',
 'C

In [None]:
train_words1=[]
for i in train_words:
  temp=Viterbi(i,V,T)
  train_words1.append(temp)

In [None]:
import csv
with open("nlp.csv","w") as f:
  wr=csv.writer(f)
  wr.writerows(train_words1)

In [30]:
import csv
file='nlp.csv'
with open(file) as f:
  reader=csv.reader(f)
  lst=list(reader)

In [31]:
train_words1=lst

In [32]:
train_labels=[]
for sent,_ in documents:
  train_labels.append(_)

In [33]:
train_labels

['neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',


In [34]:
train_labels1 = [(label == 'pos') for label in train_labels]

In [35]:
train_labels1

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,


In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(
train_words1, train_labels1, test_size=0.2, random_state=42)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(
train_words, train_labels1, test_size=0.2, random_state=42)

In [39]:
X_train1,X_val1,y_train1,y_val1=train_test_split(
X_train1, y_train1, test_size=0.1, random_state=42)

In [40]:
X_train,X_val,y_train,y_val=train_test_split(
X_train, y_train, test_size=0.1, random_state=42)

In [41]:
len(X_train[0])

969

In [42]:
vectorizer = TfidfVectorizer(max_features=400)
X_train = vectorizer.fit_transform([' '.join(words) for words in X_train]).toarray()
X_val = vectorizer.transform([' '.join(words) for words in X_val]).toarray()
X_test = vectorizer.transform([' '.join(words) for words in X_test]).toarray()

In [43]:
vectorizer = TfidfVectorizer(max_features=200)
X_train1 = vectorizer.fit_transform([' '.join(words) for words in X_train1]).toarray()
X_val1 = vectorizer.transform([' '.join(words) for words in X_val1]).toarray()
X_test1 = vectorizer.transform([' '.join(words) for words in X_test1]).toarray()

In [44]:
X_train1[0]

array([0.22575036, 0.36354604, 0.05863646, 0.12908976, 0.43097796,
       0.70070567, 0.01811127, 0.10847745, 0.07628031, 0.31956869])

In [45]:
X_train3=np.hstack((X_train,X_train1))

In [46]:
X_val3=np.hstack((X_val,X_val1))
X_test3=np.hstack((X_test,X_test1))

In [47]:
X_train3.shape

(1440, 410)

In [48]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
SVCClf = SVC(kernel = 'poly',degree=3, gamma = 'scale',)

# Train a classifier (Naive Bayes)
classifier = MultinomialNB()
#classifier.fit(X_train, y_train)
SVCClf.fit(X_train3, y_train)

# Predict
#predictions = classifier.predict(X_val)
predictions1 = SVCClf.predict(X_val3)

In [49]:
from sklearn.metrics import classification_report
print(classification_report(y_val, predictions1))

              precision    recall  f1-score   support

       False       0.88      0.83      0.85        87
        True       0.81      0.86      0.83        73

    accuracy                           0.84       160
   macro avg       0.84      0.85      0.84       160
weighted avg       0.85      0.84      0.84       160



In [50]:
predictions = SVCClf.predict(X_test3)

In [51]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

       False       0.79      0.80      0.79       199
        True       0.80      0.79      0.79       201

    accuracy                           0.79       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.79      0.79      0.79       400

