In [None]:
#importing required packages
import numpy as np
import pandas as pd

#loading the review corpus into a DataFrame and previewing the first rows
CORPUS_PATH = '/content/drive/My Drive/corpus.csv'
dataset = pd.read_csv(CORPUS_PATH, encoding='latin-1')
dataset.head()

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,__label__2
1,The best soundtrack ever to anything.: I'm re...,__label__2
2,Amazing!: This soundtrack is my favorite musi...,__label__2
3,Excellent Soundtrack: I truly like this sound...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After H...",__label__2


In [None]:
#importing other required packages
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
#fixing NumPy's global random state so repeated runs give the same results
SEED = 500
np.random.seed(SEED)

In [None]:
#data_preprocessing steps

#Step-1 => Remove rows whose text is missing.
# dropna has to be applied to the DataFrame itself: calling it in place on the
# dataset['text'] column does not remove the rows from `dataset`, and the
# lower() call below would then fail on the NaN values that are left behind.
dataset = dataset.dropna(subset=['text']).reset_index(drop=True)

#Step-2 => Change the text to lowercase and tokenize it
dataset['text'] = [word_tokenize(str(entry).lower()) for entry in dataset['text']]

#Step-3 => Remove stop words and non-alphabetic tokens, then lemmatize
#WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Built once, outside the loop: the stop-word list was previously re-read for
# every single token and a new lemmatizer was created for every row.
english_stopwords = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

text_final = []
for entry in dataset['text']:
    # pos_tag provides the 'tag' i.e if the word is Noun(N) or Verb(V) or something else;
    # only its first letter is used to pick the WordNet part of speech.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word not in english_stopwords and word.isalpha()
    ]
    # Each row is stored as the string form of its word list, e.g. "['stun', 'even']"
    text_final.append(str(Final_words))

# Assign the whole column at once (positionally) instead of writing row by row
# with .loc, which used a positional counter as an index label.
dataset['text_final'] = text_final

In [None]:
#splitting the processed text and its labels: 70% for training, 30% for testing
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset['text_final'],
    dataset['label'],
    test_size=0.3,
)

In [None]:
#encoding the labels
encoder = LabelEncoder()
# Learn the label -> integer mapping from the training labels only ...
Y_train = encoder.fit_transform(Y_train)
# ... and reuse that same mapping for the test labels. Calling fit_transform
# again would refit the encoder on the test split, so the two splits could end
# up with different integer codes for the same label.
Y_test = encoder.transform(Y_test)

In [None]:
#word vectorization procedure
# Keep at most the 5000 highest-frequency terms as features.
tfidf_vect = TfidfVectorizer(max_features=5000)
# Fit on the training documents only: fitting on the full dataset lets the test
# documents influence the vocabulary and the IDF weights, which leaks test
# information into the features and inflates the reported accuracy.
tfidf_vect.fit(X_train)
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)
print(tfidf_vect.vocabulary_)



In [None]:
# Inspect the TF-IDF features produced for the training documents.
print(X_train_tfidf)

  (0, 4502)	0.3763188267807246
  (0, 4501)	0.15031494427382475
  (0, 3974)	0.35868777245753825
  (0, 3890)	0.2515140235472667
  (0, 3858)	0.2690675584422277
  (0, 3748)	0.34695623926050195
  (0, 3658)	0.2896999547088821
  (0, 3561)	0.29449641491430995
  (0, 2922)	0.229683025366997
  (0, 1940)	0.13406125327954532
  (0, 1536)	0.17761496997588844
  (0, 517)	0.321056290554803
  (0, 488)	0.12303572865008613
  (0, 238)	0.2448559358109696
  (1, 4687)	0.21384275526442909
  (1, 4069)	0.3566872275481094
  (1, 3434)	0.21279175847748263
  (1, 3319)	0.8157357261127677
  (1, 2595)	0.2173336717856602
  (1, 1252)	0.2074693534878867
  (1, 598)	0.1614401835472762
  (2, 4734)	0.21251405574612364
  (2, 4621)	0.17383471522304228
  (2, 4464)	0.11898591577849023
  (2, 4197)	0.13515537469996092
  :	:
  (6998, 2522)	0.11512409752599596
  (6998, 2130)	0.13650214385741868
  (6998, 1976)	0.07126908030410523
  (6998, 1788)	0.22013355385880556
  (6998, 1755)	0.19935027840675415
  (6998, 1719)	0.13508979239544552
  

In [None]:
# Preview the first ten rows, including the tokenized and processed text columns.
dataset.head(n=10)

Unnamed: 0,text,label,text_final
0,"[stuning, even, for, the, non-gamer, :, this, ...",__label__2,"['stun', 'even', 'sound', 'track', 'beautiful'..."
1,"[the, best, soundtrack, ever, to, anything, .,...",__label__2,"['best', 'soundtrack', 'ever', 'anything', 're..."
2,"[amazing, !, :, this, soundtrack, is, my, favo...",__label__2,"['amaze', 'soundtrack', 'favorite', 'music', '..."
3,"[excellent, soundtrack, :, i, truly, like, thi...",__label__2,"['excellent', 'soundtrack', 'truly', 'like', '..."
4,"[remember, ,, pull, your, jaw, off, the, floor...",__label__2,"['remember', 'pull', 'jaw', 'floor', 'hear', '..."
5,"[an, absolute, masterpiece, :, i, am, quite, s...",__label__2,"['absolute', 'masterpiece', 'quite', 'sure', '..."
6,"[buyer, beware, :, this, is, a, self-published...",__label__1,"['buyer', 'beware', 'book', 'want', 'know', 'r..."
7,"[glorious, story, :, i, loved, whisper, of, th...",__label__2,"['glorious', 'story', 'love', 'whisper', 'wick..."
8,"[a, five, star, book, :, i, just, finished, re...",__label__2,"['five', 'star', 'book', 'finish', 'read', 'wh..."
9,"[whispers, of, the, wicked, saints, :, this, w...",__label__2,"['whisper', 'wicked', 'saint', 'easy', 'read',..."


In [None]:
# Show the processed text column on its own.
print(dataset.loc[:, 'text_final'])

0       ['stun', 'even', 'sound', 'track', 'beautiful'...
1       ['best', 'soundtrack', 'ever', 'anything', 're...
2       ['amaze', 'soundtrack', 'favorite', 'music', '...
3       ['excellent', 'soundtrack', 'truly', 'like', '...
4       ['remember', 'pull', 'jaw', 'floor', 'hear', '...
                              ...                        
9995    ['revelation', 'life', 'small', 'town', 'ameri...
9996    ['great', 'biography', 'interesting', 'journal...
9997    ['interest', 'subject', 'poor', 'presentation'...
9998    ['buy', 'box', 'look', 'use', 'obviously', 'ne...
9999    ['beautiful', 'pen', 'fast', 'delivery', 'pen'...
Name: text_final, Length: 10000, dtype: object


In [None]:
#training a Multinomial Naive Bayes classifier on the TF-IDF training features
nb_classifier = naive_bayes.MultinomialNB()
nb_classifier.fit(X_train_tfidf, Y_train)

#labelling the held-out documents with the trained model
nb_predictions = nb_classifier.predict(X_test_tfidf)

#reporting the share of correct predictions as a percentage
nb_accuracy_pct = accuracy_score(nb_predictions, Y_test) * 100
print("Naive_Bayes Accuracy Score: {}".format(nb_accuracy_pct))

Naive_Bayes Accuracy Score: 83.1


In [None]:
#training a linear-kernel support vector classifier on the TF-IDF training features
svm_classifier = svm.SVC(
    C=1.0,
    kernel='linear',
    degree=3,
    gamma='auto',
)
svm_classifier.fit(X_train_tfidf, Y_train)

#labelling the held-out documents with the trained model
svm_predictions = svm_classifier.predict(X_test_tfidf)

#reporting the share of correct predictions as a percentage
svm_accuracy_pct = accuracy_score(svm_predictions, Y_test) * 100
print("SVM Accuracy Score: {}".format(svm_accuracy_pct))

SVM Accuracy Score: 84.7


In [None]:
#plotting the accuracy of both classifiers as a bar chart
import pandas as pd
import seaborn as sb
from matplotlib import pyplot as plt

# accuracy_score takes the true labels and the predictions as two separate
# arguments. The previous call wrapped them in one tuple and multiplied that
# tuple by 100, which raised a TypeError instead of producing a percentage.
model_names = ['Naive Bayes', 'SVM']
accuracy_pct = [
    accuracy_score(Y_test, nb_predictions) * 100,
    accuracy_score(Y_test, svm_predictions) * 100,
]

# Two scalar scores have no distribution to draw, so compare them side by side
# with a bar chart rather than a histogram/distplot.
fig, ax = plt.subplots()
sb.barplot(x=model_names, y=accuracy_pct, ax=ax)
ax.set(
    title='Test accuracy by classifier',
    xlabel='Classifier',
    ylabel='Accuracy (%)',
    ylim=(0, 100),
)
plt.show()

  import pandas.util.testing as tm


TypeError: ignored