In [None]:
!pip -qq install indic-nlp-library    # Import Indic NLP library
from indicnlp.tokenize import indic_tokenize

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
!unzip translated_dataset

Archive:  FInal_dataset.zip
replace neg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 
error:  invalid response [{ENTER}]
replace neg? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace pos? [y]es, [n]o, [A]ll, [N]one, [r]ename: a
error:  invalid response [a]
replace pos? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
FInal_dataset.zip  neg	pos  sample_data


In [None]:
def word_tok(sents, num_sents):
    # Word tokenize each sentence
    tokenized_sents = []
    for sentence in sents[:num_sents]:
        tokenized_sent = indic_tokenize.trivial_tokenize(sentence)
        tokenized_sents.append(tokenized_sent)
    
    # Return array of array of tokens
    print(f"{len(tokenized_sents)} sentences tokenized. First 5 sentences:\n\t{tokenized_sents[:5]}")
    return tokenized_sents

# Read data into memory
pos_data = []
with open("pos", 'r') as f:
    lines = f.readlines()

for line in lines:
    line = line.strip()
    pos_data.append(line)

pos_lines = len(pos_data)
print(f"{pos_lines} lines of pos reviews read")

neg_data = []
with open("neg", 'r') as f:
    lines = f.readlines()

for line in lines:
    line = line.strip()
    neg_data.append(line)

neg_lines = len(neg_data)
print(f"{neg_lines} lines of neg reviews read")


# Word tokenize
pos_tokenized = word_tok(pos_data, len(pos_data))
neg_tokenized = word_tok(neg_data, len(neg_data))

14143 lines of pos reviews read
12882 lines of neg reviews read
14143 sentences tokenized. First 5 sentences:
	[['शानदार', ',', 'सभी', 'गायन', ',', 'सभी', 'नाच', 'दावत'], ['इस', 'फिल्म', 'का', 'रहस्य'], ['एक', 'चूसने', 'वाले', 'को', 'कभी', 'भी', 'ब्रेक', 'न', 'दें'], ['सुंदर', ',', 'सार्थक', ',', 'मजाकिया', ',', 'उदास', 'और', 'हमेशा', 'प्रासंगिक'], ['बॉलीवुड', 'के', 'साथ', 'प्यार', 'में', 'पड़ना', 'चाहते', 'हैं', '?', 'यहाँ', 'से', 'प्रारंभ', 'करें']]
12882 sentences tokenized. First 5 sentences:
	[['अतिरंजित', 'और', 'कमज़ोर', '।'], ['बस', 'एक', 'दुखद', 'कहानी'], ['यथोचित', 'अच्छी', 'हिंदी', 'फिल्म'], ['यहाँ', 'और', 'वास्तव', 'में', 'काफी', 'भयानक', '!'], ['गाइड', 'ने', '1956', 'की', 'फिल्म', 'बारिशवाला', 'से', 'प्रेरणा', 'ली']]


In [None]:
# vectorizer = CountVectorizer()

data = pos_data + neg_data
targets = [1 for i in range(pos_lines)] + [0 for i in range(neg_lines)]

x_train, x_test, y_train, y_test = train_test_split(data, targets, test_size=0.25, random_state=42, shuffle=True)

# counts = vectorizer.fit_transform(data)
# counts

In [None]:
# transformer = TfidfTransformer(smooth_idf=False)
# tfidf = transformer.fit_transform(counts)
# tfidf

<27025x3649 sparse matrix of type '<class 'numpy.float64'>'
	with 92440 stored elements in Compressed Sparse Row format>

In [None]:
# clf = MultinomialNB().fit(x_train, y_train)

<14143x2367 sparse matrix of type '<class 'numpy.float64'>'
	with 49024 stored elements in Compressed Sparse Row format>

In [None]:
svm_clf = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge',
                                           penalty='l2',
                                           alpha=1e-3,
                                           random_state=42,
                                           max_iter=5,
                                           tol=None)),
])

svm_clf.fit(x_train, y_train)
predicted = svm_clf.predict(x_test)
np.mean(predicted == y_test)

0.7584726949829806

In [None]:
nb_clf = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

nb_clf.fit(x_train, y_train)
predicted = nb_clf.predict(x_test)
np.mean(predicted == y_test)

0.7433772384194169

In [None]:
dt_clf = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', DecisionTreeClassifier()),
])

dt_clf.fit(x_train, y_train)
predicted = dt_clf.predict(x_test)
np.mean(predicted == y_test)

0.7384934142370875