In [11]:
import random
from collections import Counter

import numpy as np
from googletrans import Translator
from nltk.tokenize import word_tokenize
import codecs

In [12]:
# Line-count limit; NOTE(review): not referenced by any of the visible cells.
hm_lines = 5000000
# Shared googletrans client used by check_class to translate input into Hindi.
translator = Translator()
# Load the Hindi stop-word list, one word per line.
# - The context manager closes the file handle (the bare open().read() leaked it).
# - codecs.open performs no newline translation, so splitting on '\n' alone
#   would leave a trailing '\r' on every entry of a CRLF file and those entries
#   would never match a token; splitlines() + strip() removes that.
# - Blank lines are dropped so the list holds only real words.
with codecs.open("hindi_stopwords.txt", "r", encoding='utf-8', errors='ignore') as stopwords_file:
    stopwords = [
        entry.strip()
        for entry in stopwords_file.read().splitlines()
        if entry.strip()
    ]

In [13]:
def create_lexicon(pos_hin, neg_eng, pos_eng=None, neg_hin=None, max_count=60):
    """Build the vocabulary (lexicon) from the given corpus files.

    Each file is read as UTF-8 (undecodable bytes are ignored), split into
    samples on the '$' delimiter, and tokenized with ``word_tokenize``.
    Tokens found in the module-level ``stopwords`` list are discarded and
    the remaining tokens are counted across all files.

    Parameters
    ----------
    pos_hin, neg_eng : str
        Paths of the first two corpus files (required).
    pos_eng, neg_hin : str or None, optional
        Paths of two further corpus files. ``None`` (the default) skips the
        file, so the function can also be called with only two corpora.
    max_count : int, optional
        Exclusive upper bound on a word's total occurrence count; only words
        seen fewer than ``max_count`` times are kept. Defaults to 60.

    Returns
    -------
    list of str
        The retained words, in order of first appearance in the corpora.
    """
    # Set membership is O(1); the stop-word list would be scanned per token.
    ignored = set(stopwords)
    word_counts = Counter()
    for file_name in (pos_hin, neg_eng, pos_eng, neg_hin):
        if file_name is None:
            continue
        with codecs.open(file_name, 'r', encoding='utf-8', errors='ignore') as f:
            contents = f.read()
        for line in contents.split('$'):
            data = line.strip('\n')
            if data:
                # Counter preserves first-insertion order, which fixes the
                # order of the returned lexicon.
                word_counts.update(
                    word for word in word_tokenize(data) if word not in ignored
                )
    return [word for word, count in word_counts.items() if count < max_count]

In [14]:
def sample_handling(sample, lexicon, classification):
    """Turn one corpus file into a list of ``[features, classification]`` rows.

    The file is read as UTF-8 (undecodable bytes are ignored) and split into
    samples on the '$' delimiter. Each non-empty sample is tokenized, tokens
    in the module-level ``stopwords`` list are dropped, and a binary
    presence vector over ``lexicon`` is produced.

    Parameters
    ----------
    sample : str
        Path of the corpus file.
    lexicon : list of str
        Vocabulary; position ``i`` of a feature vector corresponds to
        ``lexicon[i]``.
    classification : object
        Label attached to every row from this file. The same object is
        stored in each row (it is not copied).

    Returns
    -------
    list
        One ``[features, classification]`` pair per non-empty sample, where
        ``features`` is a list of ``len(lexicon)`` floats (1.0 where the word
        occurs in the sample, else 0.0).
    """
    ignored = set(stopwords)
    # Map each word to its FIRST position, matching list.index() semantics,
    # so lookups are O(1) instead of scanning the lexicon for every token.
    word_to_index = {}
    for position, word in enumerate(lexicon):
        word_to_index.setdefault(word, position)

    featureset = []
    with codecs.open(sample, 'r', encoding="utf8", errors='ignore') as f:
        contents = f.read()
    for line in contents.split('$'):
        data = line.strip('\n')
        if not data:
            continue
        features = np.zeros(len(lexicon))
        for word in word_tokenize(data):
            if word in ignored:
                continue
            idx = word_to_index.get(word)
            if idx is not None:
                features[idx] = 1
        featureset.append([list(features), classification])
    return featureset

In [1]:
def create_feature_set_and_labels(pos_hin, neg_eng, pos_eng, neg_hin, test_size=0.2):
    """Build shuffled train/test feature vectors and labels from four corpora.

    A lexicon is created from all four files, then every sample is converted
    into a binary feature vector via ``sample_handling``. Samples from the
    two ``pos_*`` files get label 1 and samples from the two ``neg_*`` files
    get label 0.

    Parameters
    ----------
    pos_hin, neg_eng, pos_eng, neg_hin : str
        Paths of the corpus files.
    test_size : float, optional
        Fraction of the shuffled samples held out for testing (default 0.2).

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``: lists of feature vectors and
        their labels. Shuffling uses the global ``random`` state.
    """
    # The original passed the undefined name `neg_neg_hin` here (NameError).
    lexicon = create_lexicon(pos_hin, neg_eng, pos_eng, neg_hin)

    samples = []
    samples += sample_handling(pos_hin, lexicon, 1)
    samples += sample_handling(neg_eng, lexicon, 0)
    samples += sample_handling(pos_eng, lexicon, 1)
    samples += sample_handling(neg_hin, lexicon, 0)
    random.shuffle(samples)

    # Index of the first test sample, i.e. the number of training samples.
    split_at = int((1 - test_size) * len(samples))

    # Split with plain slicing: wrapping the ragged [vector, label] pairs in
    # np.array() raises ValueError on NumPy >= 1.24.
    train_rows = samples[:split_at]
    test_rows = samples[split_at:]

    x_train = [features for features, _ in train_rows]
    y_train = [label for _, label in train_rows]

    x_test = [features for features, _ in test_rows]
    y_test = [label for _, label in test_rows]
    return x_train, y_train, x_test, y_test

In [17]:
def check_class(text, lexicon):
    """Classify ``text`` with the saved DBN model and return the prediction.

    The text is translated to Hindi with the module-level ``translator``,
    tokenized, and encoded as a binary presence vector over ``lexicon`` (the
    same encoding ``sample_handling`` produces for training data). The model
    is loaded from ``dbn.pkl`` on every call.

    Parameters
    ----------
    text : str
        Input text in any language the translator accepts.
    lexicon : list of str
        Vocabulary; must be the one the saved model was trained with.

    Returns
    -------
    The value returned by ``classifier.predict`` for the single-row input
    (previously the prediction was computed and then discarded).
    """
    line = translator.translate(text, dest='hi').text
    # NOTE(review): SupervisedDBNClassification is not imported in the
    # visible cells; it has to be in scope before this function is called.
    classifier = SupervisedDBNClassification.load('dbn.pkl')

    # First-occurrence index map: O(1) lookups, same result as list.index().
    word_to_index = {}
    for position, word in enumerate(lexicon):
        word_to_index.setdefault(word, position)

    features = np.zeros(len(lexicon), dtype=np.float32)
    for word in word_tokenize(line):
        idx = word_to_index.get(word)
        if idx is not None:
            # Presence flag rather than a count, so the input matches the
            # binary features built by sample_handling.
            features[idx] = 1

    predict_set = np.array([features], dtype=np.float32)
    return classifier.predict(predict_set)

In [18]:
def create_feature_set_and_labels_simple(pos, neg, test_size=0.2):
    """Build shuffled train/test data from one positive and one negative corpus.

    Samples from ``pos`` are labelled ``[1, 0]`` and samples from ``neg``
    are labelled ``[0, 1]`` (one-hot labels).

    Parameters
    ----------
    pos, neg : str
        Paths of the positive and negative corpus files.
    test_size : float, optional
        Fraction of the shuffled samples held out for testing (default 0.2).

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``: lists of feature vectors and
        their one-hot labels. Shuffling uses the global ``random`` state.
    """
    # NOTE(review): only two corpus paths are passed; create_lexicon must
    # accept a two-file call for this to run.
    lexicon = create_lexicon(pos, neg)

    samples = []
    samples += sample_handling(pos, lexicon, [1, 0])
    samples += sample_handling(neg, lexicon, [0, 1])
    random.shuffle(samples)

    # Index of the first test sample, i.e. the number of training samples.
    split_at = int((1 - test_size) * len(samples))

    # Split with plain slicing: np.array() on [vector, label] pairs of
    # different lengths raises ValueError on NumPy >= 1.24, and silently
    # yields a 3-D numeric array when the lexicon has exactly two words.
    train_rows = samples[:split_at]
    test_rows = samples[split_at:]

    x_train = [features for features, _ in train_rows]
    y_train = [label for _, label in train_rows]

    x_test = [features for features, _ in test_rows]
    y_test = [label for _, label in test_rows]
    return x_train, y_train, x_test, y_test

In [21]:
if __name__ == '__main__':
    # Bind each corpus file to the parameter of the same name; the original
    # positional call passed 'neg_hindi.txt' as neg_eng and 'neg_eng.txt' as
    # neg_hin. The returned lexicon is not stored here.
    create_lexicon(
        pos_hin='pos_hindi.txt',
        neg_eng='neg_eng.txt',
        pos_eng='pos_eng.txt',
        neg_hin='neg_hindi.txt',
    )