## Sentiment Analysis on test data
- Training is based on the pos.txt and neg.txt files.
- First, our data is in language/word form rather than numerical form, so it needs to be converted into a vector of features.
- Second realization: our texts may not all contain the same number of words or characters.
- One option we have is to compile a list of all unique words in the training set. Let's say that's 3,500 unique words. These words are our lexicon. For each word of an input text, we check whether it is in the lexicon. If so, the entry of the training vector at that word's lexicon index is incremented (it is 1 if the word occurs once). This is a very simple bag-of-words model.

In [17]:
#import our dependencies
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import pickle
import random
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [18]:
# Maximum number of lines read from each input file.
hm_lines = 100000
# Single WordNet lemmatizer instance shared by the functions in this file.
lemmatizer = WordNetLemmatizer()

In [19]:
#build the lexicon (vocabulary) that defines our feature vector
def create_lexicon(pos,neg):
    """Build the lexicon from the two sample files.

    At most ``hm_lines`` lines are read from each of ``pos`` and ``neg``.
    Every line is lower-cased and tokenized, and each token is lemmatized.
    Only lemmas that occur more than 10 and fewer than 1000 times across
    both files are kept.

    The size of the resulting lexicon is printed before it is returned.

    Args:
        pos: Path of the first text file to read.
        neg: Path of the second text file to read.

    Returns:
        A list of the retained lemmas, in order of first appearance.
    """
    tokens = []
    for path in (pos, neg):
        with open(path, 'r') as handle:
            for line in handle.readlines()[:hm_lines]:
                tokens.extend(word_tokenize(line.lower()))

    # Count lemmas, not raw tokens, so inflected forms share one entry.
    frequencies = Counter(lemmatizer.lemmatize(token) for token in tokens)

    # Drop very rare and very common lemmas.
    kept = [word for word, count in frequencies.items() if 10 < count < 1000]
    print(len(kept))
    return kept

In [20]:
def sample_handling(sample,lexicon,classification):
    """Turn each line of a sample file into a bag-of-words feature vector.

    At most ``hm_lines`` lines are read from ``sample``. Each line is
    lower-cased, tokenized and lemmatized; for every resulting word found
    in ``lexicon`` the vector entry at that word's lexicon index is
    incremented by one.

    Args:
        sample: Path of the text file to read.
        lexicon: List of words; its length fixes the feature vector length.
        classification: Label stored alongside every feature vector from
            this file (the same object is referenced by every entry).

    Returns:
        A list of ``[features, classification]`` pairs, one per line read,
        where ``features`` is a list of per-word counts.
    """
    # Build the lookup once: a dict gives O(1) membership/index queries
    # instead of scanning the lexicon list twice per token. setdefault keeps
    # the first position of a repeated word, matching list.index().
    word_to_index = {}
    for position, word in enumerate(lexicon):
        word_to_index.setdefault(word, position)

    featureset = []
    with open(sample,'r') as f:
        for line in f.readlines()[:hm_lines]:
            features = np.zeros(len(lexicon))
            for token in word_tokenize(line.lower()):
                word = lemmatizer.lemmatize(token).lower()
                index_value = word_to_index.get(word)
                if index_value is not None:
                    # The original indexed with the undefined name
                    # `index_values`, raising NameError on the first hit.
                    features[index_value] += 1
            featureset.append([list(features), classification])
    return featureset

In [21]:
# classification = [pos_value,neg_value] ,i.e, for a positive sample, classification = [1,0]
def create_feature_sets_and_labels(pos,neg,test_size = 0.1):
    """Build shuffled train/test feature vectors and labels.

    A lexicon is created from both files, every line of ``pos`` is labelled
    ``[1,0]`` and every line of ``neg`` is labelled ``[0,1]``. The combined
    samples are shuffled and the last ``int(test_size * n_samples)`` of them
    are held out for testing.

    Args:
        pos: Path of the file with positive samples.
        neg: Path of the file with negative samples.
        test_size: Fraction of the samples reserved for testing.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of lists, where the
        ``x`` lists hold feature vectors and the ``y`` lists hold labels.
    """
    lexicon = create_lexicon(pos,neg)
    features = []
    # Use the paths passed in; the original ignored them and always read
    # 'pos.txt' / 'neg.txt'.
    features += sample_handling(pos,lexicon,[1,0])
    features += sample_handling(neg,lexicon,[0,1])
    #for statistical reasons and for better training, we can't have all positive and negative training examples together
    random.shuffle(features)

    # Split with plain list slicing. The original wrapped the list in
    # np.array([features]), which added an extra axis: len() was then 1,
    # testing_size became 0 and [:-0] produced an empty training set.
    # Building an array from these ragged pairs also fails on current NumPy.
    testing_size = int(test_size*len(features))
    split_at = len(features) - testing_size
    training, testing = features[:split_at], features[split_at:]

    train_x = [sample[0] for sample in training]
    train_y = [sample[1] for sample in training]
    test_x = [sample[0] for sample in testing]
    test_y = [sample[1] for sample in testing]
    return train_x,train_y,test_x,test_y

In [22]:
if __name__ == '__main__':
    # NOTE: tokenizing requires the NLTK 'punkt' tokenizer data to be
    # installed (obtainable through the NLTK Downloader); without it
    # word_tokenize raises LookupError.
    # The unpacked names must match the ones pickled below; the original
    # bound `train_X` and then dumped the undefined `train_x`.
    train_x,train_y,test_x,test_y = create_feature_sets_and_labels('pos.txt','neg.txt')
    # Persist the prepared data set so it can be loaded later without
    # rebuilding it.
    with open('sentiment_set.pickle','wb') as f:
        pickle.dump([train_x,train_y,test_x,test_y],f)

LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/english.pickle' not found.  Please
  use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - 'C:\\Users\\Sarthak Srivastava/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\Users\\Sarthak Srivastava\\Anaconda3\\nltk_data'
    - 'C:\\Users\\Sarthak Srivastava\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Sarthak Srivastava\\AppData\\Roaming\\nltk_data'
    - ''
**********************************************************************