In [19]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.util import ngrams
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
positive_path = '/data401/reviews/train/pos/'
negative_path = '/data401/reviews/train/neg/'

In [21]:
print("Number of texts in positive",len(os.listdir(positive_path)))
print("Number of texts in negative",len(os.listdir(negative_path)))


Number of texts in positive 12501
Number of texts in negative 12501


Now for the actual feature engineering. This can go many ways:

1. bag of words
2. word2vec
3. tri-gram
4. etc.
 
However, the overarching goal of this project is to classify the sentiment of text using linear classifiers.
As such, it appears that a bag of words approach will catch many important predictors (words like good, bad, love, hate etc.). However, this type of analysis will severely inflate the number of variables in the model. 

Possible work arounds could be stemming (or lemmatizing) words, removing stop words (these usually capture style and not sentiment), and only using words as predictors if they are in the top quartile of word frequencies or something of that nature.

Looking out for more succinct and creative ways to capture this sentiment is also perhaps an avenue worth pursuing.

## Read in data

In [8]:
#read text files from train folder
pos_train_txt = []
pos_train_label = []

for file_name in os.listdir(positive_path):
    if file_name == '.ipynb_checkpoints':
        continue
    data = open(positive_path + file_name, encoding='utf-8').read()
    pos_train_txt.append(data)
    pos_train_label.append('pos')
    
neg_train_txt = []
neg_train_label = []
for file_name in os.listdir(negative_path):
    if file_name == '.ipynb_checkpoints':
        continue
    data = open(negative_path + file_name, encoding='utf-8').read()
    neg_train_txt.append(data)
    neg_train_label.append('neg')

In [22]:
# Create a pandas dataframe from the text
train_pos = pd.DataFrame({'text':pos_train_txt,'label':pos_train_label})
train_neg = pd.DataFrame({'text':neg_train_txt,'label':neg_train_label})
train = train_pos.append(train_neg)
train.head()

Unnamed: 0,text,label
0,The Shining is a weird example of adaptation: ...,pos
1,In 1967 I visited the Lake Elsinore glider-por...,pos
2,I think Hollow Point is a funny film with some...,pos
3,I notice the DVD version seems to have missing...,pos
4,Men of Honor has many great aspects to it. Goo...,pos


## Let's begin by removing stop words and lemmatizing the rest.

- lemmatizing over stemming is chosen because it should produce a smaller subset of features

In [17]:
train.iloc[1]['text']

"In 1967 I visited the Lake Elsinore glider-port and flew a yellow Pratt Read sailplane. Returning to Germany the above serious ran on TV and one segment was about the high altitude sailplane flights in California in the early 50ies. (The real life pilot was Bill Ivans, I don't know who played him in the series) It turned out that the sailplane in the film was the same (same N-number) as the one I had flown at Lake Elsinore. Ever since I saw that segment I have been searching for it and have been wondering if it is somewhere available. (other segments in that serious were about the Baker Ejection Seat; an instrument to find avalanche victims etc."

In [None]:
# sentiment analysis

analyzer = SentimentIntensityAnalyzer()
train['score'] = train.text.apply(lambda x: nltk.sent_tokenize(x))
train.score = train.score.apply(lambda x: [analyzer.polarity_scores(sentence)['compound'] for sentence in x])

In [12]:
#taking the mean sentiment of sentence

train.score = train.score.apply(lambda x:  np.mean(x))

In [13]:
# Word tokenize first
train.text = train.text.apply(lambda x: nltk.word_tokenize(x))

In [None]:
# # Remove stop words
train.text = train.text.apply(lambda x: [word for word in x if word not in stopwords.words()])

In [None]:
# Lemmatize the remaining words
lemmatizer = WordNetLemmatizer()
train.text = train.text.apply(lambda x: [lemmatizer.lemmatize(word) for word in x])