In [15]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.util import ngrams
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gjber\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gjber\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gjber\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
positive_path = '../Data 401 Project 2/aclImdb/train/pos/'
negative_path = '../Data 401 Project 2/aclImdb/train/neg/'

In [3]:
print("Number of texts in positive",len(os.listdir(positive_path)))
print("Number of texts in negative",len(os.listdir(negative_path)))


Number of texts in positive 12500
Number of texts in negative 12500


Now for the actual feature engineering. This can go many ways:

1. bag of words
2. word2vec
3. tri-gram
4. etc.
 
However, the overarching goal of this project is to classify the sentiment of text using linear classifiers.
As such, it appears that a bag of words approach will catch many important predictors (words like good, bad, love, hate etc.). However, this type of analysis will severely inflate the number of variables in the model. 

Possible work arounds could be stemming (or lemmatizing) words, removing stop words (these usually capture style and not sentiment), and only using words as predictors if they are in the top quartile of word frequencies or something of that nature.

Looking out for more succinct and creative ways to capture this sentiment is also perhaps an avenue worth pursuing.

## Read in data

In [4]:
#read text files from train folder
pos_train_txt = []
pos_train_label = []

for file_name in os.listdir(positive_path):
#     if file_name == '.ipynb_checkpoints':
#         continue
    data = open(positive_path + file_name, encoding='utf-8').read()
    pos_train_txt.append(data)
    pos_train_label.append('pos')
    
neg_train_txt = []
neg_train_label = []
for file_name in os.listdir(negative_path):
#     if file_name == '.ipynb_checkpoints':
#         continue
    data = open(negative_path + file_name, encoding='utf-8').read()
    neg_train_txt.append(data)
    neg_train_label.append('neg')

In [27]:
# Create a pandas dataframe from the text
train_pos = pd.DataFrame({'text':pos_train_txt,'label':pos_train_label})
train_neg = pd.DataFrame({'text':neg_train_txt,'label':neg_train_label})
train = train_pos.append(train_neg)
train.head()

Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,pos
1,Homelessness (or Houselessness as George Carli...,pos
2,Brilliant over-acting by Lesley Ann Warren. Be...,pos
3,This is easily the most underrated film inn th...,pos
4,This is not the typical Mel Brooks film. It wa...,pos


Begin Feature Engineering

In [28]:
# sentiment analysis

analyzer = SentimentIntensityAnalyzer()
train['score'] = train.text.apply(lambda x: nltk.sent_tokenize(x))
train.score = train.score.apply(lambda x: [analyzer.polarity_scores(sentence)['compound'] for sentence in x])

In [29]:
#taking the mean sentiment of sentence
train.score = train.score.apply(lambda x:  np.mean(x))

In [30]:
# Word tokenize first
train.text = train.text.apply(lambda x: nltk.word_tokenize(x))

In [31]:
stop_words = set(stopwords.words())
# Remove stop words and lower case remaining
# Note that order of sentence is lost in this implementation
# The set difference is much faster than alternatives for removing stop words, though
train.text = train.text.apply(lambda x: list(set(x).difference(stop_words)))
train.text = train.text.apply(lambda x: [word.lower() for word in x])

In [32]:
save_train = train.copy()

In [34]:
# First put the txt in a format that sklearn's CountVectorizer can use it\n
train['joinedtxt'] = train.text.apply(lambda x: ' '.join(x))
train['listtxt'] = train.joinedtxt.tolist()

In [36]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train.listtxt)
print(vectorizer.get_feature_names()[:5])

['00', '000', '0000000000001', '00001', '00015']


In [37]:
bag_of_words_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [38]:
keep_columns = bag_of_words_df.columns[bag_of_words_df.sum(axis = 0) > 100]

In [39]:
keep_columns = list(keep_columns)

In [40]:
train[keep_columns] = bag_of_words_df[keep_columns]

In [41]:
train.head()

Unnamed: 0,text,label,score,joinedtxt,listtxt,000,10,100,11,12,...,york,you,young,younger,your,youth,zero,zombie,zombies,zone
0,0,pos,0,teachers 'm . pathetic my student pity ... fin...,teachers 'm . pathetic my student pity ... fin...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,pos,0,young pictures fumes written already cause div...,young pictures fumes written already cause div...,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,pos,0,"scenes , scene lawyer says second all-time war...","scenes , scene lawyer says second all-time war...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,pos,0,truly traditionally . easily my someone homele...,truly traditionally . easily my someone homele...,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,pos,0,without fantastic . there audience followable ...,without fantastic . there audience followable ...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
train.to_csv('initial_feature_set.csv')

In [None]:
# # Lemmatize the remaining words
# lemmatizer = WordNetLemmatizer()
# train.text = train.text.apply(lambda x: [lemmatizer.lemmatize(word) for word in x])