# Sentiment and data mining

In [5]:
import os
import numpy as np

# Load the dataset
train_texts = []
train_labels = []
test_texts = []
test_labels = []
for dset in ['train', 'test']:
    for cat in ['pos', 'neg']:
        dset_path = os.path.join('data', dset, cat)
        for fname in sorted(os.listdir(dset_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(dset_path, fname)) as f:
                    if dset == 'train': train_texts.append(f.read())
                    else: test_texts.append(f.read())
                label = 0 if cat == 'neg' else 1
                if dset == 'train': train_labels.append(label)
                else: test_labels.append(label)

# Converting to np.array
train_texts = np.array(train_texts)
train_labels = np.array(train_labels)
test_texts = np.array(test_texts)
test_labels = np.array(test_labels)

In [45]:
import pandas as pd
train_folder = 'data/train/'
labels = {'pos': 1, 'neg': 0}

train_data = pd.DataFrame()
   
for l in ('pos', 'neg'):
    path = os.path.join(train_folder, l)
    for file in os.listdir (path) :
        with open(os.path.join(path, file),'r', encoding='utf-8') as infile:
            txt = infile.read()
        train_data = train_data.append([[txt, labels[l]]],ignore_index=True)
train_data.columns = ['review', 'sentiment']
train_data.to_csv('data/train_data.csv', sep=',', encoding='utf-8', index=False)


test_folder = 'data/test/'
test_data = pd.DataFrame()  
for l in ('pos', 'neg'):
    path = os.path.join(test_folder, l)
    for file in os.listdir (path) :
        with open(os.path.join(path, file),'r', encoding='utf-8') as infile:
            txt = infile.read()
        test_data = test_data.append([[txt, labels[l]]],ignore_index=True)
test_data.columns =  ['review', 'sentiment']
test_data.to_csv('data/test_data.csv', sep=',', encoding='utf-8', index=False)

In [76]:
train_data = pd.read_csv('data/train_data.csv', sep=',', encoding='utf-8', header=0)
test_data = pd.read_csv('data/test_data.csv', sep=',', encoding='utf-8', header=0)

train_data['review'].head()
test_data['review'].head()

0    Based on an actual story, John Boorman shows t...
1    This is a gem. As a Film Four production - the...
2    I really like this show. It has drama, romance...
3    This is the best 3-D experience Disney has at ...
4    Of the Korean movies I've seen, only three had...
Name: review, dtype: object

In [78]:
import re
from nltk.tokenize import RegexpTokenizer
STOPWORDS = set(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()

def preprocess_reviews(review):
    review = REPLACE_NO_SPACE.sub("", review.lower())
    review = REPLACE_WITH_SPACE.sub(" ", review)
    review = tokenizer.tokenize(review)
    
    review = [i for i in review if not i in STOPWORDS]

    # stem tokens
    review = [p_stemmer.stem(i) for i in review]
    
    
    return(" ".join(review)) 


train_data['review'] = train_data['review'].apply(preprocess_reviews)
test_data['review'] = test_data['review'].apply(preprocess_reviews)
print(train_data['review'].head())
print(test_data['review'].head())

0    movi get respect sure lot memor quot list gem ...
1    bizarr horror movi fill famou face stolen cris...
2    solid unremark film matthau einstein wonder fa...
3    strang feel sit alon theater occupi parent rol...
4    probabl alreadi know 5 addit episod never air ...
Name: review, dtype: object
0    base actual stori john boorman show struggl am...
1    gem film four product anticip qualiti inde del...
2    realli like show drama romanc comedi roll one ...
3    best 3 experi disney themepark certainli bette...
4    korean movi ive seen three realli stuck first ...
Name: review, dtype: object


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer='word', ngram_range=(2,2), min_df = 0)

print(cv.fit_transform(train_texts))

  (0, 665027)	1
  (0, 1217275)	1
  (0, 951807)	1
  (0, 1380846)	1
  (0, 449168)	1
  (0, 439118)	1
  (0, 654713)	1
  (0, 1257771)	1
  (0, 37755)	1
  (0, 827448)	1
  (0, 876476)	1
  (0, 31798)	1
  (0, 765459)	1
  (0, 1217881)	1
  (0, 424171)	1
  (0, 579229)	1
  (0, 1278236)	1
  (0, 1373879)	1
  (0, 1174186)	1
  (0, 1202947)	1
  (0, 1435609)	1
  (0, 882238)	1
  (0, 895600)	1
  (0, 1054146)	1
  (0, 1283916)	1
  :	:
  (24999, 108710)	1
  (24999, 1251683)	1
  (24999, 854560)	1
  (24999, 525249)	1
  (24999, 657212)	1
  (24999, 613689)	1
  (24999, 1283996)	1
  (24999, 880745)	1
  (24999, 1358691)	1
  (24999, 1432091)	1
  (24999, 242811)	1
  (24999, 728745)	1
  (24999, 118027)	1
  (24999, 1216819)	1
  (24999, 63226)	1
  (24999, 667757)	1
  (24999, 1260970)	2
  (24999, 415214)	1
  (24999, 1281969)	1
  (24999, 613840)	1
  (24999, 188196)	1
  (24999, 895600)	1
  (24999, 1241567)	1
  (24999, 880731)	1
  (24999, 627398)	1


In [15]:
analyze = cv.build_analyzer()

['futz is',
 'is the',
 'the only',
 'only show',
 'show preserved',
 'preserved from',
 'from the',
 'the experimental',
 'experimental theatre',
 'theatre movement',
 'movement in',
 'in new',
 'new york',
 'york in',
 'in the',
 'the 1960s',
 '1960s the',
 'the origins',
 'origins of',
 'of off',
 'off off',
 'off broadway',
 'broadway though',
 'though it',
 'it not',
 'not for',
 'for everyone',
 'everyone it',
 'it is',
 'is genuinely',
 'genuinely brilliant',
 'brilliant darkly',
 'darkly funny',
 'funny even',
 'even more',
 'more often',
 'often deeply',
 'deeply disturbing',
 'disturbing tale',
 'tale about',
 'about love',
 'love sex',
 'sex personal',
 'personal liberty',
 'liberty and',
 'and revenge',
 'revenge serious',
 'serious morality',
 'morality tale',
 'tale even',
 'even more',
 'more relevant',
 'relevant now',
 'now in',
 'in time',
 'time when',
 'when congress',
 'congress wants',
 'wants to',
 'to outlaw',
 'outlaw gay',
 'gay marriage',
 'marriage by',
 'by