# Sentiment and data mining

## Task 1

In [1]:
import pandas as pd
import os
import numpy as np
from nltk.corpus import stopwords

train_folder = 'data/train/'
test_folder = 'data/test/'
labels = {'pos': 1, 'neg': 0}


def load_labeled_reviews(folder, label_map=labels):
    """Read every review file below ``folder/<label>/`` into a DataFrame.

    Parameters
    ----------
    folder : str
        Directory that contains one sub-directory per key of ``label_map``.
    label_map : dict
        Maps a sub-directory name to the integer sentiment stored for its files.

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` (file content) and ``sentiment`` (mapped label),
        one row per file, in ``os.listdir`` order within each label.
    """
    # Collect plain tuples and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for label_name, label_value in label_map.items():
        label_dir = os.path.join(folder, label_name)
        for file_name in os.listdir(label_dir):
            with open(os.path.join(label_dir, file_name), 'r', encoding='utf-8') as infile:
                records.append((infile.read(), label_value))
    # Passing the column names here also keeps an empty folder from crashing.
    return pd.DataFrame(records, columns=['review', 'sentiment'])


train_data = load_labeled_reviews(train_folder)
train_data.to_csv('data/train_data.csv', sep=',', encoding='utf-8', index=False)

test_data = load_labeled_reviews(test_folder)
test_data.to_csv('data/test_data.csv', sep=',', encoding='utf-8', index=False)

In [2]:
# Reload both splits from the CSV files so the following cells can start
# from disk instead of re-reading the individual review files.
csv_options = dict(sep=',', encoding='utf-8', header=0)
train_data = pd.read_csv('data/train_data.csv', **csv_options)
test_data = pd.read_csv('data/test_data.csv', **csv_options)

# Preview the first reviews of each split (train first, then test).
for split_frame in (train_data, test_data):
    print(split_frame['review'].head())

0    For a movie that gets no respect there sure ar...
1    Bizarre horror movie filled with famous faces ...
2    A solid, if unremarkable film. Matthau, as Ein...
3    It's a strange feeling to sit alone in a theat...
4    You probably all already know this by now, but...
Name: review, dtype: object
0    Based on an actual story, John Boorman shows t...
1    This is a gem. As a Film Four production - the...
2    I really like this show. It has drama, romance...
3    This is the best 3-D experience Disney has at ...
4    Of the Korean movies I've seen, only three had...
Name: review, dtype: object


In [3]:
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

# English stop words from NLTK, held in a set for membership tests.
STOPWORDS = set(stopwords.words('english'))

# Raw string literals: the previous plain strings relied on invalid escape
# sequences (\[ \s \- \/), which newer Python versions warn about. The compiled
# patterns are unchanged.
# Characters that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Double HTML line breaks, hyphens and slashes that are turned into a space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()

def preprocess_reviews(review):
    """Turn one raw review string into a space-joined string of stemmed tokens.

    The text is lower-cased, matches of ``REPLACE_NO_SPACE`` are deleted,
    matches of ``REPLACE_WITH_SPACE`` become a single space, the result is
    split with ``tokenizer``, tokens found in ``STOPWORDS`` are dropped and
    the remaining tokens are stemmed with ``stemmer``.
    """
    lowered = review.lower()
    without_punctuation = REPLACE_NO_SPACE.sub("", lowered)
    spaced = REPLACE_WITH_SPACE.sub(" ", without_punctuation)

    stems = []
    for token in tokenizer.tokenize(spaced):
        if token in STOPWORDS:
            continue
        stems.append(stemmer.stem(token))

    return " ".join(stems)


# Features are the preprocessed review texts; targets are the 0/1 sentiment labels.
x_train = train_data['review'].copy().apply(preprocess_reviews)
y_train = train_data['sentiment']
x_test = test_data['review'].copy().apply(preprocess_reviews)
# Fix: the test labels must come from the test split (they were read from
# train_data before, so evaluation compared predictions to the wrong labels).
y_test = test_data['sentiment']
# NOTE(review): these previews show the raw reviews, not the preprocessed
# x_train / x_test -- confirm that this is what should be displayed.
print(train_data['review'].head())
print(test_data['review'].head())

0    For a movie that gets no respect there sure ar...
1    Bizarre horror movie filled with famous faces ...
2    A solid, if unremarkable film. Matthau, as Ein...
3    It's a strange feeling to sit alone in a theat...
4    You probably all already know this by now, but...
Name: review, dtype: object
0    Based on an actual story, John Boorman shows t...
1    This is a gem. As a Film Four production - the...
2    I really like this show. It has drama, romance...
3    This is the best 3-D experience Disney has at ...
4    Of the Korean movies I've seen, only three had...
Name: review, dtype: object


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram bag-of-words over the preprocessed training reviews.
# Fix: max_df must be the float 1.0 (a proportion of documents, i.e. no upper
# cut-off). The integer 1 is an absolute document count and would discard every
# bigram that occurs in more than one review.
cv = CountVectorizer(analyzer='word', ngram_range=(2, 2), min_df=1, max_df=1.0)

movie_counts = cv.fit_transform(x_train)

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the transformer on the bigram count matrix, then apply it to the
# same matrix to obtain the re-weighted features.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(movie_counts)
movie_tfidf = tfidf_transformer.transform(movie_counts)

## Task 2

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# Fix: MultinomialNB was used below without being imported anywhere, which
# raises a NameError on a fresh kernel.
from sklearn.naive_bayes import MultinomialNB

# Bigram counts -> TF-IDF weighting -> multinomial naive Bayes.
# min_df=1 keeps every bigram, exactly like the former integer 0, but stays
# inside the range scikit-learn's parameter validation accepts for integers.
review_clf = Pipeline([('vect', CountVectorizer(analyzer='word', ngram_range=(2, 2), min_df=1)),
                       ('tfidf', TfidfTransformer()),
                       ('clf', MultinomialNB())])

# Hyper-parameter grid searched with 5-fold cross-validation on the training set.
parameters = {
    'vect__ngram_range': [(2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}
clf = GridSearchCV(review_clf, parameters, cv=5)
clf.fit(x_train, y_train)

# Per-class precision / recall / F1 of the fitted search on the test split.
print(classification_report(y_test, clf.predict(x_test), digits=4))

              precision    recall  f1-score   support

           0     0.8331    0.8936    0.8623     12500
           1     0.8853    0.8210    0.8519     12500

   micro avg     0.8573    0.8573    0.8573     25000
   macro avg     0.8592    0.8573    0.8571     25000
weighted avg     0.8592    0.8573    0.8571     25000



In [10]:
# Print accuracy and F1 of the tuned model on the test split.
from sklearn.metrics import accuracy_score, f1_score

predictions = clf.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print("Accuracy Score:", accuracy)
print("F1 Score", f1)

Accuracy Score: 0.85732
F1 Score 0.8519486987921804


## Task 3
Get some wrongly classified reviews that are no longer than 500 characters.

In [11]:
MAX_REVIEW_CHARS = 500   # only short reviews are inspected
N_WRONG_EXAMPLES = 5     # number of misclassified reviews to look at
SAMPLE_SEED = 42         # fixed seed so a re-run selects the same reviews

# Give the predictions the index of test_data so the column-wise concat pairs
# them by position even if test_data does not carry a default 0..n-1 index.
predicted_frame = pd.DataFrame(predictions, columns=["predicted"], index=test_data.index)
act_pred_frame = pd.concat([test_data["review"], test_data["sentiment"], predicted_frame["predicted"]], axis=1)
act_pred_frame = act_pred_frame[act_pred_frame["review"].str.len() <= MAX_REVIEW_CHARS]

# Keep the rows whose prediction disagrees with the true label, then draw a
# reproducible random sample (fewer rows if not enough are available).
wrong_class = act_pred_frame[act_pred_frame["sentiment"] != act_pred_frame["predicted"]]
sampled_wrong_class = wrong_class.sample(n=min(N_WRONG_EXAMPLES, len(wrong_class)),
                                         random_state=SAMPLE_SEED)
print(sampled_wrong_class)

                                                  review  sentiment  predicted
11333  This is my favorite Mel Brooks movie because i...          1          0
23618  To begin with its a rip off of the Japanese fi...          0          1
738    I just wanted to say that. I love Gheorghe Mur...          1          0
21719  This movie is overrated, to say the least. It'...          0          1
12114  Fate puts a pair of priceless items in Ernest'...          1          0


In [12]:
# Use LIME to see which tokens pushed each sampled review towards the wrong class.
from lime import lime_text
from sklearn.pipeline import make_pipeline

from lime.lime_text import LimeTextExplainer

# Order follows the label encoding used for the data (0 = neg, 1 = pos).
class_names = ['negative','positive']
explainer = LimeTextExplainer(class_names=class_names)

# Fix: make sure the output directory exists; saving the HTML report fails
# otherwise on a fresh checkout.
lime_dir = 'lime'
os.makedirs(lime_dir, exist_ok=True)

for index in sampled_wrong_class.index:
    # Explain the preprocessed text, i.e. exactly what the classifier was given.
    exp = explainer.explain_instance(x_test.loc[index], clf.predict_proba)
    path = os.path.join(lime_dir, str(index) + '.html')
    exp.save_to_file(path, text=False)

## Predict unlabeled reviews (optional)

In [14]:
unlabeled_folder = 'data/train/unsup'
path = os.path.join(unlabeled_folder)

# Read all files into a list first and build the frame once: DataFrame.append
# was removed in pandas 2.0 and re-copied the whole frame on every call.
unlabeled_texts = []
for file in os.listdir(path):
    with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
        unlabeled_texts.append(infile.read())

# One row per file; naming the column here also works for an empty folder.
unlabeled_data = pd.DataFrame({'review': unlabeled_texts})

In [16]:
# Clean the unlabeled reviews with the same function used for the labeled
# splits, then classify them with the fitted grid search.
unlabeled_reviews = unlabeled_data['review'].copy()
x_unlabeled = unlabeled_reviews.apply(preprocess_reviews)
unlbl_predictions = clf.predict(x_unlabeled)

In [17]:
# Single-column frame holding the predicted label of every unlabeled review.
unlbl_predictions_frame = pd.DataFrame(data=unlbl_predictions, columns=["predicted"])

# NOTE(review): this review + prediction frame is built but never written or
# displayed, and it reuses the name act_pred_frame from Task 3 -- confirm
# whether the CSV below was meant to contain the review text as well.
act_pred_frame = pd.concat(
    [unlabeled_data["review"], unlbl_predictions_frame["predicted"]],
    axis=1,
)

# Only the predictions are exported.
unlbl_predictions_frame.to_csv(
    'data/unlabeled_predictions.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)