In [41]:
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [4]:
# CSV - Comma Separated Values - id,name,age
# TSV - Tab Separated Values - id   name   age
imdb_reviews = pd.read_csv('reviews/imdb_labelled.txt', sep="\t", header=None)

In [5]:
imdb_reviews.head()

Unnamed: 0,0,1
0,A very very very slow moving aimless movie abo...,0
1,Not sure who was more lost the flat character...,0
2,Attempting artiness with black white and clev...,0
3,Very little music or anything to speak of,0
4,The best scene in the movie was when Gerardo i...,1


In [6]:
amazon_reviews = pd.read_csv('reviews/amazon_cells_labelled.txt', sep="\t", header=None)
yelp_reviews = pd.read_csv('reviews/yelp_labelled.txt', sep="\t", header=None)

In [7]:
imdb_reviews.shape

(1000, 2)

In [8]:
amazon_reviews.shape

(1000, 2)

In [9]:
yelp_reviews.shape

(1000, 2)

In [10]:
df = pd.DataFrame()
df = df.append(imdb_reviews).append(yelp_reviews).append(amazon_reviews)

  df = df.append(imdb_reviews).append(yelp_reviews).append(amazon_reviews)


In [11]:
df.shape

(3000, 2)

In [12]:
df.head()

Unnamed: 0,0,1
0,A very very very slow moving aimless movie abo...,0
1,Not sure who was more lost the flat character...,0
2,Attempting artiness with black white and clev...,0
3,Very little music or anything to speak of,0
4,The best scene in the movie was when Gerardo i...,1


In [15]:
df.columns = ['Review', 'Sentiment']

In [16]:
df.head()

Unnamed: 0,Review,Sentiment
0,A very very very slow moving aimless movie abo...,0
1,Not sure who was more lost the flat character...,0
2,Attempting artiness with black white and clev...,0
3,Very little music or anything to speak of,0
4,The best scene in the movie was when Gerardo i...,1


In [25]:
def textProcessing(df):
    table = str.maketrans('','',string.punctuation)
    for i in range(len(df)):
        df['Review'].iloc[i] = df['Review'].iloc[i].lower().translate(table)
    
    documents = []
    # word tokenization
    for i in range(len(df)):
        documents.append(word_tokenize(df['Review'].iloc[i]))
        
    englishStopwords = stopwords.words("english")
    words = []
    for tokens in documents:
        word = []
        for i in range(len(tokens)):
            if tokens[i] not in englishStopwords:
                word.append(tokens[i])
        words.append(word)
        
    wnet = WordNetLemmatizer()
    for i in range(len(words)):
        for j in range(len(words[i])):
            words[i][j] = wnet.lemmatize(words[i][j], 'v')
            
    for i in range(len(words)):
        words[i] = " ".join(words[i])
    
    return words

In [26]:
words = textProcessing(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Review'].iloc[i] = df['Review'].iloc[i].lower().translate(table)


In [28]:
words[:5]

['slow move aimless movie distress drift young man',
 'sure lose flat character audience nearly half walk',
 'attempt artiness black white clever camera angle movie disappoint become even ridiculous act poor plot line almost nonexistent',
 'little music anything speak',
 'best scene movie gerardo try find song keep run head']

In [29]:
len(words)

3000

In [30]:
tfidf = TfidfVectorizer()

In [31]:
vector = tfidf.fit_transform(words)

In [32]:
vector

<3000x4531 sparse matrix of type '<class 'numpy.float64'>'
	with 18125 stored elements in Compressed Sparse Row format>

In [33]:
vector = vector.toarray()

In [34]:
vector

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
x_train, x_test, y_train, y_test = train_test_split(vector, df['Sentiment'], test_size=0.25)

In [36]:
x_train.shape

(2250, 4531)

In [37]:
x_test.shape

(750, 4531)

In [38]:
logistic = LogisticRegression()
logistic.fit(x_train, y_train)

LogisticRegression()

In [39]:
y_pred = logistic.predict(x_test)

In [40]:
accuracy_score(y_test, y_pred)

0.8013333333333333

In [42]:
nb = MultinomialNB()
nb.fit(x_train, y_train)

MultinomialNB()

In [43]:
y_pred = nb.predict(x_test)
accuracy_score(y_test, y_pred)

0.8266666666666667