In [1]:
import string
import pickle
import pathlib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
def preprocessing(text):
    tokens = word_tokenize(text)

    tokens = [token for token in tokens if token not in string.punctuation]

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

In [3]:
BASE_DIR = pathlib.Path().resolve(strict=True).parent

CSV_DIR = BASE_DIR / 'csv'
CSV_FILE = CSV_DIR / 'dataset.csv'

MODEL_DIR = BASE_DIR / 'model'
VECTOR_DIR = BASE_DIR / 'vectorizer'

MODEL_FILE = MODEL_DIR / 'model.pkl'
VECTOR_FILE = VECTOR_DIR / 'vectorizer.pkl'

MODEL_DIR.mkdir(parents=True, exist_ok=True)
VECTOR_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
df = pd.read_csv(CSV_FILE)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.loc[df['sentiment'] == 'positive', 'sentiment'] = 1
df.loc[df['sentiment'] == 'negative', 'sentiment'] = 0
print(df)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...         1
1      A wonderful little production. <br /><br />The...         1
2      I thought this was a wonderful way to spend ti...         1
3      Basically there's a family where a little boy ...         0
4      Petter Mattei's "Love in the Time of Money" is...         1
...                                                  ...       ...
49995  I thought this movie did a down right good job...         1
49996  Bad plot, bad dialogue, bad acting, idiotic di...         0
49997  I am a Catholic taught in parochial elementary...         0
49998  I'm going to have to disagree with the previou...         0
49999  No one expects the Star Trek movies to be high...         0

[50000 rows x 2 columns]


In [6]:
reviews = df['review']
processed_reviews = reviews.apply(lambda i: preprocessing(i))
df['processed'] = processed_reviews
df.head()

Unnamed: 0,review,sentiment,processed
0,One of the other reviewers has mentioned that ...,1,"[One, reviewer, mentioned, watching, 1, Oz, ep..."
1,A wonderful little production. <br /><br />The...,1,"[A, wonderful, little, production, br, br, The..."
2,I thought this was a wonderful way to spend ti...,1,"[I, thought, wonderful, way, spend, time, hot,..."
3,Basically there's a family where a little boy ...,0,"[Basically, 's, family, little, boy, Jake, thi..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"[Petter, Mattei, 's, ``, Love, Time, Money, ''..."


In [7]:
x = df['processed']
y = df['sentiment'].astype(int)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
x_train = [' '.join(review) for review in x_train]
x_test = [' '.join(review) for review in x_test]

In [9]:
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [10]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
y_pred = lr.predict(x_test)

In [12]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8837333333333334


In [13]:
with open(MODEL_FILE, 'wb') as f:
    pickle.dump(lr, f)

In [14]:
with open(VECTOR_FILE, 'wb') as f:
    pickle.dump(vectorizer,f)