Import the dataset '**imdb-reviews.csv**' from Google Drive and parse it with pandas.


In [None]:
# Mount Google Drive at /content/drive; the dataset path configured below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# imports
import pandas
import re
import nltk
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
nltk.download('omw-1.4')
from textblob import Word 
from collections import Counter
import operator
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import learning_curve
import numpy as np



In [None]:
# Tab-separated training dataset on the mounted Drive.
dataset_path = '/content/drive/MyDrive/imdb-reviews.csv'
# Optional path to a separate test set; while this is None the final evaluation cell does nothing.
testset_path = None

In [None]:
# Load the tab-separated dataset with pandas' Python parsing engine.
df = pandas.read_csv(dataset_path, sep='\t', engine='python')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary of the frame's columns (dtypes and non-null counts).
df.info()

Clean the review text. The steps are:
1.   Remove *HTML* tags
2.   Remove *numbers*
3.   Remove *punctuation*
4.   Convert *uppercase* to lowercase
5.   Remove *stopwords*
6.   Apply *lemmatization*



In [None]:
# HTML

def remove_html(text):
  """Replace every HTML tag in ``text`` with a single space.

  A space (not the empty string) is substituted so that words separated only
  by markup, e.g. ``"good.<br /><br />The"``, are not glued together into one
  token by the later punctuation-removal and tokenization steps.

  Args:
    text: the raw review string.

  Returns:
    The string with each ``<...>`` tag replaced by ``' '``.
  """
  return re.sub('<.*?>', ' ', text)

def clean_html(df):
  """Strip HTML tags from the 'review' column.

  The column is overwritten on the frame that was passed in; the same frame
  is returned after printing its first rows.
  """
  df['review'] = df['review'].apply(remove_html)
  print(df.head())
  return df

In [None]:
# Numbers

def clean_numbers(df):
  """Delete every run of digits from the 'review' column.

  The column is overwritten on the frame that was passed in; the same frame
  is returned after printing its first rows.
  """
  reviews = df['review']
  df['review'] = reviews.str.replace(pat=r'\d+', repl='', regex=True)
  print(df.head())
  return df


In [None]:
# Punctuation

def clean_punctuation(df):
  """Delete every character that is neither a word character nor whitespace.

  Works on the 'review' column, which is overwritten on the frame that was
  passed in; the same frame is returned after printing its first rows.
  """
  reviews = df['review']
  df['review'] = reviews.str.replace(pat=r'[^\w\s]+', repl='', regex=True)
  print(df.head())
  return df

In [None]:
# Uppercase

def clean_uppercase(df):
  """Lower-case the 'review' column.

  The column is overwritten on the frame that was passed in; the same frame
  is returned after printing its first rows.
  """
  lowered = df['review'].str.lower()
  df['review'] = lowered
  print(df.head())
  return df

Before we perform the remaining steps (stopword removal and lemmatization), we need to tokenize each review into words.

In [None]:
# Tokenization

def clean_tokenize(df):
  """Replace each review string with the token list produced by ``word_tokenize``.

  The 'review' column is overwritten on the frame that was passed in; the
  same frame is returned after printing its first rows.
  """
  tokenized = df['review'].apply(word_tokenize)
  df['review'] = tokenized
  print(df.head())
  return df

In [None]:
# Stopwords

pattern = stopwords.words('english')
# Set view of the stopword list: membership is tested once per token over the
# whole corpus, so constant-time lookup replaces a linear scan of the list.
_stopword_set = frozenset(pattern)

def clean_stopwords(df):
  """Drop English stopwords from every tokenized review.

  Expects each cell of the 'review' column to be an iterable of tokens. The
  column is overwritten on the frame that was passed in; the same frame is
  returned after printing its first rows.

  NOTE(review): inside transform() this runs after clean_punctuation, so any
  stopword-list entry containing an apostrophe can no longer match a token --
  confirm whether that is intended.
  """
  df['review'] = df['review'].apply(lambda words: [w for w in words if w not in _stopword_set])
  print(df.head())
  return df

In [None]:
# Lemmatization

lemmatizer = nltk.stem.WordNetLemmatizer()

def clean_lemmatize(df):
  """Lemmatize every token of every review under several part-of-speech tags.

  Each token is passed through the WordNet lemmatizer once per tag, in the
  order 'n', 'a', 'v', 'r', 's'; the output of one pass feeds the next. The
  'review' column is overwritten on the frame that was passed in; the same
  frame is returned after printing its first rows.
  """
  def lemmatize_all_pos(token):
    # Chain the passes: the result for one tag is the input for the next.
    for pos in ('n', 'a', 'v', 'r', 's'):
      token = lemmatizer.lemmatize(token, pos)
    return token

  df['review'] = df['review'].apply(lambda tokens: [lemmatize_all_pos(t) for t in tokens])
  print(df.head())
  return df



Now we transform the rating into a binary label: 0 for negative (ratings in the range [0, 4.0]) and 1 for positive (ratings in the range [7.0, 10.0]).

In [None]:
def clean_scale(df):
  """Binarize the 'rating' column: values <= 4.0 become 0, everything else 1.

  The column is overwritten on the frame that was passed in; the same frame
  is returned after printing its first rows. Running this twice on the same
  frame maps every label to 0, because both 0 and 1 are <= 4.0.
  """
  def to_label(score):
    if score <= 4.0:
      return 0
    return 1

  df['rating'] = df['rating'].apply(to_label)
  print(df.head())
  return df

Now we remove the 10 most frequent and the 10 least frequent words of the corpus:

In [None]:
# rarewords

def clean_rarewords(df, n_common=10, n_rare=10):
  """Remove the most and the least frequent corpus words from every review.

  Word frequencies are counted over all token lists in the 'review' column.
  The ``n_common`` highest-ranked and the ``n_rare`` lowest-ranked words
  (ties keep first-seen order, as returned by ``Counter.most_common``) are
  then filtered out of each review.

  Args:
    df: frame whose 'review' column holds lists of tokens.
    n_common: how many of the most frequent words to drop (default 10).
    n_rare: how many of the least frequent words to drop (default 10).

  Returns:
    The same frame, with 'review' overwritten, after printing its first rows.
  """
  word_counts = Counter(word for review in df['review'] for word in review)
  ranked = [word for word, _ in word_counts.most_common()]

  # A set gives constant-time lookups in the per-token filter below.
  dropped = set(ranked[:n_common])
  if n_rare > 0:
    # Guarded because ranked[-0:] would select the whole vocabulary.
    dropped.update(ranked[-n_rare:])

  df['review'] = df['review'].apply(lambda words: [x for x in words if x not in dropped])
  print(df.head())
  return df

In [None]:
def transform(df):
  """Run the full cleaning pipeline on ``df`` and return the result.

  The steps run in a fixed order: HTML, numbers, punctuation, lower-casing,
  tokenization, stopwords, lemmatization, rating binarization, and finally
  removal of the most/least frequent words. Each step receives the frame
  returned by the previous one.
  """
  pipeline = (
      clean_html,
      clean_numbers,
      clean_punctuation,
      clean_uppercase,
      clean_tokenize,
      clean_stopwords,
      clean_lemmatize,
      clean_scale,
      clean_rarewords,
  )
  for step in pipeline:
    df = step(df)
  return df

In [None]:
# Clean the raw frame. Not re-runnable on its own: after this cell 'review' holds token lists,
# while the first pipeline step applies a regex substitution that expects strings.
df = transform(df)

The following experiment was added after all the experiments below had been completed; it reuses the logistic-regression hyperparameters selected there.



In [None]:
# Learning curves of the selected model while doubling the TF-IDF vocabulary size.
# The joined review text and the labels do not depend on `features`, so they are
# built once here instead of on every loop iteration.
review_texts = df['review'].apply(lambda x: ' '.join(x))
Y = df['rating']

features = 64

while features <= 16500:
  tfidf = TfidfVectorizer(max_features=features)
  X = tfidf.fit_transform(review_texts)

  train_sizes, train_scores, validation_scores = learning_curve(
      estimator=LogisticRegression(solver='newton-cg', penalty='l2', C=3, max_iter=100),
      X=X, y=Y, train_sizes=np.linspace(0.0005, 0.999, 10), cv=10)

  print("Number of features: ", features)
  # Average the per-fold scores for each training size.
  train_scores_mean = train_scores.mean(axis=1)
  validation_scores_mean = validation_scores.mean(axis=1)

  plt.plot(train_sizes, train_scores_mean, label = 'Training score')
  plt.plot(train_sizes, validation_scores_mean, label = 'Validation score')

  # Title each figure so it can be read without the printed line above it.
  plt.title('Learning curve (%d TF-IDF features)' % features)
  plt.ylabel('Score')
  plt.xlabel('Training size')
  plt.legend()
  plt.show()

  features *= 2


In [None]:
# Class balance of the binary labels. clean_scale maps ratings <= 4.0 to 0 (negative)
# and everything else to 1 (positive), so positive reviews are the rows with rating == 1.
positive_reviews = df[df['rating'] == 1]
negative_reviews = df[df['rating'] == 0]
labels = ['Positive', 'Negative']

# Slice sizes are listed in the same order as `labels`.
sizes = [positive_reviews['rating'].count(), negative_reviews['rating'].count()]
colors = ["crimson", "lightsteelblue"]

# Pull the second slice slightly out of the pie.
explode = (0, 0.1)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.3f%%',
        shadow=True, startangle=45)
ax1.axis('equal')
plt.show()




Now we will start training

In [None]:
# Logistic regression

def do_Logistic_Regression(X_train, y_train, X_test, y_test, max_iterations):
  """Cross-validate, fit and evaluate a LogisticRegression model.

  Prints the mean 10-fold cross-validation score on the training data, fits
  the model on the whole training split, and prints a classification report
  for the test split.

  Args:
    X_train, y_train: training features and labels.
    X_test, y_test: held-out features and labels.
    max_iterations: forwarded to LogisticRegression as ``max_iter``.

  Returns:
    A ``(f1, scores_mean)`` tuple: the F1 score on the test split and the
    mean cross-validation score.
  """
  classifier = LogisticRegression(max_iter=max_iterations)

  mean_cv_score = cross_val_score(classifier, X_train, y_train, cv=10, n_jobs=-1).mean()
  print(mean_cv_score)

  classifier.fit(X_train, y_train)
  predictions = classifier.predict(X_test)
  print(classification_report(y_test, predictions))

  return f1_score(y_test, predictions), mean_cv_score

In [None]:
# TFIDF

def do_TFIDF(train_percent, features):
  """Train and evaluate logistic regression on TF-IDF features.

  Reads the cleaned global ``df``. The review text is split into train and
  test parts first and the vectorizer is fitted on the training part only,
  so the vocabulary and IDF weights are not influenced by the test reviews.

  Args:
    train_percent: fraction of the data used for training.
    features: ``max_features`` for the TfidfVectorizer.

  Returns:
    ``(f1, scores_mean)`` as returned by do_Logistic_Regression.
  """
  texts = df['review'].apply(lambda x: ' '.join(x))
  Y = df['rating']

  text_train, text_test, y_train, y_test = train_test_split(texts, Y, train_size = train_percent, random_state = 13, stratify = Y)

  tfidf = TfidfVectorizer(max_features=features)
  X_train = tfidf.fit_transform(text_train)
  # Only transform the test text: it must be projected onto the training vocabulary.
  X_test = tfidf.transform(text_test)

  f1, scores_mean = do_Logistic_Regression(X_train, y_train, X_test, y_test, 400)

  return f1, scores_mean

In [None]:
# Count Vectorizer

def do_Count_Vectorizer(train_percent, features):
  """Train and evaluate logistic regression on 1- to 3-gram count features.

  Reads the cleaned global ``df``. The review text is split into train and
  test parts first and the vectorizer is fitted on the training part only,
  so the selected vocabulary is not influenced by the test reviews.

  Args:
    train_percent: fraction of the data used for training.
    features: ``max_features`` for the CountVectorizer.

  Returns:
    ``(f1, scores_mean)`` as returned by do_Logistic_Regression.
  """
  texts = df['review'].apply(lambda x: ' '.join(x))
  Y = df['rating']

  text_train, text_test, y_train, y_test = train_test_split(texts, Y, train_size = train_percent, random_state = 13, stratify = Y)

  vectorizer = CountVectorizer(max_features=features, ngram_range=(1,3))
  X_train = vectorizer.fit_transform(text_train)
  # Only transform the test text: it must be projected onto the training vocabulary.
  X_test = vectorizer.transform(text_test)

  f1, scores_mean = do_Logistic_Regression(X_train, y_train, X_test, y_test, 400)
  return f1, scores_mean

In [None]:
# Hashing Vectorizer

def do_Hashing_Vectorizer(train_percent, features):
  """Train and evaluate logistic regression on hashed bag-of-words features.

  Reads the cleaned global ``df``, vectorizes the joined review tokens with a
  HashingVectorizer, makes a stratified train/test split, and delegates to
  do_Logistic_Regression.

  Args:
    train_percent: fraction of the data used for training.
    features: ``n_features`` for the HashingVectorizer.

  Returns:
    ``(f1, scores_mean)`` as returned by do_Logistic_Regression.
  """
  Y = df['rating']
  documents = df['review'].apply(lambda tokens: ' '.join(tokens))
  hashed = HashingVectorizer(n_features=features).fit_transform(documents)

  X_train, X_test, y_train, y_test = train_test_split(
      hashed, Y, train_size=train_percent, random_state=13, stratify=Y)

  return do_Logistic_Regression(X_train, y_train, X_test, y_test, 400)

We will test these three vectorizers with different training-set sizes:

In [None]:
# Testing TFIDF
# Sweep the training fraction upward from 0.2 in steps of 0.15 while it stays <= 0.99,
# with max_features fixed at 40000.
xs, scores, f1s = [], [], []

train_percent = 0.2
while train_percent <= 0.99:
  f1, mean_score = do_TFIDF(train_percent, 40000)

  xs += [train_percent]
  scores += [mean_score]
  f1s += [f1]

  train_percent = train_percent + 0.15

In [None]:
# Show the results collected by the sweep cell above:
# mean cross-validation score and test F1 against the training fraction.
print(xs)
print(scores)
plt.plot(xs, scores, marker="x")
plt.plot(xs, f1s, marker="*")
# Legend entries follow the order of the two plot calls.
plt.legend(["Mean score", "F1 score"])
plt.xlabel("train percent")
plt.ylabel("score")
plt.show()

In [None]:
# Testing Count Vectorizer
# Sweep the training fraction upward from 0.2 in steps of 0.15 while it stays <= 0.99,
# with max_features fixed at 40000.
xs, scores, f1s = [], [], []

train_percent = 0.2
while train_percent <= 0.99:
  f1, mean_score = do_Count_Vectorizer(train_percent, 40000)

  xs += [train_percent]
  scores += [mean_score]
  f1s += [f1]

  train_percent = train_percent + 0.15

In [None]:
# Show the results collected by the sweep cell above:
# mean cross-validation score and test F1 against the training fraction.
print(xs)
print(scores)
plt.plot(xs, scores, marker="x")
plt.plot(xs, f1s, marker="*")
# Legend entries follow the order of the two plot calls.
plt.legend(["Mean score", "F1 score"])
plt.xlabel("train percent")
plt.ylabel("score")
plt.show()

In [None]:
# Testing Hashing Vectorizer
# Sweep the training fraction upward from 0.2 in steps of 0.15 while it stays <= 0.99,
# with n_features fixed at 40000.
xs, scores, f1s = [], [], []

train_percent = 0.2
while train_percent <= 0.99:
  f1, mean_score = do_Hashing_Vectorizer(train_percent, 40000)

  xs += [train_percent]
  scores += [mean_score]
  f1s += [f1]

  train_percent = train_percent + 0.15

In [None]:
# Show the results collected by the sweep cell above:
# mean cross-validation score and test F1 against the training fraction.
print(xs)
print(scores)
plt.plot(xs, scores, marker="x")
plt.plot(xs, f1s, marker="*")
# Legend entries follow the order of the two plot calls.
plt.legend(["Mean score", "F1 score"])
plt.xlabel("train percent")
plt.ylabel("score")
plt.show()

We will test the algorithms with different max features values.
We will use powers of 2 for possible values.

In [None]:
# Testing TFIDF
# Double max_features from 1024 upward while it stays <= 40000, at a fixed 0.80 training fraction.
xs, scores, f1s = [], [], []

features = 1024
while features <= 40000:
  f1, mean_score = do_TFIDF(0.80, features)

  xs += [features]
  scores += [mean_score]
  f1s += [f1]

  features = features * 2

In [None]:
# Show the results collected by the sweep cell above:
# mean cross-validation score and test F1 against the number of features.
print(xs)
print(scores)
plt.plot(xs, scores, marker="x")
plt.plot(xs, f1s, marker="*")
# Legend entries follow the order of the two plot calls.
plt.legend(["Mean score", "F1 score"])
plt.xlabel("features")
plt.ylabel("score")
plt.show()

In [None]:
# Testing Count Vectorizer
# Double max_features from 1024 upward while it stays <= 40000, at a fixed 0.80 training fraction.
xs, scores, f1s = [], [], []

features = 1024
while features <= 40000:
  f1, mean_score = do_Count_Vectorizer(0.80, features)

  xs += [features]
  scores += [mean_score]
  f1s += [f1]

  features = features * 2

In [None]:
# Show the results collected by the sweep cell above:
# mean cross-validation score and test F1 against the number of features.
print(xs)
print(scores)
plt.plot(xs, scores, marker="x")
plt.plot(xs, f1s, marker="*")
# Legend entries follow the order of the two plot calls.
plt.legend(["Mean score", "F1 score"])
plt.xlabel("features")
plt.ylabel("score")
plt.show()

In [None]:
# Testing Hashing Vectorizer
# Double n_features from 1024 upward while it stays <= 40000, at a fixed 0.80 training fraction.
xs, scores, f1s = [], [], []

features = 1024
while features <= 40000:
  f1, mean_score = do_Hashing_Vectorizer(0.80, features)

  xs += [features]
  scores += [mean_score]
  f1s += [f1]

  features = features * 2

In [None]:
# Show the results collected by the sweep cell above:
# mean cross-validation score and test F1 against the number of features.
print(xs)
print(scores)
plt.plot(xs, scores, marker="x")
plt.plot(xs, f1s, marker="*")
# Legend entries follow the order of the two plot calls.
plt.legend(["Mean score", "F1 score"])
plt.xlabel("features")
plt.ylabel("score")
plt.show()

In [None]:
# Grid search over LogisticRegression hyperparameters on TF-IDF features (F1, 10-fold CV).
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l1', 'l2']
c_values = [7, 5, 3, 0.1, 0.01]
m_iter = [100, 1000]

# 'newton-cg' and 'lbfgs' do not support the l1 penalty. A single cartesian grid would
# fit those combinations anyway, fail, and record them with score 0 (error_score=0).
# Two sub-grids restrict the search to solver/penalty pairs that can actually be fitted.
l2_only_solvers = [s for s in solvers if s != 'liblinear']
grid = [
    dict(solver=l2_only_solvers, penalty=['l2'], C=c_values, max_iter=m_iter),
    dict(solver=['liblinear'], penalty=penalty, C=c_values, max_iter=m_iter),
]
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='f1', error_score=0)

X = df['review'].apply(lambda x: ' '.join(x))
Y = df['rating']

tfidf = TfidfVectorizer(max_features=16500)
X = tfidf.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size = 0.80, random_state = 13, stratify = Y)

# Only the training split is searched; the test split is not used in this cell.
grid_result = grid_search.fit(X_train, y_train)


In [None]:
# Report the best configuration, then the mean and standard deviation of the
# cross-validation score for every parameter combination that was tried.
print("Best result: %f using parameters: %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

From now on we will use train_percent = 0.80 (Pareto principle).

For our vectorizer we will use TFIDF (it proved to be the best of the three methods tested).

For TFIDF we will use max_features = 16500 (close to best).

We will use LogisticRegression with parameters: {'C': 3, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}

In [None]:
# Confusion matrix

# Call the scorer through the module. The result is stored under the name
# `confusion_matrix` (the plotting cell reads it), which rebinds the function imported at
# the top of the notebook; going through the module keeps this cell re-runnable.
from sklearn import metrics as sk_metrics

X = df['review'].apply(lambda x: ' '.join(x))
Y = df['rating']

tfidf = TfidfVectorizer(max_features=16500)
# X stays bound to the full TF-IDF matrix; the learning-curve cell further down uses it.
X = tfidf.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.80, random_state=13, stratify=Y)

LR = LogisticRegression(solver='newton-cg', penalty='l2', C=3, max_iter=100)
LR.fit(X_train, y_train)
y_test_predict = LR.predict(X_test)

confusion_matrix = sk_metrics.confusion_matrix(y_test, y_test_predict)

In [None]:
# Print the matrix computed in the previous cell and render it as a colour-coded image.
print(confusion_matrix)

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(confusion_matrix)
# Colour bar mapping cell colours to counts.
fig.colorbar(cax)
plt.show()

*   True positive: 3977
*   False positive: 523
*   True negative: 4051
*   False negative: 451









In [None]:
# Learning curve of the selected model: 20 training sizes, 10-fold cross-validation.
# X and Y are the TF-IDF matrix and labels left over from the confusion-matrix cell.
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=LogisticRegression(solver='newton-cg', penalty='l2', C=3, max_iter=100),
    X=X,
    y=Y,
    train_sizes=np.linspace(0.0005, 0.999, 20),
    cv=10,
)

print('Training scores:\n\n', train_scores)

In [None]:
# Average the per-fold scores for each training size.
train_scores_mean = train_scores.mean(axis=1)
validation_scores_mean = validation_scores.mean(axis=1)

# Training and validation curves on the same axes.
plt.plot(train_sizes, train_scores_mean, label = 'Training score')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation score')

plt.ylabel('Score')
plt.xlabel('Training size')
plt.legend()
plt.show()

In [None]:
# Final evaluation: retrain on the full dataset and score a separate test set.
# Runs only when `testset_path` has been set.
if testset_path is not None:
  # Training
  # Reload and re-clean from disk so this cell does not depend on the state of `df`.
  df = pandas.read_csv(dataset_path, sep='\t', engine='python')
  df = transform(df)
  X = df['review'].apply(lambda x: ' '.join(x))
  Y = df['rating']

  tfidf = TfidfVectorizer(max_features=16500)
  X = tfidf.fit_transform(X)

  LR = LogisticRegression(solver='newton-cg', penalty='l2', C=3, max_iter=100)
  LR.fit(X, Y)

  # Testing
  df_test = pandas.read_csv(testset_path, sep='\t', engine='python')
  # NOTE(review): transform() ends with clean_rarewords, which here picks the words to
  # drop from the test corpus' own frequencies rather than the training corpus' --
  # confirm whether that is acceptable.
  df_test = transform(df_test)
  X_test = df_test['review'].apply(lambda x: ' '.join(x))
  Y_test = df_test['rating']

  # Reuse the vectorizer fitted on the training data. Fitting a new one on the test set
  # would build a different vocabulary, so its columns would not correspond to the
  # features the model was trained on.
  X_test = tfidf.transform(X_test)

  test_predict = LR.predict(X_test)
  print(classification_report(Y_test, test_predict))
