<a href="https://colab.research.google.com/github/raj-vijay/ml/blob/master/02.Linear%20Classifiers/08_Logistic_regression_and_regularization_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Sentiment analysis for movie reviews**

In [None]:
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

  import pandas.util.testing as tm


**Large Movie Review Dataset**

This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. It provides a set of 25,000 highly polar movie reviews for training and 25,000 for testing.

Additional unlabeled data is available as well. Both raw-text and already-processed bag-of-words formats are provided. See the README file contained in the release for more details.

In [None]:
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  """Read every review file found in `directory` into a DataFrame.

  File names are expected to start with ``<id>_<rating>.txt``; the rating is
  taken from the file name. Directory entries that do not match that pattern
  are skipped instead of aborting the whole load.

  Args:
    directory: Path of the folder that holds the review text files.

  Returns:
    A DataFrame with a "sentence" column (the full file contents) and a
    "sentiment" column (the rating captured from the file name, kept as a
    string). The frame is empty when no file name matches.
  """
  # Raw string: "\d" inside a normal string literal is an invalid escape.
  name_pattern = re.compile(r"\d+_(\d+)\.txt")
  data = {"sentence": [], "sentiment": []}
  # Sorted so the row order does not depend on the filesystem listing order.
  for file_name in sorted(os.listdir(directory)):
    match = name_pattern.match(file_name)
    if match is None:
      # Calling .group() on a failed match would raise AttributeError.
      continue
    with tf.io.gfile.GFile(os.path.join(directory, file_name), "r") as f:
      data["sentence"].append(f.read())
    data["sentiment"].append(match.group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory, random_state=None):
  """Load the "pos" and "neg" sub-folders of `directory` as one shuffled frame.

  Args:
    directory: Folder that contains a "pos" and a "neg" sub-folder of reviews.
    random_state: Optional seed forwarded to `DataFrame.sample` so the shuffle
      can be reproduced. The default (None) keeps the shuffle unseeded.

  Returns:
    A DataFrame holding the rows loaded from both sub-folders plus a
    "polarity" column (1 for rows from "pos", 0 for rows from "neg"), in
    shuffled order with a fresh 0..n-1 index.
  """
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  combined = pd.concat([pos_df, neg_df])
  # frac=1 draws every row exactly once, i.e. a full shuffle.
  shuffled = combined.sample(frac=1, random_state=random_state)
  return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  """Download the Large Movie Review archive and load its train/test splits.

  Args:
    force_download: Kept for interface compatibility. NOTE(review): this
      function does not read the flag, so it has no effect on whether the
      archive is fetched again.

  Returns:
    A `(train_df, test_df)` tuple, each produced by `load_dataset` from the
    "train" and "test" folders of the extracted "aclImdb" directory.
  """
  dataset = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz",
      # HTTPS so the archive is not fetched over an unencrypted connection.
      origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
      extract=True)

  # Default layout: `dataset` is the archive path and the extracted "aclImdb"
  # folder sits next to it.
  data_root = os.path.join(os.path.dirname(dataset), "aclImdb")
  # NOTE(review): newer Keras releases appear to return the extraction
  # directory instead of the archive path -- confirm against the installed
  # version. If "aclImdb" exists inside the returned path, prefer it.
  nested_root = os.path.join(dataset, "aclImdb")
  if os.path.isdir(nested_root):
    data_root = nested_root

  train_df = load_dataset(os.path.join(data_root, "train"))
  test_df = load_dataset(os.path.join(data_root, "test"))

  return train_df, test_df

# Reduce logging output: only absl messages at ERROR level are shown.
logging.set_verbosity(logging.ERROR)

# Download the archive and build the shuffled train/test DataFrames.
train_df, test_df = download_and_load_datasets()
# Last expression of the cell: preview the first rows of the training frame.
train_df.head()

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


Unnamed: 0,sentence,sentiment,polarity
0,This scared the hell out of me when i was a te...,4,0
1,"After watching Awake,I led to a conclusion:dir...",4,0
2,........and an extremely bad one at that!!! Ho...,1,0
3,"The Japanese ""Run Lola Run,"" his is one offbea...",9,1
4,"I am a lover of B movies, give me a geneticall...",1,0


In [None]:
# Sanity check: (rows, columns) of the training frame.
train_df.shape

(25000, 3)

In [None]:
# Sanity check: (rows, columns) of the test frame.
test_df.shape

(25000, 3)

In [None]:
# Raw review text of each split.
# NOTE(review): X_train / X_test are rebound to TF-IDF matrices in the
# vectorizer cell further down, so these bindings are only temporary.
X_train = train_df['sentence']
X_test = test_df['sentence']

In [None]:
# Binary targets: load_dataset sets polarity to 1 for rows loaded from "pos"
# and 0 for rows loaded from "neg".
y_train = train_df['polarity']
y_test = test_df['polarity']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training reviews only.
v = TfidfVectorizer()
X_train = v.fit_transform(train_df['sentence'])
# Reuse the fitted vectorizer for the test split. Refitting on the test data
# would build a different vocabulary, so the test matrix would no longer line
# up column-for-column with the features the models are trained on, and `v`
# would stop matching X_train.
X_test = v.transform(test_df['sentence'])

In [None]:
def get_features(review):
    """Vectorize a single review with the notebook-level vectorizer `v`.

    The review is wrapped in a one-element list before it is handed to
    `v.transform`; whatever that call returns is passed back unchanged.
    """
    single_document = [review]
    features = v.transform(single_document)
    return features

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Two models that differ only in C, scikit-learn's inverse regularization
# strength: a large C regularizes weakly, a small C regularizes strongly.
# max_iter is raised above the default of 100 because the lbfgs solver stopped
# at the iteration limit before converging on these features. Increase it
# further if the convergence warning still appears.
lr_weak_reg = LogisticRegression(C=100, max_iter=1000)
lr_strong_reg = LogisticRegression(C=0.01, max_iter=1000)

In [None]:
# Train both models on the same training features and labels.
lr_weak_reg.fit(X_train, y_train)
# Last expression of the cell, so the fitted estimator's repr is displayed.
lr_strong_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Score of the weakly regularized model on the data it was trained on
# (not on the held-out test split).
lr_weak_reg.score(X_train, y_train)

0.99996

In [None]:
# Score of the strongly regularized model on the data it was trained on
# (not on the held-out test split).
lr_strong_reg.score(X_train, y_train)

0.80152