# Labor IV. Linear Regression

<img src="https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F62413fa0-3d80-411c-af93-ebd0f096a26a_1042x644.png">

## I have an idea: let's do sentiment analysis!

<img src="https://gmu.ac.ae/wp-content/uploads/2017/03/idea.jpg">

## I need some data!

<img src="https://staging.herovired.com/wp-content/uploads/2023/04/What-Is-Data-Definition-01.webp">

## [Huggingface](https://huggingface.co/docs/datasets/index)

- IMDB dataset: hf://datasets/scikit-learn/imdb/IMDB Dataset.csv

In [None]:
import pandas as pd

# Read the IMDB reviews CSV through its hf:// URI and preview the first rows
IMDB_CSV_URI = "hf://datasets/scikit-learn/imdb/IMDB Dataset.csv"
imdb_dataset = pd.read_csv(IMDB_CSV_URI)
imdb_dataset.head()

## Text cleaning

<img src="https://www.henryford.com/-/media/project/hfhs/henryford/henry-ford-blog/images/mobile-interior-banner-images/2019/02/bucket-of-cleaning-products.jpg">

In [None]:
# Lower case

# Use the vectorized string accessor instead of a per-row Python lambda;
# unlike the lambda, it passes missing values through rather than raising.
imdb_dataset["review"] = imdb_dataset["review"].str.lower()
imdb_dataset.head()

In [None]:
# Remove white spaces

from tqdm.notebook import tqdm

def remove_extra_whitespace(text):
  """Trim both ends of ``text`` and collapse each whitespace run into one space."""
  # split() with no separator breaks on any run of whitespace
  words = text.strip().split()
  return " ".join(words)

# Normalize whitespace in every review while a progress bar tracks the pass
imdb_dataset['review'] = list(map(remove_extra_whitespace, tqdm(imdb_dataset['review'], desc="Cleaning reviews")))

In [None]:
# Remove special characters

import re

def remove_special_characters(text):
  """Removes HTML tags and special characters from the text.

  HTML tags (e.g. ``<br />``) are replaced by a single space first. This step
  runs before the HTML-cleaning cell, and deleting ``<``, ``/`` and ``>`` one
  character at a time would otherwise leave the tag name behind as a stray
  token glued to the neighbouring words (``good<br />movie`` -> ``goodbr movie``).

  Args:
    text: The string to clean.

  Returns:
    ``text`` with tags replaced by spaces and every character that is not an
    ASCII letter, a digit or whitespace removed.
  """
  # A tag is '<', an optional '/', a letter, then anything up to the closing '>'.
  # Requiring a letter keeps plain comparisons such as "1 < 2" from matching.
  text = re.sub(r"</?[a-zA-Z][^<>]*>", " ", text)
  text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
  return text

# Strip special characters from every review, with a progress bar, then preview
imdb_dataset['review'] = list(map(remove_special_characters, tqdm(imdb_dataset['review'], desc="Removing special characters")))
imdb_dataset.head()

In [None]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
  """Parse ``text`` with BeautifulSoup's "html.parser" and return its text content."""
  return BeautifulSoup(text, "html.parser").get_text()

# Run the HTML cleaner over every review, with a progress bar, then preview
imdb_dataset['review'] = list(map(remove_html_tags, tqdm(imdb_dataset['review'], desc="Removing HTML tags")))
imdb_dataset.head()

In [None]:
!pip install contractions

In [None]:
import contractions

def expand_contractions(text):
  """Return ``text`` after passing it through ``contractions.fix``."""
  expanded_text = contractions.fix(text)
  return expanded_text

# Expand contractions in every review, with a progress bar, then preview
imdb_dataset['review'] = list(map(expand_contractions, tqdm(imdb_dataset['review'], desc="Expanding contractions")))
imdb_dataset.head()

In [None]:
# Removing Punctuation

import nltk
import string

nltk.download('punkt')
# Newer NLTK releases load the word_tokenize models from 'punkt_tab' instead of
# 'punkt'; fetch both so tokenization works regardless of the installed version.
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

def remove_punctuation(text):
  """Return ``text`` with every character of ``string.punctuation`` deleted."""
  # Mapping a character to None in a translation table deletes it
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  return text.translate(deletion_table)


# Assign the whole column in one step. Element-wise writes of the form
# imdb_dataset["review"][i] = ... are chained assignment: pandas warns about
# them and, with Copy-on-Write enabled, they never reach the DataFrame. They
# also look rows up by label, which only works with the default 0..n-1 index.
imdb_dataset["review"] = [remove_punctuation(review) for review in tqdm(imdb_dataset["review"])]

imdb_dataset.head()

In [None]:
# Remove Numbers

def remove_numbers(text):
  """Return ``text`` without any character for which ``str.isdigit`` is true."""
  return ''.join(filter(lambda ch: not ch.isdigit(), text))

# Drop digits from every review, with a progress bar, then preview
imdb_dataset["review"] = list(map(remove_numbers, tqdm(imdb_dataset['review'], desc="Cleaning reviews")))
imdb_dataset.head()

In [None]:
# Remove stop words

from nltk.corpus import stopwords
# Download the NLTK 'stopwords' resource before stopwords.words('english') is called below
nltk.download('stopwords')

_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
  """Return the English stop-word set, reading it from NLTK only once."""
  global _ENGLISH_STOP_WORDS
  if _ENGLISH_STOP_WORDS is None:
    _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOP_WORDS


def remove_stopwords(text):
  """Removes stopwords from a string.

  The text is tokenized with ``word_tokenize``; tokens whose lower-cased form
  is in NLTK's English stop-word list are dropped, and the remaining tokens
  are joined with single spaces.

  Args:
    text: The string to filter.

  Returns:
    The kept tokens joined by a single space.
  """
  # The stop-word list is identical for every review, so it is loaded once
  # and reused instead of being rebuilt on each call.
  stop_words = _english_stop_words()
  return " ".join(w for w in word_tokenize(text) if w.lower() not in stop_words)

# Filter stop words out of every review, with a progress bar, then preview
imdb_dataset["review"] = list(map(remove_stopwords, tqdm(imdb_dataset['review'], desc="Cleaning reviews")))
imdb_dataset.head()

In [None]:
# Lemmatization

import nltk
# Download the NLTK 'wordnet' resource before the lemmatizer below is used
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
  """Tokenize ``text``, lemmatize each token with WordNet, and re-join with spaces."""
  wordnet = WordNetLemmatizer()
  lemmas = map(wordnet.lemmatize, word_tokenize(text))
  return " ".join(lemmas)

# Lemmatize every review, with a progress bar, then preview
imdb_dataset["review"] = list(map(lemmatize_text, tqdm(imdb_dataset['review'], desc="Cleaning reviews")))
imdb_dataset.head()

## Training, validation and test set

<img src="https://www.brainstobytes.com/content/images/2020/01/Sets.png">



In [None]:
# Labeling

label_by_sentiment = {'positive': 1, 'negative': 0}

# Skip the mapping when the column already holds 0/1, so that re-running this
# cell does not turn every label into NaN.
if not imdb_dataset['sentiment'].isin(list(label_by_sentiment.values())).all():
    encoded_sentiment = imdb_dataset['sentiment'].map(label_by_sentiment)
    # .map() yields NaN for any value missing from the dict; fail loudly here
    # instead of letting NaN labels flow into training.
    if encoded_sentiment.isna().any():
        raise ValueError("Unexpected value in 'sentiment' column; expected 'positive' or 'negative'.")
    imdb_dataset['sentiment'] = encoded_sentiment
imdb_dataset.head()

In [None]:
# Train and test set

from sklearn.model_selection import train_test_split

# Carve off 20% of the rows as a held-out pool; the other 80% is the training set
train_df, temp_df = train_test_split(imdb_dataset, random_state=42, test_size=0.2)

# Halve the held-out pool into equally sized validation and test sets
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Report how many rows ended up in each split
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

In [None]:
# CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts, with the vocabulary capped at 1000 features
vectorizer = CountVectorizer(max_features=1000)

# Fit the vocabulary on the training reviews only and densify the counts
train_df_cv = vectorizer.fit_transform(train_df['review']).toarray()

# Encode the validation and test reviews with that same fitted vocabulary
val_df_cv, test_df_cv = (
    vectorizer.transform(split_df['review']).toarray()
    for split_df in (val_df, test_df)
)

In [None]:
# TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights, with the vocabulary capped at 1000 features
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Fit on the training reviews only; the result is kept as returned (no .toarray())
train_df_tfidf = tfidf_vectorizer.fit_transform(train_df['review'])

# Encode the validation and test reviews with that same fitted vectorizer
val_df_tfidf, test_df_tfidf = (
    tfidf_vectorizer.transform(split_df['review'])
    for split_df in (val_df, test_df)
)

In [None]:
# Labels

# Pull the 'sentiment' column of each split out as a plain array
train_df_labels, val_df_labels, test_df_labels = (
    split_df['sentiment'].values for split_df in (train_df, val_df, test_df)
)

## Modelling

<img src="https://images.spiceworks.com/wp-content/uploads/2022/04/11040521/46-4-e1715636469361.png">



In [None]:
# Model CV

import tensorflow as tf

# One sigmoid unit fed directly by the count features
model_cv = tf.keras.models.Sequential()
model_cv.add(tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(train_df_cv.shape[1],)))

# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
model_cv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Five epochs on the training split, validating on the validation split
model_cv.fit(train_df_cv, train_df_labels, validation_data=(val_df_cv, val_df_labels), epochs=5)

In [None]:
# Model TF-IDF

import tensorflow as tf

# This model must learn from the TF-IDF features, because it is evaluated on
# test_df_tfidf; training it on the count features would just duplicate model_cv.
# The TF-IDF matrices are densified so they are fed to Keras in the same form
# as the count features.
train_tfidf_dense = train_df_tfidf.toarray()
val_tfidf_dense = val_df_tfidf.toarray()

# Define the model: one sigmoid unit fed directly by the TF-IDF features
model_tfidf = tf.keras.models.Sequential([
  tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(train_tfidf_dense.shape[1],))
])

# Compile the model
model_tfidf.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model on the TF-IDF training split, validating on the TF-IDF validation split
model_tfidf.fit(train_tfidf_dense, train_df_labels, epochs=5, validation_data=(val_tfidf_dense, val_df_labels))

In [None]:
# Score both models on the test split; each evaluate() call is unpacked as (loss, accuracy)
loss_cv, accuracy_cv = model_cv.evaluate(x=test_df_cv, y=test_df_labels)
print('Test accuracy (CV):', accuracy_cv)

loss_tfidf, accuracy_tfidf = model_tfidf.evaluate(x=test_df_tfidf, y=test_df_labels)
print('Test accuracy (TFIDF):', accuracy_tfidf)