### Text Sentiment Analysis

#### This is version eight 

 This script preprocesses the IMDB movie review data, trains a machine learning model, and deploys it to the web via Flask to allow users to input text and get a sentiment classification result.

Preprocessing the IMDB movie review dataset from Stanford:
+ Download the dataset from the Stanford AI Group website (http://ai.stanford.edu/~amaas/data/sentiment/)
+ Extract the files and preprocess the data by removing HTML tags, converting all text to lowercase, removing punctuation and special characters, and tokenizing the text into individual words.
+ Split the preprocessed data into training and testing sets.

Running the preprocessed data through a Naive Bayes model:
+ Train a Naive Bayes model on the preprocessed training data.
+ Use the trained model to predict the sentiment of the preprocessed testing data.
+ Evaluate the performance of the model using metrics like accuracy, precision, recall, and F1 score.

Deploying the model to the internet via Flask:
+ Create a Flask app that allows users to input a line of text to be classified.
+ Preprocess the user input by removing HTML tags, converting to lowercase, removing punctuation and special characters, and tokenizing the text into individual words.
+ Use the trained Naive Bayes model to classify the preprocessed user input as either positive or negative.
+ Return the classification result to the user via the Flask app.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

def load_imdb_data(path):
    """Load raw IMDB reviews from ``path/neg`` and ``path/pos`` into a DataFrame.

    Args:
        path: Directory containing ``neg`` and ``pos`` subdirectories, each
            holding one UTF-8 text file per review.

    Returns:
        A DataFrame with a ``review`` column (file contents) and a ``label``
        column (1 for ``pos``, 0 for ``neg``). Rows are ordered ``neg`` first,
        then ``pos``, each group sorted by filename.
    """
    data = []
    labels = []

    for sentiment in ["neg", "pos"]:
        dir_path = os.path.join(path, sentiment)
        label = 1 if sentiment == "pos" else 0
        # os.listdir() returns entries in arbitrary order; sort them so the
        # row order is reproducible across runs and machines.
        for filename in sorted(os.listdir(dir_path)):
            file_path = os.path.join(dir_path, filename)
            # Skip subdirectories and other non-regular entries: they cannot
            # be opened as a review file.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, "r", encoding="utf-8") as f:
                data.append(f.read())
            labels.append(label)

    return pd.DataFrame({"review": data, "label": labels})

# Cache for the English stopword set, filled on first use so the list is
# fetched from NLTK once instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps, in order: strip HTML tags, lowercase, delete ASCII punctuation,
    tokenize with NLTK, drop English stopwords, lemmatize with WordNet.

    Args:
        text: Raw review text.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    import re  # local import so this block does not depend on the cell's import list

    # Remove HTML tags before punctuation stripping; otherwise only the angle
    # brackets and slash are deleted and the tag name (e.g. "br" from
    # "<br />") survives as a bogus token. Replace with a space so the words
    # on either side of a tag do not get glued together.
    text = re.sub(r"<[^>]+>", " ", text)

    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords (set membership instead of re-fetching and scanning
    # the stopword list for every token)
    stop_words = _english_stopwords()
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words)

if __name__ == "__main__":
    # Dataset root on the mounted Google Drive; point this at "aclImdb"
    # instead when working from a local copy.
    data_path = "/content/drive/MyDrive/aclImdb"
    train_path, test_path = (
        os.path.join(data_path, split) for split in ("train", "test")
    )

    # Read the raw reviews for both splits.
    train_df, test_df = load_imdb_data(train_path), load_imdb_data(test_path)

    # Clean every review in place.
    train_df["review"] = train_df["review"].map(preprocess_text)
    test_df["review"] = test_df["review"].map(preprocess_text)

    # Persist the cleaned splits so later cells can skip preprocessing.
    train_df.to_csv("preprocessed_train.csv", index=False)
    test_df.to_csv("preprocessed_test.csv", index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
# Copy the preprocessed data to Google Drive to avoid having to preprocess it again.
!cp /content/preprocessed_test.csv /content/drive/MyDrive/aclImdb
!cp /content/preprocessed_train.csv /content/drive/MyDrive/aclImdb

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import csv

def load_data(train_file, test_file):
    """Read the preprocessed train and test CSV files.

    Args:
        train_file: Path to the training CSV.
        test_file: Path to the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    # keep_default_na=False: by default pandas parses empty fields and tokens
    # such as "null", "nan" or "none" as float NaN. A review whose cleaned
    # text is empty or is one of those words would then stop being a string
    # and break text vectorization downstream.
    train_df = pd.read_csv(train_file, keep_default_na=False)
    test_df = pd.read_csv(test_file, keep_default_na=False)

    return train_df, test_df

def create_vectorizer():
    """Build a new, unfitted TF-IDF vectorizer with default settings."""
    tfidf = TfidfVectorizer()
    return tfidf

def train_model(train_data, train_labels, vectorizer):
    """Fit the vectorizer on the training text and grid-search a Naive Bayes model.

    Args:
        train_data: Iterable of training documents.
        train_labels: Labels aligned with ``train_data``.
        vectorizer: Vectorizer to fit; it is fitted in place by this call.

    Returns:
        The best estimator found by the 5-fold grid search over ``alpha``.
    """
    alpha_candidates = [0.01, 0.1, 0.5, 1.0, 2.0]

    # Fitting here means the caller's vectorizer is ready for transform().
    features = vectorizer.fit_transform(train_data)

    search = GridSearchCV(MultinomialNB(), {'alpha': alpha_candidates}, cv=5)
    search.fit(features, train_labels)
    return search.best_estimator_

def evaluate_model(model, test_data, test_labels, vectorizer):
    """Predict on the test documents, print a classification report, return predictions.

    Args:
        model: Fitted classifier exposing ``predict``.
        test_data: Iterable of test documents.
        test_labels: True labels aligned with ``test_data``.
        vectorizer: Already-fitted vectorizer used to transform ``test_data``.

    Returns:
        The predictions produced by ``model.predict``.
    """
    predicted = model.predict(vectorizer.transform(test_data))
    report = classification_report(test_labels, predicted)
    print(report)
    return predicted

def save_predictions(predictions, output_file):
    """Write predictions to a one-column CSV with a ``prediction`` header.

    Args:
        predictions: Iterable of prediction values, one per output row.
        output_file: Destination path; an existing file is overwritten.
    """
    # newline="" lets the csv module control line endings itself.
    with open(output_file, "w", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(["prediction"])
        out.writerows([value] for value in predictions)

if __name__ == "__main__":
    # Input and output locations, relative to the working directory.
    train_file, test_file = "preprocessed_train.csv", "preprocessed_test.csv"
    predictions_file = "predictions.csv"

    train_df, test_df = load_data(train_file, test_file)

    # Fit the vectorizer and the classifier on the training split.
    vectorizer = create_vectorizer()
    model = train_model(
        train_data=train_df["review"],
        train_labels=train_df["label"],
        vectorizer=vectorizer,
    )

    # Report test-set metrics and store the raw predictions.
    predictions = evaluate_model(
        model=model,
        test_data=test_df["review"],
        test_labels=test_df["label"],
        vectorizer=vectorizer,
    )
    save_predictions(predictions, predictions_file)


              precision    recall  f1-score   support

           0       0.81      0.89      0.84     12500
           1       0.88      0.79      0.83     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



In [16]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import csv

def load_data(train_file, test_file):
    """Load the preprocessed training and test sets from CSV.

    Args:
        train_file: Path to the training CSV.
        test_file: Path to the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    # Disable pandas' default NA parsing: otherwise an empty review, or one
    # consisting of a token like "null" or "nan", is read back as float NaN
    # instead of a string and cannot be vectorized as text.
    train_df = pd.read_csv(train_file, keep_default_na=False)
    test_df = pd.read_csv(test_file, keep_default_na=False)

    return train_df, test_df

def create_vectorizer():
    """Return a default-configured TfidfVectorizer that has not been fitted yet."""
    new_vectorizer = TfidfVectorizer()
    return new_vectorizer

def train_model(train_data, train_labels, vectorizer):
    """Vectorize the training text and pick a MultinomialNB model by grid search.

    Args:
        train_data: Iterable of training documents.
        train_labels: Labels aligned with ``train_data``.
        vectorizer: Vectorizer that this function fits on ``train_data``.

    Returns:
        The best estimator from a 5-fold grid search over the smoothing
        parameter ``alpha``.
    """
    # The vectorizer is fitted as a side effect, so the caller can reuse
    # (and pickle) it afterwards.
    train_matrix = vectorizer.fit_transform(train_data)

    tuner = GridSearchCV(
        MultinomialNB(),
        {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0]},
        cv=5,
    )
    tuner.fit(train_matrix, train_labels)

    best_model = tuner.best_estimator_
    return best_model

def evaluate_model(model, test_data, test_labels, vectorizer):
    """Score the model on held-out text and print per-class metrics.

    Args:
        model: Fitted classifier exposing ``predict``.
        test_data: Iterable of test documents.
        test_labels: True labels aligned with ``test_data``.
        vectorizer: Already-fitted vectorizer used to transform ``test_data``.

    Returns:
        The predictions returned by ``model.predict``.
    """
    test_matrix = vectorizer.transform(test_data)
    outputs = model.predict(test_matrix)

    metrics_text = classification_report(test_labels, outputs)
    print(metrics_text)

    return outputs

def save_predictions(predictions, output_file):
    """Dump predictions to ``output_file`` as CSV: a header row, then one value per row.

    Args:
        predictions: Iterable of prediction values.
        output_file: Destination path; an existing file is overwritten.
    """
    # newline="" hands line-ending control to the csv module.
    with open(output_file, "w", newline="") as sink:
        rows = [["prediction"], *([item] for item in predictions)]
        csv.writer(sink).writerows(rows)

if __name__ == "__main__":
    # Input and output locations, relative to the working directory.
    train_file, test_file = "preprocessed_train.csv", "preprocessed_test.csv"
    predictions_file = "predictions.csv"

    train_df, test_df = load_data(train_file, test_file)

    # Fit the vectorizer and the classifier on the training split.
    vectorizer = create_vectorizer()
    model = train_model(
        train_data=train_df["review"],
        train_labels=train_df["label"],
        vectorizer=vectorizer,
    )

    # Persist both artifacts: the classifier can only be applied to new text
    # that has gone through this same fitted vectorizer.
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
    with open("vectorizer.pkl", "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    # Report test-set metrics and store the raw predictions.
    predictions = evaluate_model(
        model=model,
        test_data=test_df["review"],
        test_labels=test_df["label"],
        vectorizer=vectorizer,
    )
    save_predictions(predictions, predictions_file)


              precision    recall  f1-score   support

           0       0.81      0.89      0.84     12500
           1       0.88      0.79      0.83     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



In [17]:
# Copy the trained model and vectorizer to Google Drive to avoid having to train the model again.
!cp /content/model.pkl /content/drive/MyDrive/aclImdb
!cp /content/vectorizer.pkl /content/drive/MyDrive/aclImdb