In [None]:
import os

# Move from the notebook's folder up to the repository root, where the `src`
# package imported by the following cells lives. The guard makes the cell safe to
# re-run: once `src` is visible in the working directory, no further move happens.
if not os.path.isdir("src"):
    os.chdir("..")
print(os.getcwd())

# FileUtil Initialization

In [None]:
from src.utils.file_util import FileUtil

# Shared helper instance; later cells call it to fetch the raw data, the stored
# metrics and the topic figures.
file_util = FileUtil()

# Retrieve Raw Train Data

In [None]:
# Raw training data as returned by FileUtil; `train` is rebound to the cleaned
# version in the Preprocessing section.
train = file_util.get_raw_train_data()

In [None]:
# Preview the raw data before any cleaning is applied.
train.head()

# Preprocessing

In [None]:
from src.preprocessing.transformations import apply_cleaning_train

In [None]:
# NOTE: `train` is rebound to the cleaned result, so re-running this cell feeds
# already-cleaned data back into apply_cleaning_train. Re-run the load cell first
# if this cell needs to be executed again.
train = apply_cleaning_train(train)

In [None]:
# Preview the data after cleaning.
train.head()

# Training

## Sentiment Analysis

In [None]:
from src.models.sentiment_analysis.train.train import sentiment_analysis_train

In [None]:
# NOTE(review): called with no arguments, so the `train` frame prepared above is
# not passed in — presumably the function obtains its own data; confirm.
sentiment_analysis_train()

## Topic Modelling

In [None]:
from src.models.topic_modelling.train.train import topic_modelling_train

In [None]:
# NOTE(review): called with no arguments, so the `train` frame prepared above is
# not passed in — presumably the function obtains its own data; confirm.
topic_modelling_train()

# Evaluation

## Sentiment Analysis

In [None]:
# Fetch the sentiment-analysis metrics. The next cell iterates it with .items()
# and reads a "PR AUC" entry from each value (model name -> metric dict).
metrics = file_util.get_metrics("sentiment_analysis")
metrics

In [None]:
# (model name, PR AUC) pairs, ordered from lowest to highest score.
models_prauc = [(model_name, scores["PR AUC"]) for model_name, scores in metrics.items()]
models_prauc.sort(key = lambda pair: pair[1])

# The last entry after the ascending sort is the best-scoring model.
best_model_name, best_model_prauc = models_prauc[-1]
print("Best model is {} with PR-AUC {}".format(best_model_name, best_model_prauc))

## Topic Modelling

In [None]:
# Fetch the LDA topic figure and resize it. The update_layout call is the cell's
# last expression, so whatever it returns is what gets displayed.
fig = file_util.get_topics_html("LDA")
fig.update_layout(width = 700, height = 800)

In [None]:
# Fetch the BERTopic topic figure and resize it. The update_layout call is the
# cell's last expression, so whatever it returns is what gets displayed.
fig = file_util.get_topics_html("BERTopic")
fig.update_layout(width = 700, height = 1000)

In [None]:
# Fetch the NMF topic figure and resize it. The update_layout call is the cell's
# last expression, so whatever it returns is what gets displayed.
fig = file_util.get_topics_html("NMF")
fig.update_layout(width = 700, height = 550)

# Predict reviews_test.csv

In [None]:
from src.models.predict import predict_sentiment_topic

In [None]:
# Show the currently configured model name; this attribute is reassigned in the
# "Extra" section below to try other models.
file_util.best_sentiment_analysis_model

In [None]:
# Show the test file name held by file_util (presumably the file that is
# predicted on below — confirm against predict_sentiment_topic).
file_util.TEST_FILE_NAME

In [None]:
# Run the prediction pipeline. It is called with no arguments, so the input data
# and model choice are resolved inside predict_sentiment_topic (not visible here).
test_bert = predict_sentiment_topic()
test_bert.head()

In [None]:
# Drop the intermediate columns and rename the rest before the frame is written
# to CSV. Rebinding (instead of inplace=True) together with errors="ignore" keeps
# this cell re-runnable: on a second run the columns are already gone/renamed and
# the cell is a no-op rather than a KeyError.
# NOTE(review): "topic" and "subtopic" are removed here, yet the Visualizations
# section later builds vis_df from this same frame and filters on vis_df["topic"]
# — confirm reformat_data does not need these columns.
test_bert = (
    test_bert
    .drop(columns = ["cleaned_text", "subtopic", "topic"], errors = "ignore")
    .rename(columns = {
        "partially_cleaned_text": "Text",
        "date": "Time",
        "sentiment": "predicted_sentiment",
        "sentiment_prob": "predicted_sentiment_prob",
    })
)

In [None]:
# Check the trimmed and renamed columns before exporting.
test_bert.head()

In [None]:
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): no `index` argument is given, so the frame's index is written as
# an extra first column (pandas default); pass index=False if the file should
# contain only the data columns.
test_bert.to_csv("final_presentation/reviews_test_predictions_h2o2.ai.csv")

## Extra (Prediction with Other Models)

In [None]:
# Override the configured model on the shared file_util instance, then predict again.
# NOTE(review): predict_sentiment_topic() takes no arguments, so this override only
# matters if the function reads this same FileUtil state — confirm. The attribute is
# not restored afterwards.
file_util.best_sentiment_analysis_model = "Logistic Regression"
test_logreg = predict_sentiment_topic()
test_logreg.head()

In [None]:
# Override the configured model on the shared file_util instance, then predict again.
# NOTE(review): predict_sentiment_topic() takes no arguments, so this override only
# matters if the function reads this same FileUtil state — confirm. The attribute
# stays set to "LSTM" for any cell executed after this one.
file_util.best_sentiment_analysis_model = "LSTM"
test_lstm = predict_sentiment_topic()
test_lstm.head()

## Visualizations

In [None]:
import pandas as pd
# Explicit names instead of `import *`: every helper used by the visualization
# cells below is listed here, so it is clear where each one comes from and nothing
# else is silently pulled into the notebook namespace.
# `pio` was previously supplied by the star import (presumably plotly.io — confirm).
from src.visualisation.dashboard_viz import (
    get_subtopics,
    pio,
    reformat_data,
    sentiment_line_chart_over_time,
    sentiment_pie_chart,
    topics_bar_chart,
    topics_bar_chart_over_time,
    topics_pie_chart,
    visualise_all_topics,
)

# Frame consumed by all chart helpers below, derived from the BERT predictions.
vis_df = reformat_data(test_bert)
# Use the "svg" renderer as the default for figures shown in this notebook.
pio.renderers.default = "svg"

### Visualizations for sentiments

In [None]:
# TODO(review): still needs input from Weiqing or Madeline.

# Build the three sentiment figures from the reformatted predictions.
sentiment_pie_chart_fig = sentiment_pie_chart(vis_df)
sentiment_trend_fig = sentiment_line_chart_over_time(vis_df)
topics_sentiment_fig = topics_bar_chart(vis_df)

# Pair each figure with its layout overrides, then apply and render them in order.
sentiment_figure_layouts = [
    (sentiment_pie_chart_fig, dict(width = 500, height = 300, title = 'Overall Sentiment Breakdown')),
    (sentiment_trend_fig, dict(title = 'Sentiment trend')),
    (topics_sentiment_fig, dict(title = 'Topics by Sentiment')),
]
for sentiment_figure, layout_overrides in sentiment_figure_layouts:
    display(sentiment_figure.update_layout(**layout_overrides))

### Visualizations for topics

In [None]:
# Build the three topic-level figures from the reformatted predictions.
topics_pie_chart_fig = topics_pie_chart(vis_df)
topics_bar_chart_fig = topics_bar_chart_over_time(vis_df, time_frame = 'Q')
top_key_words_fig = visualise_all_topics(vis_df)

# Size and title the pie chart, then show it.
topics_pie_layout = dict(width = 500, height = 300, title = 'Frequency of topics')
display(topics_pie_chart_fig.update_layout(**topics_pie_layout))

# The bar chart only needs a title.
topics_bar_layout = dict(title = 'Topics over Time')
display(topics_bar_chart_fig.update_layout(**topics_bar_layout))

# The key-words figure is shown as returned, without layout changes.
display(top_key_words_fig)

### Visualizations for a specific topic

In [None]:
# Topic to drill into; change this value to inspect a different topic.
select_topic = 'Drinks'

# Subtopic chart for the chosen topic.
subtopic_fig = get_subtopics(vis_df, topic = select_topic)

# Sentiment pie chart restricted to rows belonging to the chosen topic.
is_selected_topic = vis_df["topic"] == select_topic
subtopic_sentiment_fig = sentiment_pie_chart(vis_df[is_selected_topic])

# Both figures share the same size; only the sentiment chart gets a title.
sentiment_layout = dict(width = 500, height = 300, title = f'Sentiment Breakdown for {select_topic}')
display(subtopic_sentiment_fig.update_layout(**sentiment_layout))

subtopic_layout = dict(width = 500, height = 300)
display(subtopic_fig.update_layout(**subtopic_layout))

# Unit Testing

In [1]:
import os

# The first cell of this notebook already changes to the repository root. The
# guard keeps a top-to-bottom run from climbing one level further, which would
# break relative paths such as list_files("src") below. When this section is run
# on its own from a fresh kernel, `src` is not yet visible and the move happens.
if not os.path.isdir("src"):
    os.chdir("..")
print(os.getcwd())

c:\Users\clift\github\h2o2.ai


In [2]:
import src.unittest.unit_testing
from src.unittest.unit_testing import unit_test

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\clift\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Names exported by the unit-testing module that begin with "test".
test_method_names = [name for name in dir(src.unittest.unit_testing) if name.startswith("test")]
print("Methods in unit testing:", test_method_names)

In [3]:
# Run the project's unit tests. The recorded output below shows a
# TFBertForSequenceClassification checkpoint being loaded during the run.
unit_test()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at c:\Users\clift\github\h2o2.ai\src/models/sentiment_analysis\train\bert_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.
Some layers from the model checkpoint at c:\Users\clift\github\h2o2.ai\src/models/sentiment_analysis\train\bert_model were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that

# Modular Code

In [None]:
import os

def list_files(startpath):
    """Print an indented directory tree rooted at ``startpath``.

    Each directory is printed as ``name/`` followed by its files, indented four
    spaces per nesting level. ``__pycache__`` directories and everything beneath
    them are left out. Directories and files are listed in sorted order so the
    output is the same on every run.

    Args:
        startpath: Directory to list. A trailing path separator is tolerated.

    Returns:
        None. The tree is written to standard output. A path that does not
        exist prints nothing.
    """
    # Normalise first so a trailing separator cannot skew the depth calculation
    # or leave the root directory with an empty name.
    startpath = os.path.normpath(startpath)
    for root, dirs, files in os.walk(startpath):
        # Prune in place: os.walk then never descends into bytecode caches.
        dirs[:] = sorted(d for d in dirs if d != "__pycache__")
        # Depth comes from the path relative to the root rather than from a
        # substring replace, which also matched repeats deeper in the path.
        relative_root = os.path.relpath(root, startpath)
        level = 0 if relative_root == os.curdir else relative_root.count(os.sep) + 1
        indent = ' ' * 4 * level
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in sorted(files):
            print('{}{}'.format(subindent, f))

In [None]:
# Print the tree of the `src` directory; the relative path is resolved against
# the current working directory.
list_files("src")

# OOP

In [None]:
from src.utils.file_util import FileUtil

# Callable, non-dunder names found on the class.
file_util_method_names = [
    name for name in dir(FileUtil)
    if callable(getattr(FileUtil, name)) and not name.startswith("__")
]
print("Methods in FileUtil:", file_util_method_names)

# Instance attributes, read from a freshly constructed object.
print("Attributes in FileUtil:", list(vars(FileUtil())))

In [None]:
from src.models.sentiment_analysis.train.bert import BERT

# Callable, non-dunder names found on the class.
bert_method_names = [
    name for name in dir(BERT)
    if callable(getattr(BERT, name)) and not name.startswith("__")
]
print("Methods in BERT:", bert_method_names)

# Instance attributes, read from a freshly constructed object.
print("Attributes in BERT:", list(vars(BERT())))

In [None]:
from src.models.topic_modelling.train.lda import LDA

# Callable, non-dunder names found on the class.
lda_method_names = [
    name for name in dir(LDA)
    if callable(getattr(LDA, name)) and not name.startswith("__")
]
print("Methods in LDA:", lda_method_names)

# Instance attributes, read from a freshly constructed object.
print("Attributes in LDA:", list(vars(LDA())))

In [None]:
from src.models.topic_modelling.train.bertopic import BERTopic_Module

# Callable, non-dunder names found on the class.
bertopic_method_names = [
    name for name in dir(BERTopic_Module)
    if callable(getattr(BERTopic_Module, name)) and not name.startswith("__")
]
print("Methods in BERTopic_Module:", bertopic_method_names)

# Instance attributes, read from a freshly constructed object.
print("Attributes in BERTopic_Module:", list(vars(BERTopic_Module())))

# Docstring Examples

In [None]:
# Show the built-in help text for a FileUtil method.
help(FileUtil.put_csv)

In [None]:
# Print the raw docstring; relies on predict_sentiment_topic having been imported
# in the prediction section above.
print(predict_sentiment_topic.__doc__)

In [None]:
# Relies on sentiment_analysis_train having been imported in the Training section above.
help(sentiment_analysis_train)

In [None]:
# Relies on topic_modelling_train having been imported in the Training section above.
help(topic_modelling_train)