<a href="https://colab.research.google.com/github/redsprites/A6_MongoDB/blob/main/AIProj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Project Setup

In [None]:
!pip install pandas numpy transformers

Looking in indexes: https://download.pytorch.org/whl/cu118


In [None]:
!pip install ktrain

Collecting ktrain
  Downloading ktrain-0.39.0.tar.gz (25.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langdetect (from ktrain)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting syntok>1.3.3 (from ktrain)
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting tika (from ktrain)
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from ktrain)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting k

In [None]:
!pip install tensorflow



In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
from ktrain import text
import ktrain
from sklearn.model_selection import train_test_split

In [None]:
# Raw JSON exports of professor reviews (science and humanities subsets),
# hosted in the ssdtac/Professor-Reviews GitHub repository.
science_professors_url = "https://raw.githubusercontent.com/ssdtac/Professor-Reviews/master/science_professors_v2.json"
humanities_professors_url = "https://raw.githubusercontent.com/ssdtac/Professor-Reviews/master/humanities_professors_v2.json"

## Preprocess Data
First, clean the review text; then split the data into training and test sets; finally, feed it into a DistilBERT classifier trained with ktrain (a Keras wrapper).

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used by preprocess_text: the Punkt tokenizer
# models (needed by word_tokenize) and the English stop-word list.
# NLTK 3.9+ loads Punkt from the 'punkt_tab' package instead of the pickled
# 'punkt' one, so fetch both to keep the notebook runnable across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# WordNet corpus: required by the WordNetLemmatizer used in preprocess_text.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import re
from nltk.stem import WordNetLemmatizer
import requests
from textblob import TextBlob

In [None]:
# Lazily-populated cache so the stop-word set and the lemmatizer are built
# once, not once per review.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def preprocess_text(text):
    """Normalize a raw review comment into a space-joined string of lemmas.

    Steps, in order: lowercase; replace every non-word character with a
    space; drop stray single letters; collapse whitespace; strip a leading
    standalone ``b``; tokenize; remove English stop words; lemmatize; keep
    only tokens longer than three characters.

    Args:
        text: The raw comment. Any value that is not a ``str`` (for example
            ``None`` or a NaN from a missing field) is treated as empty.

    Returns:
        The cleaned comment, or ``''`` when nothing survives the cleaning.
    """
    # Missing values arrive as None/NaN; treat them as "no text" instead of
    # crashing on .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Single letter at the very start of the text. The pattern used to be
    # r'\^[a-zA-Z]\s+', which looked for a literal '^' and could never match
    # because the \W substitution above has already removed every '^'.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Presumably removes a leftover bytes-literal prefix ("b ...").
    text = re.sub(r'^b\s+', '', text)

    # Nothing left to tokenize: skip the NLTK pipeline entirely.
    if not text.strip():
        return ''

    stop_words, lemmatizer = _get_text_tools()
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )
    # The length filter runs on the lemma (not the raw token), as before.
    return ' '.join(lemma for lemma in lemmas if len(lemma) > 3)

def preprocess_ratings(num):
    """Shift a 1-based rating to a 0-based integer (1 -> 0, ..., 5 -> 4).

    The shifted value is passed through ``int``, so any fractional part is
    truncated toward zero.
    """
    zero_based = num - 1
    return int(zero_based)

def load_and_preprocess_data(url):
    """Load professor reviews from a JSON source and clean them for training.

    Pipeline:
      1. Drop rows whose comment is the placeholder ``'No Comments'``.
      2. Run ``preprocess_text`` on every comment and drop rows left empty
         (missing comments count as empty).
      3. Keep only rows whose ``qualityRating`` and ``difficultyRating`` are
         both within 1-5 inclusive.
      4. Shift ``qualityRating`` with ``preprocess_ratings`` (1-5 -> 0-4).

    Row counts for each stage are printed as a side effect.

    Args:
        url: Anything ``pandas.read_json`` accepts (URL, path, or buffer).

    Returns:
        A new DataFrame containing the surviving rows, with the cleaned
        ``comment`` column and the shifted ``qualityRating`` column.
    """
    raw = pd.read_json(url)
    original_row_count = len(raw)
    print(f"Rows before preprocessing: {original_row_count}")

    # Remove rows with 'No Comments'. The .copy() yields an independent frame
    # so the column assignment below does not write into a slice of `raw`
    # (which would trigger pandas' SettingWithCopyWarning).
    data = raw[raw['comment'].ne('No Comments')].copy()

    preprocessed_row_count = len(data)
    print(f"Rows discarded for being 'No Comments': {original_row_count - preprocessed_row_count}")

    # Apply text preprocessing to 'comment' column. Non-string values
    # (NaN/None from missing fields) become '' instead of crashing the
    # cleaner, and are then removed by the emptiness filter below.
    data['comment'] = data['comment'].apply(
        lambda comment: preprocess_text(comment) if isinstance(comment, str) else ''
    )

    # Remove rows where 'comment' is empty after preprocessing
    data = data[data['comment'].str.strip().ne('')]

    filtered_row_count = len(data)
    print(f"Rows discarded after preprocessing: {preprocessed_row_count - filtered_row_count}")

    # Filtering for valid 'qualityRating' and 'difficultyRating'
    valid_quality = data['qualityRating'].between(1, 5, inclusive='both')
    valid_difficulty = data['difficultyRating'].between(1, 5, inclusive='both')
    data = data[valid_quality & valid_difficulty].copy()
    data['qualityRating'] = data['qualityRating'].apply(preprocess_ratings)
    # Printing final statistics
    print(f"Rows left after preprocessing: {len(data)}")

    return data


# Load and clean both review corpora (science first, then humanities).
science_professors, humanities_professors = [
    load_and_preprocess_data(source_url)
    for source_url in (science_professors_url, humanities_professors_url)
]

Rows before preprocessing: 6412
Rows discarded for being 'No Comments': 56
Rows discarded after preprocessing: 5
Rows left after preprocessing: 6338
Rows before preprocessing: 11389
Rows discarded for being 'No Comments': 178
Rows discarded after preprocessing: 22
Rows left after preprocessing: 11166


## Split Data

In [None]:
# Hold out 20% of the science reviews for evaluation (fixed seed).
train_df, test_df = train_test_split(science_professors, test_size=0.2, random_state=40)

train_size, test_size = len(train_df), len(test_df)

print("Size of training set:", train_size,"\n"+"Size of test set:", test_size)


def _comments_and_labels(frame):
    """Return ``(comment texts, integer quality ratings)`` as NumPy arrays."""
    comments = frame['comment'].to_numpy()
    labels = frame['qualityRating'].to_numpy().astype(int)
    return comments, labels


x_train, y_train = _comments_and_labels(train_df)
x_test, y_test = _comments_and_labels(test_df)

Size of training set: 5070 
Size of test set: 1268


## Add base DistilBERT model

In [None]:
# Create the ktrain Transformer wrapper for 'distilbert-base-uncased'.
# The five class names are the original 1-5 ratings, while the labels fed in
# later are the 0-based values produced by preprocess_ratings -- so label i
# should correspond to rating i + 1, assuming ktrain treats integer labels as
# indices into `classes` (TODO confirm).
# maxlen=120 presumably caps the tokenized sequence length.
t = text.Transformer('distilbert-base-uncased', maxlen=120, classes=[1,2,3,4,5])



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Convert the comment arrays and their 0-based quality labels into the
# train / validation inputs via the Transformer wrapper `t`.
# NOTE: an earlier variant that used text.texts_from_df with both
# 'qualityRating' and 'difficultyRating' as label columns is left disabled.

trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)

preprocessing train...
language: en
train sequence lengths:
	mean : 19
	95percentile : 30
	99percentile : 33


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 19
	95percentile : 30
	99percentile : 32


## Create Model

In [None]:
# Build the classifier from the Transformer wrapper `t`
# (configured for 'distilbert-base-uncased').
model = t.get_classifier()

## Test for different Learning Rates

In [None]:
# Wrap the model together with the preprocessed train / validation sets in a
# ktrain learner, using a batch size of 64.
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=64)

# Simulate training across a range of learning rates and plot the result;
# the plot is what informs the `lr` passed to autofit.
learner.lr_find(show_plot=True)

simulating training for different learning rates... this may take a few moments...
Epoch 1/1024

## Train the Model

In [None]:
# Train with a learning rate of 2e-5. No epoch count is passed, so the run
# length is left to ktrain's automatic early stopping / reduce-on-plateau
# (both reported as enabled in the log output).
# NOTE(review): the saved output reports "max lr of 1e-07", which does not
# match lr=2e-5, and ends in an AttributeError -- it appears to come from a
# different run. Re-run on a fresh kernel to confirm training works.
learner.autofit(lr=2e-5)

early_stopping automatically enabled at patience=5
reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 1e-07...
Epoch 1/1024

AttributeError: ignored

In [None]:
# Print the learner's predictions; with no argument this presumably runs on
# the validation data given to get_learner (TODO confirm).
# NOTE(review): the saved run was interrupted (KeyboardInterrupt) before it
# produced any result.
print(learner.predict())

      3/Unknown - 24s 7s/step

KeyboardInterrupt: ignored