In [3]:
import pandas as pd
import nltk
from keybert import KeyBERT
import string
from nltk.corpus import stopwords
import numpy as np
import matplotlib as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori,association_rules

In [5]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Load data

In [30]:
data = pd.read_csv('./UoY.csv')

### Department statistics

In [None]:
department_list = data['Department'].unique()
department_avg = {}
for department in department_list:
    department_avg[department] = len(data[data['Department'] == department])
department_avg

### combine columns

In [43]:
data = data.replace('\n',' ', regex=True)
data = data.replace('\r',' ', regex=True)
data = data.replace('  ',' ', regex=True)
# TODO: add prerequisite
data['combined'] = data['Outcome'].astype(str) + ' ' + data['Objective'].astype(str) + ' ' + data['Description'].astype(str)
# data = data[data['combined'].notna()]

In [53]:
len(data[data['combined'].isna()])

0

### Preprocess

In [54]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
words_set = set()
data['preprocessed'] = ""
data['keywords'] = ""
total_rows = data.shape[0]

for index, row in data.iterrows():
    print('Lesson {} from {} preprocessed'.format(index, total_rows))
    combined = row['combined']
    combined = combined.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    tokenized_doc = word_tokenize(combined)
    tokenized_doc = [word for word in tokenized_doc if word not in stopwords.words('english')]
    lemmatized_list = []
    for token in tokenized_doc:
        stemmed_token = stemmer.stem(token)
        lemmatized_token = lemmatizer.lemmatize(stemmed_token)
        lemmatized_list.append(lemmatized_token)
        words_set.add(lemmatized_token)

    data.at[index, 'preprocessed'] = ' '.join(map(str, lemmatized_list))

data.to_csv('./preprocess.csv')

Lesson 0 from 2925 preprocessed
Lesson 1 from 2925 preprocessed
Lesson 2 from 2925 preprocessed
Lesson 3 from 2925 preprocessed
Lesson 4 from 2925 preprocessed
Lesson 5 from 2925 preprocessed
Lesson 6 from 2925 preprocessed
Lesson 7 from 2925 preprocessed
Lesson 8 from 2925 preprocessed
Lesson 9 from 2925 preprocessed
Lesson 10 from 2925 preprocessed
Lesson 11 from 2925 preprocessed
Lesson 12 from 2925 preprocessed
Lesson 13 from 2925 preprocessed
Lesson 14 from 2925 preprocessed
Lesson 15 from 2925 preprocessed
Lesson 16 from 2925 preprocessed
Lesson 17 from 2925 preprocessed
Lesson 18 from 2925 preprocessed
Lesson 19 from 2925 preprocessed
Lesson 20 from 2925 preprocessed
Lesson 21 from 2925 preprocessed
Lesson 22 from 2925 preprocessed
Lesson 23 from 2925 preprocessed
Lesson 24 from 2925 preprocessed
Lesson 25 from 2925 preprocessed
Lesson 26 from 2925 preprocessed
Lesson 27 from 2925 preprocessed
Lesson 28 from 2925 preprocessed
Lesson 29 from 2925 preprocessed
Lesson 30 from 2925 

### keyword extraction

In [57]:
kw_model = KeyBERT()
for index, row in data.iterrows():
    print('Lesson {} from {} keywords extracted'.format(index, total_rows))
    keywords = kw_model.extract_keywords(data.at[index, 'preprocessed'])
    data.at[index, 'keywords'] = keywords
data.to_csv('./with_keywords.csv')

Lesson 0 from 2925 keywords extracted
Lesson 1 from 2925 keywords extracted
Lesson 2 from 2925 keywords extracted
Lesson 3 from 2925 keywords extracted
Lesson 4 from 2925 keywords extracted
Lesson 5 from 2925 keywords extracted
Lesson 6 from 2925 keywords extracted
Lesson 7 from 2925 keywords extracted
Lesson 8 from 2925 keywords extracted
Lesson 9 from 2925 keywords extracted
Lesson 10 from 2925 keywords extracted
Lesson 11 from 2925 keywords extracted
Lesson 12 from 2925 keywords extracted
Lesson 13 from 2925 keywords extracted
Lesson 14 from 2925 keywords extracted
Lesson 15 from 2925 keywords extracted
Lesson 16 from 2925 keywords extracted
Lesson 17 from 2925 keywords extracted
Lesson 18 from 2925 keywords extracted
Lesson 19 from 2925 keywords extracted
Lesson 20 from 2925 keywords extracted
Lesson 21 from 2925 keywords extracted
Lesson 22 from 2925 keywords extracted
Lesson 23 from 2925 keywords extracted
Lesson 24 from 2925 keywords extracted
Lesson 25 from 2925 keywords extrac

### create keywords dataframe

In [61]:
df = list()
for index, row in data.iterrows():
    keywords = row['keywords']
    if keywords is str:
        keywords = eval(keywords)
        keywords = [k[0] for k in keywords]
    df.append(keywords)
    
te = TransactionEncoder()
te_ary = te.fit(df).transform(df)
df = pd.DataFrame(te_ary, columns=te.columns_)
df.to_csv('./keywords.csv', index=False)

### find frequent patterns

In [63]:
x = apriori(df, min_support=0.01, use_colnames=True)
y = association_rules(x, min_threshold=0.1)

print(y)
y.to_csv('./rules.csv')

                                          antecedents  \
0                                  ((current, 0.344))   
1                                   ((inform, 0.565))   
2                                     ((nan, 0.3874))   
3                                  ((current, 0.344))   
4                                 ((unavail, 0.5579))   
5                                  ((current, 0.344))   
6                                     ((nan, 0.3874))   
7                                   ((inform, 0.565))   
8                                 ((unavail, 0.5579))   
9                                   ((inform, 0.565))   
10                                    ((nan, 0.3874))   
11                                ((unavail, 0.5579))   
12                  ((nan, 0.3874), (current, 0.344))   
13                   ((nan, 0.3874), (inform, 0.565))   
14                ((current, 0.344), (inform, 0.565))   
15                                    ((nan, 0.3874))   
16                             