In [43]:
import pandas as pd
import nltk
from keybert import KeyBERT
import string
from nltk.corpus import stopwords
import numpy as np
import matplotlib as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori,association_rules

In [5]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Load data

In [38]:
data = pd.read_csv('./UoY.csv')

### Department statistics

In [39]:
department_list = data['Department'].unique()
department_avg = {}
for department in department_list:
    department_avg[department] = len(data[data['Department'] == department])
department_avg

{'Archaeology': 129,
 'Biology': 62,
 'Centre for 18th Century Studies': 14,
 'Centre for Lifelong Learning': 63,
 'Centre for Medieval Studies': 30,
 "Centre for Women's Studies": 11,
 'Chemistry': 61,
 'Computer Science': 97,
 'Economics and Related Studies': 117,
 'Education': 107,
 'Electronic Engineering': 122,
 'English and Related Literature': 182,
 'Environment and Geography': 88,
 'Health Sciences': 128,
 'History': 129,
 'History of Art': 73,
 'Language and Linguistic Science': 270,
 'Mathematics': 137,
 'Music': 140,
 'Natural Sciences': 5,
 'Philosophy': 123,
 'Philosophy, Politics and Economics': 11,
 'Physics': 84,
 'Politics': 121,
 'Psychology': 89,
 'Research Centre for Social Sciences': 6,
 'School of Social and Political Sciences': 7,
 'Social Policy and Social Work': 105,
 'Sociology': 65,
 'The York Law School': 93,
 'The York Management School': 140,
 'Theatre, Film, Television and Interactive Media': 116}

### combine columns

In [40]:
import re

data['combined'] = data['Outcome'].astype(str) + ' ' + data['Objective'].astype(str) + ' ' + data['Description'].astype(str)

for i, row in data.iterrows(): # add prequisites and course title
    data.at[i,'combined'] += ' ' + row['Course title'].rpartition('-')[0]
    pre = row['Prerequisite']
    if (type(pre) is str):
        data.at[i,'combined'] += ' ' + re.sub(r'\([\w]{3}[0-9]{5}[\w]\)', ' ', pre)
        
data = data.replace('\n',' ', regex=True)
data = data.replace('\r',' ', regex=True)
data = data.replace('  ',' ', regex=True)

Upon completion of this module students should be able to:
Specify an appropriate level of building recording and identify appropriate recording methods
Execute an intermediate level of measured and image-based buildings survey
Produce an intermediate drawn, photographic, and digital record of a building
Produce a drawn record to professional standards To explore how to select an appropriate level and method of survey for an historic building project
To provide practical training in buildings survey by hand, photography, and instrument
To train students in the production of a survey report to professional standards nan Applied Buildings Recording  Building Recording  
By the end of this module, students should be able to
Be aware how biomolecular methods can be applied to archaeological deposits
Assess the scope and limitations of bimolecular methods under a range of conditions
Match analytical methods and results to research aims and objectives
Allocate and co-ordinate tasks, and comm

In [41]:
# data = data[data['combined'].notna()]
len(data[data['combined'].isna()])

0

### Preprocess

In [45]:
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.PorterStemmer()
words_set = set()
data['preprocessed'] = ""
total_rows = data.shape[0]

for index, row in data.iterrows():
    combined = row['combined']
    combined = combined.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    tokenized_doc = nltk.word_tokenize(combined)
    tokenized_doc = [word for word in tokenized_doc if word not in stopwords.words('english')]
    lemmatized_list = []
    for token in tokenized_doc:
        stemmed_token = stemmer.stem(token)
        lemmatized_token = lemmatizer.lemmatize(stemmed_token)
        lemmatized_list.append(lemmatized_token)
        words_set.add(lemmatized_token)

    data.at[index, 'preprocessed'] = ' '.join(map(str, lemmatized_list))
    print('Lesson {} from {} preprocessed'.format(index, total_rows))

data.to_csv('./preprocess.csv')

Lesson 0 from 2925 preprocessed
Lesson 1 from 2925 preprocessed
Lesson 2 from 2925 preprocessed
Lesson 3 from 2925 preprocessed
Lesson 4 from 2925 preprocessed
Lesson 5 from 2925 preprocessed
Lesson 6 from 2925 preprocessed
Lesson 7 from 2925 preprocessed
Lesson 8 from 2925 preprocessed
Lesson 9 from 2925 preprocessed
Lesson 10 from 2925 preprocessed
Lesson 11 from 2925 preprocessed
Lesson 12 from 2925 preprocessed
Lesson 13 from 2925 preprocessed
Lesson 14 from 2925 preprocessed
Lesson 15 from 2925 preprocessed
Lesson 16 from 2925 preprocessed
Lesson 17 from 2925 preprocessed
Lesson 18 from 2925 preprocessed
Lesson 19 from 2925 preprocessed
Lesson 20 from 2925 preprocessed
Lesson 21 from 2925 preprocessed
Lesson 22 from 2925 preprocessed
Lesson 23 from 2925 preprocessed
Lesson 24 from 2925 preprocessed
Lesson 25 from 2925 preprocessed
Lesson 26 from 2925 preprocessed
Lesson 27 from 2925 preprocessed
Lesson 28 from 2925 preprocessed
Lesson 29 from 2925 preprocessed
Lesson 30 from 2925 

### keyword extraction

In [10]:
kw_model = KeyBERT()

In [13]:
data = pd.read_csv('./preprocess.csv')
data['keywords'] = ""
total_rows = data.shape[0]
for index, row in data.iterrows():
    keywords = kw_model.extract_keywords(data.at[index, 'preprocessed'], top_n=10, min_df=2)
    print('Lesson {} from {} keywords extracted: {}'.format(index, total_rows, len(keywords)))
    data.at[index, 'keywords'] = keywords
data.to_csv('./with_keywords.csv')

Lesson 0 from 2925 keywords extracted: 10
Lesson 1 from 2925 keywords extracted: 10
Lesson 2 from 2925 keywords extracted: 10
Lesson 3 from 2925 keywords extracted: 10
Lesson 4 from 2925 keywords extracted: 10
Lesson 5 from 2925 keywords extracted: 10
Lesson 6 from 2925 keywords extracted: 10
Lesson 7 from 2925 keywords extracted: 10
Lesson 8 from 2925 keywords extracted: 10
Lesson 9 from 2925 keywords extracted: 10
Lesson 10 from 2925 keywords extracted: 4
Lesson 11 from 2925 keywords extracted: 10
Lesson 12 from 2925 keywords extracted: 10
Lesson 13 from 2925 keywords extracted: 10
Lesson 14 from 2925 keywords extracted: 10
Lesson 15 from 2925 keywords extracted: 10
Lesson 16 from 2925 keywords extracted: 10
Lesson 17 from 2925 keywords extracted: 10
Lesson 18 from 2925 keywords extracted: 10
Lesson 19 from 2925 keywords extracted: 10
Lesson 20 from 2925 keywords extracted: 10
Lesson 21 from 2925 keywords extracted: 10
Lesson 22 from 2925 keywords extracted: 10
Lesson 23 from 2925 ke

### create keywords dataframe

In [61]:
df = list()
for index, row in data.iterrows():
    keywords = row['keywords']
    if keywords is str:
        keywords = eval(keywords)
        keywords = [k[0] for k in keywords]
    df.append(keywords)
    
te = TransactionEncoder()
te_ary = te.fit(df).transform(df)
df = pd.DataFrame(te_ary, columns=te.columns_)
df.to_csv('./keywords.csv', index=False)

In [3]:
df = pd.read_csv('./keywords.csv')

### find frequent patterns

In [7]:
x = apriori(df, min_support=0.0001, use_colnames=True)
x.to_csv('./frequent_patterns.csv')
x.shape

MemoryError: Unable to allocate 390. GiB for an array with shape (71598561, 2, 2925) and data type bool

In [63]:
y = association_rules(x, min_threshold=0.1)
print(y)
y.to_csv('./rules.csv')

                                          antecedents  \
0                                  ((current, 0.344))   
1                                   ((inform, 0.565))   
2                                     ((nan, 0.3874))   
3                                  ((current, 0.344))   
4                                 ((unavail, 0.5579))   
5                                  ((current, 0.344))   
6                                     ((nan, 0.3874))   
7                                   ((inform, 0.565))   
8                                 ((unavail, 0.5579))   
9                                   ((inform, 0.565))   
10                                    ((nan, 0.3874))   
11                                ((unavail, 0.5579))   
12                  ((nan, 0.3874), (current, 0.344))   
13                   ((nan, 0.3874), (inform, 0.565))   
14                ((current, 0.344), (inform, 0.565))   
15                                    ((nan, 0.3874))   
16                             