# Importing the necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Loading the Module Outlines CSV file

In [None]:
# Read the module outlines into a DataFrame; the path is relative to the
# notebook's working directory.
module_outlines_csv = 'Module Outlines.csv'
df = pd.read_csv(module_outlines_csv)

In [None]:
# Preview the first five rows to check that the columns were parsed as expected
df.head(5)

Unnamed: 0,Year,Semester,Code,Module Title,Module Description
0,1.0,1.0,IT1030,MATHEMATICS FOR COMPUTING,"Logic Control, Number Systems, Differentiation..."
1,1.0,1.0,IT1010,Introduction to Programming,Introducion to fundamental programming concept...
2,1.0,1.0,IT1040,Communication Skills,"Speech test, spot tests and a midterm examinat..."
3,1.0,1.0,IT1020,Introduction to Computer Systems,Essentials of computer systems and computer ne...
4,1.0,2.0,IT1080,English for Academic Purposes,Necessary English language skills and all the ...


# Define the desired n-gram range

In [None]:
# Desired n-gram length, counted in words.
# Edit this value to work with a different n-gram length.
n = 3

# Downloading the required NLTK data packages

In [None]:
# Fetch the NLTK data packages by name: 'punkt', 'stopwords' and 'wordnet'.
# `nltk` is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.
#
# nltk.download() returns True on success. A list (rather than a generator)
# is used so that every package is attempted even if an earlier one fails;
# the cell then displays True only when all downloads succeeded.
all([nltk.download(package) for package in ('punkt', 'stopwords', 'wordnet')])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Extract a specific column by column name

In [None]:
# Pull the free-text module descriptions out of the DataFrame as a Series
module_desc_column = df.loc[:, 'Module Description']

# Show the Series; pandas abbreviates the middle rows of long output
print(module_desc_column)

0      Logic Control, Number Systems, Differentiation...
1      Introducion to fundamental programming concept...
2      Speech test, spot tests and a midterm examinat...
3      Essentials of computer systems and computer ne...
4      Necessary English language skills and all the ...
                             ...                        
151    This module exposes future IT professionals to...
152    The objective of the module is to provide an o...
153    This module aims to develop skills of designin...
154    This course will cover the necessary basics of...
155    This module helps students to develop an under...
Name: Module Description, Length: 156, dtype: object


# Preprocess the module description column

In [None]:
# Turn every module description into a list of cleaned, lowercase tokens.
#
# Entries that are not strings (e.g. a missing description) are skipped, so
# words_per_row can contain fewer rows than the DataFrame.

# The stop-word set is identical for every row, so build it once up front
stop_words = set(stopwords.words("english"))

# Create a list to store the words in each row
words_per_row = []

# Iterate over the rows and split the module description into words
for desc in module_desc_column:
    if not isinstance(desc, str):
        continue

    words = []
    for raw_word in desc.lower().split():  # Lowercase, whitespace-separated tokens
        # Remove unwanted characters: keep only ASCII letters and digits
        word = re.sub(r"[^a-zA-Z0-9]", "", raw_word)

        # Drop tokens that are empty once cleaned
        if not word:
            continue

        # Test both forms against the stop-word list: "and," only matches
        # after the punctuation is stripped, whereas a contraction such as
        # "don't" only matches before its apostrophe is removed.
        if raw_word in stop_words or word in stop_words:
            continue

        words.append(word)

    words_per_row.append(words)  # Add the words to the list

# Print the lists of words per row
for words in words_per_row:
    print(words)


['logic', 'control', 'number', 'systems', 'differentiation', 'integration', 'functions', 'counting', 'graph', 'theory', 'matrices', 'finitestate', 'machines']
['introducion', 'fundamental', 'programming', 'concepts', 'specifically', 'procedural', 'programming', 'paradigm', 'topics', 'include', 'data', 'types', 'control', 'structures', 'functions', 'pointers', 'arrays', 'files', 'recursion', 'mechanics', 'testing', 'debugging', 'students', 'also', 'get', 'hands', 'experience', 'develop', 'applications', 'using', 'c', 'language', 'linux', 'operation', 'system']
['speech', 'test', 'spot', 'tests', 'midterm', 'examination', 'final', 'examination', 'comprehensive', 'exam', 'based', 'topics', 'covered', 'semester']
['essentials', 'computer', 'systems', 'computer', 'networks', 'order', 'prepare', 'students', 'advanced', 'courses', 'covers', 'fundamentals', 'computer', 'organization', 'combinational', 'sequential', 'logic', 'circuits', 'data', 'communication', 'computer', 'networks', 'end', 'c

In [None]:
# One lemmatizer instance is shared by all rows
lemmatizer = WordNetLemmatizer()

# Lemmatize token by token, keeping the row structure of words_per_row.
# lemmatize() is called without a part-of-speech argument, so NLTK's default
# part of speech is used for every token.
lemmatized_words_per_row = [
    [lemmatizer.lemmatize(token) for token in row_tokens]
    for row_tokens in words_per_row
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams

# Getting the top words and the keywords using n-grams

In [None]:
def top_features(row, names, top_k=10):
    """Return the highest-weighted feature names of one TF-IDF row.

    Args:
        row: A single row of a TF-IDF matrix; must provide ``toarray()``
            returning a 1 x n_features array.
        names: Feature names, indexed like the matrix columns.
        top_k: Maximum number of names to return.

    Returns:
        Up to ``top_k`` names ordered from highest to lowest weight. Features
        whose weight is zero are left out, so a row with few terms yields a
        shorter list instead of being padded with terms it does not contain.
    """
    weights = row.toarray()[0]
    ranked = weights.argsort()[::-1][:top_k]
    return [names[idx] for idx in ranked if weights[idx] > 0]


# Join the lemmatized words per row into sentences
sentences = [' '.join(words) for words in lemmatized_words_per_row]

# --- Single words -----------------------------------------------------------

# Create a TF-IDF vectorizer object
vectorizer = TfidfVectorizer()

# Fit and transform the sentences using TF-IDF vectorizer
tfidf_matrix = vectorizer.fit_transform(sentences)

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Print the most important words for each row
for i, row in enumerate(tfidf_matrix):
    top_words = top_features(row, feature_names)  # Pass top_k=... for a different count
    print(f"Row {i+1} - Top Words: {top_words}")

# --- N-grams ----------------------------------------------------------------

# Extract n-grams from the sentences. The upper bound comes from `n`, set in
# the configuration cell; with n = 3 this is the range (2, 3). The lower bound
# stays at 2 unless `n` itself is smaller.
ngram_vectorizer = TfidfVectorizer(ngram_range=(min(2, n), n))
ngram_matrix = ngram_vectorizer.fit_transform(sentences)

# Get the n-gram feature names
ngram_feature_names = ngram_vectorizer.get_feature_names_out()

# Print the most important n-grams for each row
for i, row in enumerate(ngram_matrix):
    top_ngrams = top_features(row, ngram_feature_names)  # Pass top_k=... for a different count
    print(f"Row {i+1} - Top N-grams: {top_ngrams}")


Row 1 - Top Words: ['matrix', 'counting', 'differentiation', 'finitestate', 'number', 'graph', 'machine', 'function', 'logic', 'integration']
Row 2 - Top Words: ['programming', 'mechanic', 'pointer', 'linux', 'procedural', 'debugging', 'introducion', 'array', 'specifically', 'get']
Row 3 - Top Words: ['examination', 'test', 'midterm', 'speech', 'spot', 'exam', 'semester', 'final', 'comprehensive', 'covered']
Row 4 - Top Words: ['computer', 'communication', 'network', 'course', 'organized', 'sequential', 'circuit', 'combinational', 'layered', 'end']
Row 5 - Top Words: ['university', 'require', 'english', 'pursue', 'academic', 'necessary', 'language', 'study', 'skill', 'practice']
Row 6 - Top Words: ['oriented', 'object', 'solution', 'given', 'class', 'design', 'identifying', 'relationship', 'implement', 'language']
Row 7 - Top Words: ['software', 'engineering', 'verification', 'artifact', 'produce', 'validation', 'appreciate', 'softwareintensive', 'specification', 'maintain']
Row 8 - To

# Saving the keywords in a csv file

In [None]:
import csv

# Write the top n-grams of every module to 'module Keywords.csv'.
#
# The preprocessing step only keeps descriptions that are strings, so
# ngram_matrix has one row per module *with* a text description. Select the
# titles with the same rule; indexing the full title column by matrix row
# number would attach keywords to the wrong module after any skipped row.
has_description = df['Module Description'].map(lambda desc: isinstance(desc, str))
module_titles = df.loc[has_description, 'Module Title']

if len(module_titles) != ngram_matrix.shape[0]:
    raise ValueError(
        f"{len(module_titles)} module titles but {ngram_matrix.shape[0]} "
        "n-gram rows; re-run the preprocessing and TF-IDF cells."
    )

# newline='' is what the csv module expects for files it writes; UTF-8 keeps
# non-ASCII characters in titles intact regardless of the platform default.
with open('module Keywords.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Module Title', 'Top N-grams'])  # Write header row

    for title, row in zip(module_titles, ngram_matrix):
        row_data = row.toarray()[0]
        sorted_indices = row_data.argsort()[::-1][:10]  # Change 10 to desired number of top n-grams
        # Skip zero-weight entries: they are n-grams that do not occur in this module
        top_ngrams = [ngram_feature_names[idx] for idx in sorted_indices if row_data[idx] > 0]
        writer.writerow([title, ', '.join(top_ngrams)])  # Write module title and top n-grams

print("Top n-grams saved to Module Keywords file.")



Top n-grams saved to Module Keywords file.
