In [None]:
# Uncomment the following lines the first time you run this notebook
# if the libraries are not installed yet.
# (%pip installs into the environment of the running kernel.)

#%pip install nltk
#%pip install gensim

In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score

In [6]:
# Path to the session-summary workbook, relative to this notebook
file_name = '../../source/Session-Summary-sample.xlsx'

# Read the sheet once; df_orig is kept exactly as loaded
df_orig = pd.read_excel(file_name)

# Working copy with RollNo upper-cased (as text) and Session_Summary lower-cased
df_work = df_orig.assign(
    RollNo=lambda frame: frame['RollNo'].astype(str).str.upper(),
    Session_Summary=lambda frame: frame['Session_Summary'].str.lower(),
)

In [None]:
# Add columns for the total number of characters and words in Session_Summary.
# Both counts go through the .str accessor so a missing summary yields NaN
# instead of raising (apply(len) fails when a cell holds a NaN float).
df_work['Total_Characters'] = df_work['Session_Summary'].str.len()
df_work['Total_Words'] = df_work['Session_Summary'].str.split().str.len()
df_work.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Timestamp         58 non-null     object 
 1   Username          58 non-null     object 
 2   RollNo            58 non-null     object 
 3   Session_Summary   58 non-null     object 
 4   Questions         0 non-null      float64
 5   Comments          2 non-null      object 
 6   Total_Characters  58 non-null     int64  
 7   Total_Words       58 non-null     int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 3.8+ KB


In [14]:
# (rows, columns) of the working frame
df_work.shape

(58, 8)

In [15]:
# Preview the first rows of the working frame
df_work.head()

Unnamed: 0,Timestamp,Username,RollNo,Session_Summary,Questions,Comments,Total_Characters,Total_Words
0,2025/01/22 10:57:42 AM GMT+5:30,u01,U01,the class started with a hands on demo. sir up...,,,1551,253
1,2025/01/22 10:58:35 AM GMT+5:30,u02,U02,learnt analysing data in excel and different n...,,,71,12
2,2025/01/22 11:07:54 AM GMT+5:30,u03,U03,"in todayâ€™s hands-on class, we worked with a ...",,,1034,159
3,2025/01/22 11:10:12 AM GMT+5:30,u04,U04,we first compared about the estimators being t...,,,1497,257
4,2025/01/22 11:25:40 AM GMT+5:30,u05,U05,when predicting with a given sample size using...,,,1484,254


In [16]:
# Fetch the NLTK resources used during preprocessing (stopword list and
# WordNet data for the lemmatizer); quiet=True suppresses the download log
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

True

In [17]:
def preprocess_text(text):
    """Clean one piece of text: strip non-letters, drop stopwords, lemmatize.

    Parameters
    ----------
    text : str
        Raw text. The stopword comparison is an exact, case-sensitive match
        against NLTK's English stopword list.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces. Anything
        that is not a string (e.g. NaN/None from an empty spreadsheet cell)
        yields an empty string instead of raising.

    Notes
    -----
    Requires the NLTK ``stopwords`` and ``wordnet`` resources to be available.
    """
    # Missing cells arrive from pandas as NaN/None; re.sub would raise on them.
    if not isinstance(text, str):
        return ''

    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Remove special characters and digits (deleted outright, not replaced
    # by a space, so "hands-on" becomes "handson")
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize on whitespace, remove stopwords, and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

In [19]:
# Apply preprocessing: store the cleaned version of each summary in a new PP column
df_work['PP'] = df_work['Session_Summary'].apply(preprocess_text)

In [20]:
# Preview again: the frame now carries the cleaned text in the PP column
df_work.head()

Unnamed: 0,Timestamp,Username,RollNo,Session_Summary,Questions,Comments,Total_Characters,Total_Words,PP
0,2025/01/22 10:57:42 AM GMT+5:30,u01,U01,the class started with a hands on demo. sir up...,,,1551,253,class started hand demo sir uploaded file mood...
1,2025/01/22 10:58:35 AM GMT+5:30,u02,U02,learnt analysing data in excel and different n...,,,71,12,learnt analysing data excel different new term...
2,2025/01/22 11:07:54 AM GMT+5:30,u03,U03,"in todayâ€™s hands-on class, we worked with a ...",,,1034,159,today handson class worked dataset perform reg...
3,2025/01/22 11:10:12 AM GMT+5:30,u04,U04,we first compared about the estimators being t...,,,1497,257,first compared estimator statistic population ...
4,2025/01/22 11:25:40 AM GMT+5:30,u05,U05,when predicting with a given sample size using...,,,1484,254,predicting given sample size using linear regr...


In [None]:
def count_vectorize(df, text_column='PP'):
    """Build bag-of-words count vectors for one text column of ``df``.

    Fits a default ``CountVectorizer`` on ``df[text_column]`` and stores each
    row's dense count vector in a new ``count_vector`` column (``df`` is
    modified in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the preprocessed text.
    text_column : str, default 'PP'
        Column to vectorize; the default keeps the previous behaviour.

    Returns
    -------
    vectors : numpy.ndarray
        Dense count matrix, one row per row of ``df``.
    vocabulary : dict
        The fitted vectorizer's ``vocabulary_`` mapping.
    """
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform(df[text_column]).toarray()
    # list(...) splits the matrix into one 1-D array per row so it fits in a column
    df['count_vector'] = list(vectors)
    vocabulary = vectorizer.vocabulary_  # Get the vocabulary

    return vectors, vocabulary

# Word2Vec vectorization function
def word2vec_vectorize(df, text_column='PP', vector_size=100, window=5,
                       min_count=1, workers=4):
    """Embed each document as the mean of its Word2Vec word vectors.

    Trains a Word2Vec model on the whitespace-tokenized ``df[text_column]``,
    averages the word vectors of every document, and stores the result in a
    new ``word2vec`` column (``df`` is modified in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the preprocessed text.
    text_column : str, default 'PP'
        Column containing space-separated tokens.
    vector_size, window, min_count, workers : int
        Forwarded unchanged to ``Word2Vec``; the defaults are the values that
        were previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Matrix with one averaged vector per row of ``df``.

    Notes
    -----
    NOTE(review): gensim documents that run-to-run reproducibility needs
    ``workers=1`` (and a fixed ``PYTHONHASHSEED``) -- confirm if stable
    embeddings matter downstream.
    """
    sentences = [text.split() for text in df[text_column]]
    model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window,
                     min_count=min_count, workers=workers)

    # Fallback for documents with no known words: same length and dtype as the
    # trained vectors, so it neither mismatches vector_size nor upcasts the
    # stacked matrix.
    empty_vector = np.zeros(model.wv.vector_size, dtype=model.wv.vectors.dtype)

    def vectorize(text):
        known = [model.wv[word] for word in text.split() if word in model.wv]
        if not known:
            return empty_vector.copy()
        return np.mean(known, axis=0)

    df['word2vec'] = df[text_column].apply(vectorize)
    return np.array(df['word2vec'].tolist())

In [22]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vectorize(df, text_column='PP', max_features=1000):
    """Compute dense TF-IDF vectors for ``df[text_column]``.

    A ``TfidfVectorizer`` created with ``max_features`` is fitted on the
    column. Each row's vector is also written to a new ``tfidf_vector``
    column of ``df`` (in place), and the dense matrix is returned.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    sparse_matrix = vectorizer.fit_transform(df[text_column])
    dense_matrix = sparse_matrix.toarray()
    # One 1-D array per row so the vectors can live in a single column
    df['tfidf_vector'] = [row for row in dense_matrix]
    return dense_matrix

In [23]:
# Generate vectorizations.
# Each helper also attaches its per-row vectors to df_work as a new column
# (count_vector, word2vec and tfidf_vector respectively).
count_vectors, vocab = count_vectorize(df_work)
word2vec_vectors = word2vec_vectorize(df_work)
tfidf_vectors = tfidf_vectorize(df_work)

In [24]:
# Preview: the count_vector, word2vec and tfidf_vector columns have been added
df_work.head()

Unnamed: 0,Timestamp,Username,RollNo,Session_Summary,Questions,Comments,Total_Characters,Total_Words,PP,count_vector,word2vec,tfidf_vector
0,2025/01/22 10:57:42 AM GMT+5:30,u01,U01,the class started with a hands on demo. sir up...,,,1551,253,class started hand demo sir uploaded file mood...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.009859608, 0.0382825, 0.00046141213, 0.001...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2025/01/22 10:58:35 AM GMT+5:30,u02,U02,learnt analysing data in excel and different n...,,,71,12,learnt analysing data excel different new term...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.008500115, 0.04282097, 0.003431929, 0.0037...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2025/01/22 11:07:54 AM GMT+5:30,u03,U03,"in todayâ€™s hands-on class, we worked with a ...",,,1034,159,today handson class worked dataset perform reg...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.009969514, 0.03970856, -0.000292617, 0.002...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2025/01/22 11:10:12 AM GMT+5:30,u04,U04,we first compared about the estimators being t...,,,1497,257,first compared estimator statistic population ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.009014022, 0.03320912, 0.0008952469, 0.002...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2025/01/22 11:25:40 AM GMT+5:30,u05,U05,when predicting with a given sample size using...,,,1484,254,predicting given sample size using linear regr...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.011406471, 0.043906376, 0.0012591495, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [25]:
vocabulary_list = vocab  # mapping returned by count_vectorize (word -> value)
# Save the vocabulary to a text file, one "word,value" pair per line
filename = "vocab.csv"
try:
    with open(filename, 'w', encoding='utf-8') as f:
        for word, index in vocabulary_list.items():
            f.write(f"{word},{index}\n")
    print(f"Vocabulary saved to {filename}")
except Exception as e:
    print(f"Error saving vocabulary: {e}")

Vocabulary saved to vocab.csv


### word2vec Playground!

In [None]:
import gensim.downloader as api

# List available models
info = api.info()
print(info['models'].keys())

# Load a specific model
# NOTE(review): api.load presumably downloads the model when it is not cached
# locally, and this one is likely large -- confirm before running on a slow link.
wv = api.load('word2vec-google-news-300')

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])


In [None]:
# Look up the model's vectors for the three words used in the analogy below
w1, w2, w3 = (wv[term] for term in ('bicycle', 'pedals', 'engine'))

#print(w1)

In [None]:
# Vector arithmetic on the embeddings: bicycle - pedals + engine
result = w1 - w2 + w3
#print(result)

In [None]:
# Find the 5 most similar words to the combined vector.
# NOTE(review): a raw vector is passed rather than word keys, so the input
# words themselves are presumably not filtered out of the results -- verify.
similar_words = wv.most_similar([result], topn=5)

# Print the results as "word: similarity", one per line
for word, similarity in similar_words:
    print(f"{word}: {similarity}")