# Importing Libraries

In [32]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords, cmudict
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv
/kaggle/input/wordembeddings/GoogleNews-vectors-negative300.bin


# Data Loading

In [3]:
dataset_train_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
dataset_test_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'

In [6]:
df_train = pd.read_csv(dataset_train_path)
df_test = pd.read_csv(dataset_test_path)

# Preprocessing

In [10]:
def remove_urls(text):

    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def data_preprocessing(df):

    for index, row in df.iterrows():
        text_value = row['full_text']

        
        text_value = text_value.lower()
        text_value = remove_urls(text_value)
        text_value = re.sub(r'[^\w\s]', '', text_value)

        stop_words = set(stopwords.words('english'))
        text_value = ' '.join([word for word in text_value.split() if word not in stop_words])

#         lemmatizer = WordNetLemmatizer()
#         text_value = ' '.join([lemmatizer.lemmatize(word) for word in text_value.split()])

        df.at[index, 'full_text_preprocessed'] = text_value

    return df

In [11]:
df_train = data_preprocessing(df_train)
df_test = data_preprocessing(df_test)

# Latent Direchlet Allocation

In [12]:
# Tokenize the preprocessed text data
df_train['tokens'] = df_train['full_text_preprocessed'].apply(word_tokenize)
df_test['tokens'] = df_test['full_text_preprocessed'].apply(word_tokenize)

In [13]:
# Create a dictionary from the tokenized text data
dictionary_train = corpora.Dictionary(df_train['tokens'])
dictionary_test = corpora.Dictionary(df_test['tokens'])

# Create a corpus for the LDA model
corpus_train = [dictionary_train.doc2bow(tokens) for tokens in df_train['tokens']]
corpus_test = [dictionary_test.doc2bow(tokens) for tokens in df_test['tokens']]

In [14]:
# Train the LDA model
lda_model_train = LdaModel(corpus=corpus_train, id2word=dictionary_train, num_topics=10, passes=10, random_state=42)
lda_model_test = LdaModel(corpus=corpus_test, id2word=dictionary_test, num_topics=10, passes=10, random_state=42)

In [16]:
# Visualize the topics using pyLDAvis for the training data
pyLDAvis.enable_notebook()
vis_train = gensimvis.prepare(lda_model_train, corpus_train, dictionary_train)
pyLDAvis.display(vis_train)

In [17]:
# Get the topic distribution for each document in the training set
topic_distribution_train = [lda_model_train.get_document_topics(doc) for doc in corpus_train]

# Get the topic distribution for each document in the test set
topic_distribution_test = [lda_model_test.get_document_topics(doc) for doc in corpus_test]

# Function to convert topic distribution to percentages
def topic_distribution_to_percentages(topic_distribution):
    percentages = []
    for doc_topics in topic_distribution:
        topic_percentages = [0] * lda_model_train.num_topics
        for topic, percentage in doc_topics:
            topic_percentages[topic] = percentage
        percentages.append(topic_percentages)
    return percentages

# Convert topic distributions to percentages
train_topic_percentages = topic_distribution_to_percentages(topic_distribution_train)
test_topic_percentages = topic_distribution_to_percentages(topic_distribution_test)

# Add the topic percentages to the DataFrame
for i in range(lda_model_train.num_topics):
    df_train[f'topic_{i+1}_percentage'] = [topic_percentages[i] for topic_percentages in train_topic_percentages]
    df_test[f'topic_{i+1}_percentage'] = [topic_percentages[i] for topic_percentages in test_topic_percentages]


In [18]:
df_train.head()

Unnamed: 0,essay_id,full_text,score,full_text_preprocessed,tokens,topic_1_percentage,topic_2_percentage,topic_3_percentage,topic_4_percentage,topic_5_percentage,topic_6_percentage,topic_7_percentage,topic_8_percentage,topic_9_percentage,topic_10_percentage
0,000d118,Many people have car where they live. The thin...,3,many people car live thing dont know use car a...,"[many, people, car, live, thing, dont, know, u...",0.0,0.0,0.0,0.0,0.0,0.185021,0.811567,0.0,0.0,0.0
1,000fe60,I am a scientist at NASA that is discussing th...,3,scientist nasa discussing face mars explaining...,"[scientist, nasa, discussing, face, mars, expl...",0.0,0.667994,0.0,0.0,0.0,0.325985,0.0,0.0,0.0,0.0
2,001ab80,People always wish they had the same technolog...,4,people always wish technology seen movies best...,"[people, always, wish, technology, seen, movie...",0.184914,0.0,0.0,0.0,0.0,0.136212,0.043238,0.633417,0.0,0.0
3,001bdc0,"We all heard about Venus, the planet without a...",4,heard venus planet without almost oxygen earth...,"[heard, venus, planet, without, almost, oxygen...",0.026821,0.0,0.02149,0.0,0.0,0.047123,0.0,0.0,0.0,0.902131
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,dear state senator letter argue favor keeping ...,"[dear, state, senator, letter, argue, favor, k...",0.0,0.0,0.0,0.0,0.994997,0.0,0.0,0.0,0.0,0.0


In [19]:
# Get the top words for each topic
top_words_per_topic = []
for topic_id in range(lda_model_train.num_topics):
    top_words = [word for word, _ in lda_model_train.show_topic(topic_id)]
    top_words_per_topic.append(top_words)

# Print the top words for each topic
for i, top_words in enumerate(top_words_per_topic):
    print(f"Topic {i+1}: {', '.join(top_words)}")

Topic 1: many, may, however, technology, future, new, society, although, become, due
Topic 2: face, mars, landform, aliens, natural, nasa, picture, alien, like, made
Topic 3: students, technology, could, emotions, facial, help, computer, student, system, would
Topic 4: computor, awsome, app, becaues, thta, unwanted, throws, happing, chang, instruction
Topic 5: electoral, vote, college, president, states, electors, votes, people, popular, state
Topic 6: people, would, think, like, get, dont, know, could, want, going
Topic 7: car, cars, people, usage, driving, pollution, air, smog, less, day
Topic 8: cars, car, driverless, would, driver, driving, drive, could, people, also
Topic 9: seagoing, animals, luke, program, cowboys, people, help, also, cowboy, get
Topic 10: venus, planet, author, earth, would, also, surface, could, humans, dangers


# Word Embeddings for Similarity Calculation

In [28]:
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    # Tokenize
    tokens = word_tokenize(text)
    # Convert to lowercase
    tokens = [word.lower() for word in tokens]
    # Remove punctuation
    tokens = [word for word in tokens if word.isalpha()]
    # Remove stopwords
    tokens = [word for word in tokens if not word in stop_words]
    return tokens

In [22]:
word_vectors = KeyedVectors.load_word2vec_format('../input/wordembeddings/GoogleNews-vectors-negative300.bin', binary=True)

In [23]:
def text_to_embeddings(text):
    tokens = preprocess_text(text)
    embeddings = []
    for token in tokens:
        if token in word_vectors:
            embeddings.append(word_vectors[token])
    if not embeddings:
        # If no embeddings found, return zeros
        return np.zeros(300)
    else:
        # Average the embeddings for all tokens
        return np.mean(embeddings, axis=0)

In [29]:
df_train['google_word2vec'] = df_train['full_text_preprocessed'].apply(text_to_embeddings)

In [30]:
df_test['google_word2vec'] = df_test['full_text_preprocessed'].apply(text_to_embeddings)

# Building Similarity Features

In [39]:

def calculate_similarity(doc_embeddings, other_doc_embeddings):
    # Calculate the cosine similarity between the two document embeddings
    similarity = cosine_similarity([doc_embeddings], [other_doc_embeddings])[0][0]
    return similarity

def calculate_weighted_similarity(doc_topic_scores, other_doc_topic_scores):
    # Calculate the weighted similarity based on topic scores
    weighted_similarity = sum(topic1 * topic2 for topic1, topic2 in zip(doc_topic_scores, other_doc_topic_scores))
    return weighted_similarity

def fill_similarity_features(df):
    # Initialize SIM_1 to SIM_6 columns with zeros
    for i in range(1, 7):
        df[f'SIM_{i}'] = 0.0
    
    finished = 0
    
    # Iterate over each document
    for idx, row in df.iterrows():
        doc_embeddings = row['google_word2vec']
        doc_topic_scores = row[['topic_1_percentage', 'topic_2_percentage', 'topic_3_percentage', 'topic_4_percentage', 'topic_5_percentage', 'topic_6_percentage', 'topic_7_percentage', 'topic_8_percentage', 'topic_9_percentage', 'topic_10_percentage']]


        # Initialize sums for each SIM feature
        sim_sums = [0.0] * 6

        # Iterate over all other documents 
        for other_idx, other_row in df.iterrows():
            if idx == other_idx:
                continue  # Skip if the same document

            other_doc_embeddings = other_row['google_word2vec']
            other_doc_topic_scores = other_row[['topic_1_percentage', 'topic_2_percentage', 'topic_3_percentage', 'topic_4_percentage', 'topic_5_percentage', 'topic_6_percentage', 'topic_7_percentage', 'topic_8_percentage', 'topic_9_percentage', 'topic_10_percentage']]
            score_of_doc = other_row['score'] - 1
            
            # Calculate cosine similarity between the documents
            similarity = calculate_similarity(doc_embeddings, other_doc_embeddings)

            # Calculate weighted similarity based on topic scores
            weighted_similarity = calculate_weighted_similarity(doc_topic_scores, other_doc_topic_scores)
            
            # Final Updation
            sim_sums[score_of_doc] += similarity * weighted_similarity
            

        # Fill the SIM_1 to SIM_6 columns for the current document
        for i in range(1, 7):
            df.at[idx, f'SIM_{i}'] = sim_sums[i-1]
            
        finished += 1
            
        print(finished)
        print("\n")

    return df

In [40]:
df_train = fill_similarity_features(df_train)

1


2


3


4


5


6


7


8




KeyboardInterrupt: 