In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import warnings
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Phase 1: Preprocessing

In [2]:
# Load the raw training and validation splits from the working directory.
train_data, val_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "valid.csv")
)

In [3]:
# Drop, in place, the columns that the text pipeline below never reads.
unused_columns = ['Id', 'CreationDate', 'Y']
for frame in (train_data, val_data):
    frame.drop(columns=unused_columns, inplace=True)

In [4]:
# Preview the first rows of the training frame after the column drop.
train_data.head()

Unnamed: 0,Title,Body,Tags
0,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>
1,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>
2,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...
3,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...
4,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...


Remove HTML Tags and Noise

In [5]:
warnings.filterwarnings("ignore")

def clean_text(text, is_tags_column=False):
    """Reduce a raw text field to lowercase ASCII letters separated by single spaces.

    Every character that is not an ASCII letter (digits, punctuation, markup
    symbols) is replaced by a space, so a value such as ``c++`` ends up as ``c``.
    Leading/trailing spaces are NOT stripped.

    Parameters
    ----------
    text : str or missing value
        Raw cell content. ``None``/NaN yield an empty string instead of
        raising; any other non-string value is converted with ``str()`` first.
    is_tags_column : bool, default False
        When True the value is treated as a ``<tag1><tag2>`` style string and
        is not run through the HTML parser: its angle brackets are removed by
        the letter-only regex, which leaves no markup to parse.

    Returns
    -------
    str
        The normalised, lower-cased text.
    """
    if not isinstance(text, str):
        # pandas represents empty CSV cells as NaN (a float); treat them as empty text.
        if text is None or pd.isna(text):
            return ""
        text = str(text)

    if not is_tags_column:
        # Strip HTML markup first so element names and attributes never reach the regexes.
        text = BeautifulSoup(text, "html.parser").get_text()

    text = re.sub(r'[^a-zA-Z]', ' ', text)  # keep ASCII letters only
    text = re.sub(r'\s+', ' ', text)        # collapse runs of whitespace
    return text.lower()

# Clean the free-text columns with the HTML-aware path and the tag column with
# the tag-specific path, for both splits.
for frame in (train_data, val_data):
    for column in ('Title', 'Body'):
        frame[column] = frame[column].apply(clean_text)
    frame['Tags'] = frame['Tags'].apply(clean_text, is_tags_column=True)


In [6]:
# Preview the training frame after HTML/noise removal.
train_data.head()

Unnamed: 0,Title,Body,Tags
0,java repeat task every random seconds,i m already familiar with repeating tasks ever...,java repeat
1,why are java optionals immutable,i d like to understand why java optionals we...,java optional
2,text overlay image with darkened opacity react...,i am attempting to overlay a title over an ima...,javascript image overlay react native opa...
3,why ternary operator in swift is so picky,the question is very simple but i just could ...,swift operators whitespace ternary operato...
4,hide show fab with scale animation,i m using custom floatingactionmenu i need to...,android material design floating action but...


Stopword Removal

In [7]:
# Fetch the NLTK stopword corpus, then keep the English list as a set for
# constant-time membership checks.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated word found in ``stop_words`` removed.

    The surviving words are re-joined with single spaces, so leading, trailing
    and repeated whitespace is normalised as a side effect.
    """
    kept_words = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept_words)

# Remove English stopwords from all three text columns of both splits.
# NOTE(review): this also filters 'Tags', so a tag word that is itself a
# stopword is dropped; confirm that is intended.
for frame in (train_data, val_data):
    for column in ('Title', 'Body', 'Tags'):
        frame[column] = frame[column].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
# Preview the training frame after stopword removal.
train_data.head()

Unnamed: 0,Title,Body,Tags
0,java repeat task every random seconds,already familiar repeating tasks every n secon...,java repeat
1,java optionals immutable,like understand java optionals designed immuta...,java optional
2,text overlay image darkened opacity react native,attempting overlay title image image darkened ...,javascript image overlay react native opacity
3,ternary operator swift picky,question simple could find answer return x hel...,swift operators whitespace ternary operator op...
4,hide show fab scale animation,using custom floatingactionmenu need implement...,android material design floating action button


Stemming

In [None]:
from nltk.stem import PorterStemmer
# Single Porter stemmer instance shared by stem_text below.
stemmer = PorterStemmer()

def stem_text(text):
    """Stem each whitespace-separated word of ``text`` and re-join them with single spaces."""
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)

# Stem the free-text columns of both splits; the 'Tags' column is left unstemmed.
for frame in (train_data, val_data):
    for column in ('Title', 'Body'):
        frame[column] = frame[column].apply(stem_text)

In [None]:
# Preview the training frame after stemming Title and Body.
train_data.head()

Unnamed: 0,Title,Body,Tags
0,java repeat task everi random second,alreadi familiar repeat task everi n second us...,java repeat
1,java option immut,like understand java option design immut threa...,java optional
2,text overlay imag darken opac react nativ,attempt overlay titl imag imag darken lower op...,javascript image overlay react native opacity
3,ternari oper swift picki,question simpl could find answer return x hell...,swift operators whitespace ternary operator op...
4,hide show fab scale anim,use custom floatingactionmenu need implement s...,android material design floating action button


In [10]:
# Output locations for the preprocessed splits.
save_path_train = 'train_cleaned.csv'
save_path_val = 'validation_cleaned.csv'

# Write each frame to its CSV file without the index column.
for frame, destination in ((train_data, save_path_train), (val_data, save_path_val)):
    frame.to_csv(destination, index=False)

# Phase 2: Word2Vec & Similarity Retrieval

In [98]:
# Reload the preprocessed splits written at the end of Phase 1.
train_data_cleaned, val_data_cleaned = (
    pd.read_csv(csv_name) for csv_name in ('train_cleaned.csv', 'validation_cleaned.csv')
)

In [12]:
import nltk
from nltk.tokenize import word_tokenize

def Tokenize(data):
    """Tokenize the text columns of ``data`` in place and return all token lists.

    The 'Body', 'Title' and 'Tags' columns are coerced to strings, and a
    '<column>_Tokens' column holding the ``word_tokenize`` output is added for
    each of them (in the order Body, Title, Tags).

    Missing values are replaced by an empty string before the string
    conversion. A bare ``astype(str)`` would turn NaN into the literal text
    'nan', which would then be emitted as a token.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Body', 'Title' and 'Tags' columns. It is modified in place.

    Returns
    -------
    list of list of str
        All Body token lists, followed by all Title token lists, followed by
        all Tags token lists.
    """
    # Fetch the tokenizer resources requested by the original implementation.
    nltk.download('punkt')
    nltk.download('punkt_tab')

    text_columns = ('Body', 'Title', 'Tags')

    # Empty cells come back from CSV as NaN; map them to '' rather than 'nan'.
    for column in text_columns:
        data[column] = data[column].fillna('').astype(str)

    for column in text_columns:
        data[f'{column}_Tokens'] = data[column].apply(word_tokenize)

    sentences = (
        data['Body_Tokens'].tolist()
        + data['Title_Tokens'].tolist()
        + data['Tags_Tokens'].tolist()
    )

    return sentences

In [101]:
# Tokenize the training frame (adds the *_Tokens columns in place) and collect
# every token list as the training corpus for the embedding model.
sentences = Tokenize(train_data_cleaned)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Training the Word2Vec Word Vectors

In [14]:
from gensim.models import Word2Vec
# Hyper-parameters for the embedding model, gathered in one place.
word2vec_params = dict(
    vector_size=200,  # dimensionality of each word vector
    window=10,        # context window size
    min_count=5,      # minimum word frequency to enter the vocabulary
    workers=4,        # number of worker threads
    sg=0,             # 0 selects the CBOW training algorithm
)
word2vec_model = Word2Vec(sentences=sentences, **word2vec_params)

In [None]:
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import random
import numpy as np
from plotly.offline import iplot
import plotly.io as pio

# Vocabulary learned by the model.
words = list(word2vec_model.wv.index_to_key)

# Draw the sample from a dedicated, seeded generator so the same words are
# plotted on every run (the t-SNE step below is already seeded) and the global
# `random` state is left untouched.
sample_words = random.Random(42).sample(words, min(100, len(words)))

# One embedding row per sampled word.
vectors = np.array([word2vec_model.wv[word] for word in sample_words])

# Reduce the embeddings to three dimensions for plotting.
tsne_model = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne_model.fit_transform(vectors)

# Create an interactive 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers+text',
    marker=dict(size=5),
    text=sample_words,
    textposition='top center',
    textfont=dict(size=12)
)])

# Update layout for interactivity
fig.update_layout(
    title='Word Embeddings in 3D',
    scene=dict(
        xaxis_title='X',
        yaxis_title='Y',
        zaxis_title='Z',
        aspectmode='cube'
    ),
    margin=dict(l=0, r=0, b=0, t=40),
    hovermode='closest',
)

# Render in the system browser; this changes the default renderer for later figures too.
pio.renderers.default = "browser"
iplot(fig)

In [18]:
from gensim.models import Word2Vec

# Query word whose nearest neighbours in the embedding space are listed.
word = 'button'

# Ten most similar vocabulary entries as (word, similarity) pairs.
most_similar_words = word2vec_model.wv.most_similar(word, topn=10)

for neighbour, score in most_similar_words:
    print(f"Word: {neighbour}, Similarity: {score}")


Word: clicked, Similarity: 0.7310144901275635
Word: buttons, Similarity: 0.7285553216934204
Word: clicking, Similarity: 0.7075397372245789
Word: pressed, Similarity: 0.6710002422332764
Word: pressing, Similarity: 0.6512231826782227
Word: click, Similarity: 0.6445305347442627
Word: btn, Similarity: 0.6256078481674194
Word: tapped, Similarity: 0.6061328053474426
Word: mybtn, Similarity: 0.6058887243270874
Word: clicks, Similarity: 0.5834038257598877


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def preprocess(text):
    """Lower-case ``text`` and delete every character that is neither a word character nor whitespace.

    NOTE(review): the 'Body' column this query is compared against was also
    stopword-filtered and stemmed in Phase 1; this helper does neither.
    Confirm the mismatch is intended.
    """
    import re
    return re.sub(r'[^\w\s]', '', text.lower())


# Free-text query to match against the question bodies.
query = "How to train NLP model"
processed_query = preprocess(query)

# Fit TF-IDF on every body plus the query, so the query is the last row of the matrix.
vectorizer = TfidfVectorizer()
corpus_with_query = train_data_cleaned['Body'].tolist()
corpus_with_query.append(processed_query)
vectors = vectorizer.fit_transform(corpus_with_query)

# Cosine similarity between the query row and each body row.
query_row, body_rows = vectors[-1], vectors[:-1]
cosine_similarities = cosine_similarity(query_row, body_rows).flatten()

# Store the score next to each question (adds a 'Similarity' column in place).
train_data_cleaned['Similarity'] = cosine_similarities

# Rank the questions from most to least similar and show the top five.
similar_questions = train_data_cleaned.sort_values(by='Similarity', ascending=False)
top_matches = similar_questions[['Title', 'Body', 'Similarity']].head()
print(top_matches)


                                                   Title  \
12416              python linear regression predict date   
34598  error checking target expected dense dimension...   
40103                             use loop make variance   
18158                   r xgboost error building dmatrix   
314    find features names coefficients using scikit ...   

                                                    Body  Similarity  
12416  want predict value date future simple linear r...    0.267934  
34598  training model predict stock price input data ...    0.259586  
40103  change code train append train append train ap...    0.258844  
18158  trouble using xgboost r reading csv file data ...    0.255735  
314    training model model features sqft living bath...    0.255307  


Visualization of Document Vectors

In [104]:
import numpy as np

def compute_mean_vector(tokens, model):
    """Average the embeddings of the tokens that ``model.wv`` contains.

    Tokens missing from ``model.wv`` are skipped. When none of the tokens is
    present (or ``tokens`` is empty) a zero vector of length
    ``model.vector_size`` is returned.
    """
    embeddings = model.wv
    known_vectors = [embeddings[token] for token in tokens if token in embeddings]

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)


def add_combined_document_vectors(data, model):
    """Add per-row mean embedding columns to ``data`` in place and return it.

    'Combined_Vector' averages the tokens of 'Body_Tokens' followed by
    'Title_Tokens'; 'Tags_Vector' averages the tokens of 'Tags_Tokens'.
    Both use ``compute_mean_vector`` with the given ``model``.
    """
    def _text_vector(row):
        return compute_mean_vector(row['Body_Tokens'] + row['Title_Tokens'], model)

    def _tags_vector(row):
        return compute_mean_vector(row['Tags_Tokens'], model)

    for column, row_builder in (('Combined_Vector', _text_vector),
                                ('Tags_Vector', _tags_vector)):
        data[column] = data.apply(row_builder, axis=1)

    return data


# Attach the 'Combined_Vector' and 'Tags_Vector' columns to the training frame.
train_data_cleaned = add_combined_document_vectors(train_data_cleaned, word2vec_model)

In [106]:
# List the columns now present on the training frame.
print(train_data_cleaned.columns)

Index(['Title', 'Body', 'Tags', 'Body_Tokens', 'Title_Tokens', 'Tags_Tokens',
       'Combined_Vector', 'Tags_Vector'],
      dtype='object')


In [24]:
from sklearn.decomposition import PCA
import plotly.graph_objects as go
import numpy as np

# One row per document: turn the stored mean embeddings into a 2-D array.
document_vectors = np.array(train_data_cleaned['Combined_Vector'].tolist())

# Project the embeddings onto three principal components.
pca_model = PCA(n_components=3)
reduced_vectors = pca_model.fit_transform(document_vectors)

# Each point is labelled with its DataFrame index value.
labels = train_data_cleaned.index.tolist()

# Build the 3D scatter trace first, then wrap it in a figure.
document_trace = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers+text',
    marker=dict(size=5),
    text=labels,
    textposition='top center',
    textfont=dict(size=8),
)
fig = go.Figure(data=[document_trace])

# Titles, cubic aspect ratio, tight margins and closest-point hover.
scene_settings = dict(
    xaxis_title='X',
    yaxis_title='Y',
    zaxis_title='Z',
    aspectmode='cube',
)
fig.update_layout(
    title='Document Embeddings in 3D (PCA)',
    scene=scene_settings,
    margin=dict(l=0, r=0, b=0, t=40),
    hovermode='closest',
)

fig.show()


# Phase 3: Tagging

In [88]:
# Reload both preprocessed splits from disk, discarding any columns added in Phase 2.
val_data_cleaned, train_data_cleaned = (
    pd.read_csv(csv_name) for csv_name in ('validation_cleaned.csv', 'train_cleaned.csv')
)

In [109]:
# Draw a reproducible (random_state=42) 10% sample of the validation rows.
# NOTE(review): the sample only carries the columns val_data_cleaned has at this
# point; columns added to val_data_cleaned afterwards are not propagated to it.
val_data_10_percent = val_data_cleaned.sample(frac=0.1, random_state=42)

In [57]:
# Tokenize the training frame, then the validation frame (each gains its
# *_Tokens columns in place), keeping the returned token lists.
sentences, sentences_val = (
    Tokenize(frame) for frame in (train_data_cleaned, val_data_cleaned)
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [70]:
# Recompute the document and tag vectors for the freshly reloaded training frame.
train_data_cleaned = add_combined_document_vectors(train_data_cleaned, word2vec_model)

In [92]:
# add_combined_document_vectors reads the *_Tokens columns, which only Tokenize
# creates. When the notebook is run top to bottom the sample is drawn before the
# validation frame is tokenized, so tokenize the sample itself if any of those
# columns is missing instead of failing with a KeyError.
required_token_columns = {'Body_Tokens', 'Title_Tokens', 'Tags_Tokens'}
if not required_token_columns.issubset(val_data_10_percent.columns):
    Tokenize(val_data_10_percent)

val_data_10_percent = add_combined_document_vectors(val_data_10_percent, word2vec_model)

In [131]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MultiLabelBinarizer


# Stack the per-document embedding vectors into one 2-D feature matrix.
X = np.vstack(train_data_cleaned['Combined_Vector'].values) 

# Each row's label is its whole tag string from the 'Tags' column.
y  = train_data_cleaned['Tags']

# Check the dimensions of the data and the labels.
print("Shape of X:", X.shape)  
print("Shape of y:", y.shape)  

# 5-nearest-neighbour model using cosine distance between document vectors.
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn.fit(X, y)


Shape of X: (45000, 200)
Shape of y: (45000,)


In [145]:
from sklearn.neighbors import KNeighborsClassifier

# Build the validation feature matrix from the sampled validation rows. The loop
# below reads val_data_10_percent by position, so the matrix rows must come from
# that same frame in the same order. (X_val was previously never defined in the
# notebook and only existed as leftover kernel state.)
X_val = np.vstack(val_data_10_percent['Combined_Vector'].values)

# Distances to, and positional indices of, the nearest training documents.
distances, indices = knn.kneighbors(X_val)

# Counters and example collectors filled by the evaluation loop.
successful_matches_count = 0
unsuccessful_matches_count = 0
total_matches_count = len(X_val)
successful_matches = []
unsuccessful_matches = []


def split_tags(tags):
    """Return the distinct whitespace-separated words of ``tags`` as a set."""
    return {*tags.split()}


# A validation sample counts as a success when at least one of its tag words
# appears among the tag words of its nearest training neighbours.
for sample_pos, (dist, idx) in enumerate(zip(distances, indices)):

    actual_tags = split_tags(val_data_10_percent['Tags'].iloc[sample_pos])

    # Union of the tag words of all neighbours of this sample.
    neighbor_tags = set().union(
        *(split_tags(tag_list) for tag_list in train_data_cleaned['Tags'].iloc[idx])
    )

    match_record = {
        "Sample Index": sample_pos + 1,
        "Actual Tags": actual_tags,
        "Nearest Neighbor Tags": neighbor_tags,
        "Distances": dist
    }

    if actual_tags & neighbor_tags:
        successful_matches_count += 1
        successful_matches.append(match_record)
    else:
        unsuccessful_matches_count += 1
        unsuccessful_matches.append(match_record)

# Share of validation samples with at least one overlapping tag word, in percent.
accuracy = (successful_matches_count / total_matches_count) * 100

print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 85.67%


In [146]:
# Show up to five examples from each outcome group, successes first.
example_groups = (
    ("\nExample Successful Matches:", successful_matches),
    ("\nExample Unsuccessful Matches:", unsuccessful_matches),
)

for heading, group in example_groups:
    print(heading)
    for match in group[:5]:
        print(f"Sample {match['Sample Index']} - Actual Tags: {match['Actual Tags']}")
        print(f"Sample {match['Sample Index']} - Nearest Neighbor Tags: {match['Nearest Neighbor Tags']}")
        print()


Example Successful Matches:
Sample 1 - Actual Tags: {'javascript'}
Sample 1 - Nearest Neighbor Tags: {'arrays', 'ecmascript', 'multidimensional', 'jquery', 'array', 'javascript', 'sorting', 'php'}

Sample 2 - Actual Tags: {'sql'}
Sample 2 - Nearest Neighbor Tags: {'mysql', 'server', 'vb', 'net', 'g', 'treeview', 'sql', 'select', 'oracle'}

Sample 3 - Actual Tags: {'c'}
Sample 3 - Nearest Neighbor Tags: {'eclipse', 'function', 'static', 'methods', 'boolean', 'c', 'java', 'string', 'split'}

Sample 4 - Actual Tags: {'jquery', 'html', 'twitter', 'ajax', 'bootstrap', 'javascript'}
Sample 4 - Nearest Neighbor Tags: {'css', 'jquery', 'html', 'twitter', 'bootstrap', 'javascript'}

Sample 5 - Actual Tags: {'console', 'search', 'youtube', 'data', 'google', 'structured', 'schema'}
Sample 5 - Nearest Neighbor Tags: {'tags', 'ranking', 'api', 'google', 'hide', 'seo', 'android', 'meta', 'analytics', 'joomla', 'javascript', 'key', 'maps'}


Example Unsuccessful Matches:
Sample 9 - Actual Tags: {'ep