# NLP Assignment

Project Name: Sentiment Analysis 

Link to the Dataset:  https://www.kaggle.com/arbazkhan971/analyticvidhyadatasetsentiment?select=train_F3WbcTw.csv

In [None]:
#Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
import wordcloud
import nltk
from nltk import word_tokenize
import string

# Download the file and set it as a Dataframe.

In [None]:
#Reading both Train and Test dataset
df_train = pd.read_csv('train_F3WbcTw.csv')
df_test = pd.read_csv('test_tOlRoBf.csv')

### Checking the data inside both Train and Test dataset

In [None]:
df_train.head()

In [None]:
df_train.shape

In [None]:
# df_test.head()

In [None]:
df_test.shape

# Convert the text to lower case.

In [None]:
def textLower(text):
    """Return ``text`` converted to lower case."""
    return text.lower()

In [None]:
df_train['text'] = df_train['text'].apply(textLower)

In [None]:
df_test['text'] = df_test['text'].apply(textLower)

In [None]:
df_train['text'].iloc[0] #checking lowercase text

In [None]:
df_test['text'].iloc[0] #checking lowercase text

### Tokenise Train and Test dataset

In [None]:
# Tokenise the train data
# nltk.download("punkt")
df_train['tokenized_text'] = df_train['text'].apply(word_tokenize)
print ("Tokenized Text for Train dataset: \n")
df_train['tokenized_text'].head()

In [None]:
# Tokenise the test data
df_test['tokenized_text'] = df_test['text'].apply(word_tokenize)
print ("Tokenized Text for Test dataset: \n")
df_test['tokenized_text'].head()

# Remove punctuations, special characters and stopwords from the text column.

### Remove Stopwords

In [None]:
# Build the stopword list from NLTK's English stopwords plus an extra list taken from Kaggle
from nltk.corpus import stopwords
list1_stopWords = stopwords.words('english')
# Read the extra stopwords (one per line) straight into their own name.
# The earlier version rebound `stopwords` to this list, which shadowed the
# NLTK corpus reader imported above for any code that runs afterwards.
with open('stopwords.txt','r') as file:
    list2_stopWords = file.read().splitlines()
total_stopWords = list1_stopWords + list2_stopWords
# Drop words that appear in both sources
stop_words = list(set(total_stopWords))


In [None]:
#Remove Stopwords from both Train and Test dataset
def remove_SW(x):
    """Return ``x`` with stopwords removed, keeping word order and repeats.

    The text is split with ``word_tokenize``; every token that appears in the
    module-level ``stop_words`` list is dropped and the remaining tokens are
    joined back together with single spaces.
    """
    # Filter token by token instead of taking a set difference: a set
    # difference loses the original word order (and the resulting order can
    # differ between runs) and collapses repeated words, which distorts the
    # term counts and the context windows built from this text later on.
    blocked = set(stop_words)
    return ' '.join(token for token in word_tokenize(x) if token not in blocked)

In [None]:
df_train['text'] = df_train['text'].apply(lambda x: remove_SW(x))

In [None]:
df_test['text'] = df_test['text'].apply(lambda x: remove_SW(x))

In [None]:
df_train['text'].iloc[0] #checking if the stopwords was removed from Train set

In [None]:
df_test['text'].iloc[0] #checking if the stopwords was removed from Test set

### Remove Punctuations and Special Characters

In [None]:
# Function to remove punctuations from text
def remove_PuncChars(x):
    """Return ``str(x)`` with punctuation and special characters stripped.

    Slash, hyphen and apostrophe become a space, an ampersand is padded with a
    space on each side, and every other listed punctuation mark (including the
    curly quotes) is deleted.
    """
    # Start with "delete" for every listed mark, then override the few
    # characters that are replaced rather than removed.
    char_map = dict.fromkeys('?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’', '')
    char_map.update(dict.fromkeys("/-'", ' '))
    char_map['&'] = ' & '
    return str(x).translate(str.maketrans(char_map))

In [None]:
df_train['text'] = df_train['text'].apply(remove_PuncChars)

In [None]:
df_test['text'] = df_test['text'].apply(remove_PuncChars)

### Cleaning more data inside datasets

In [None]:
# #Removing Emails
def removEmails(x):
    """Return ``x`` with every whitespace-delimited chunk containing '@' removed."""
    return re.sub(r'\S+@\S+', '', x)

# #Removing URLs
def removeURL(x):
    """Return ``x`` with URLs removed.

    A URL is anything that starts with ``http://`` or ``https://``, or a bare
    address starting with ``www.``, up to the next whitespace character.
    """
    # The previous pattern, r'http\S+\Swww\S+org\S', only matched text that
    # contained "http", "www" and "org" in that order, and it stopped one
    # character after "org", so most URLs were untouched or left a tail.
    pattern = r'https?://\S+|www\.\S+'
    return re.sub(pattern, '', x)

# #Removing html strips
def stripHTML(x):
    """Parse ``x`` with BeautifulSoup's ``html.parser`` and return its text content."""
    return BeautifulSoup(x, "html.parser").get_text()

# #Removing the square brackets
def removeSqBrackets(x):
    """Return ``x`` with every ``[...]`` span (brackets included) removed.

    An opening bracket with no closing bracket is left untouched.
    """
    # Raw string: '\[' and '\]' are not valid escapes in a normal string
    # literal and trigger an invalid-escape warning on recent Python versions.
    return re.sub(r'\[[^]]*\]', '', x)

# #Removing the noisy text
def noisyText(x):
    """Strip HTML markup from ``x`` and then remove square-bracketed spans."""
    return removeSqBrackets(stripHTML(x))

# #Removing numbers
def removeNum(x):
    """Return ``x`` with every run of digits removed."""
    return re.sub(r'\d+', '', x)

# #Remove emojis
def removEmojis(x):
    """Return ``x`` with emoji and many other symbol characters removed.

    NOTE: the character class is much broader than emoji. The span from
    U+24C2 to U+1F251 alone also covers CJK ideographs, kana and Hangul, and
    the span from U+10000 to U+10FFFF covers every supplementary-plane
    character, so all of those are stripped as well.
    """
    symbol_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",  # box drawing, shapes, misc symbols, dingbats, arrows
        "\U00002702-\U000027B0",
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    matcher = re.compile("[" + "".join(symbol_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', x)

In [None]:
def cleanText(x):
    """Run the text-cleaning helpers over ``x`` in sequence and return the result.

    Order: e-mail addresses, URLs, HTML markup and square-bracketed spans
    (both handled by ``noisyText``), digits, then emoji/symbol characters.
    """
    # noisyText already calls stripHTML followed by removeSqBrackets, so the
    # separate direct calls that used to precede it only repeated that work
    # (and parsed the text as HTML a second time).
    # NOTE(review): the notebook applies remove_PuncChars to the column before
    # cleanText, so characters such as '@', ':', '/', '<' and '[' are already
    # gone when these patterns run; consider calling cleanText first.
    x = removEmails(x)
    x = removeURL(x)
    x = noisyText(x)
    x = removeNum(x)
    x = removEmojis(x)
    return x

In [None]:
df_train['text'] = df_train['text'].apply(cleanText)

In [None]:
df_test['text'] = df_test['text'].apply(cleanText)

# Create two objects X and y. X will be the ' Text ' column dataframe and y will be the “Sentiment” column.

In [None]:
#rename text column
df_train.rename(columns={'text':'Text'},inplace = True)
df_test.rename(columns={'text':'Text'},inplace = True)

In [None]:
#rename sentiment column
df_train.rename(columns={'sentiment':'Sentiment'},inplace = True)

# Use TF-IDF and  CountVectorizer for word embeddings on the ‘Text’ column, display the embeddings.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


def findWordEmbeddings(x):
    """Fit TF-IDF and count vectorizers on ``x`` and print the resulting matrices.

    Args:
        x: iterable of text documents passed to both vectorizers.

    Returns:
        A ``(tfidf_array, count_array)`` tuple holding the dense form of the
        two document-term matrices.
    """
    tfidf_vectorizer = TfidfVectorizer()
    count_vectorizer = CountVectorizer()

    # Convert each sparse result to a dense array once and reuse it below;
    # calling toarray() again for the unique-value printout would build a
    # second full dense copy of each matrix.
    tfidf_embeddings_array = tfidf_vectorizer.fit_transform(x).toarray()
    count_embeddings_array = count_vectorizer.fit_transform(x).toarray()

    print('TF-IDF Embedding:\n')
    print(tfidf_embeddings_array)
    print('\nCount Embedding:\n')
    print(count_embeddings_array)
    print('\nUnique TF-IDF Embeddings:\n')
    print(np.unique(tfidf_embeddings_array))
    print('\nUnique Count Embeddings:\n')
    print(np.unique(count_embeddings_array))
    return tfidf_embeddings_array, count_embeddings_array

#Performing TF-IDF and Count on the 'Text' column of the Train dataset
# NOTE(review): this comment used to say "10 samples", but the whole column is
# passed below; use df_train['Text'].head(10) if a sample was intended.
x_train = df_train['Text']
tfidf_embeddings, count_embeddings = findWordEmbeddings(x_train)

In [None]:
#Performing TF-IDF and Count on the 'Text' column of the Test dataset
# NOTE(review): this comment used to say "10 samples", but the whole column is
# passed below; use df_test['Text'].head(10) if a sample was intended.
# The names tfidf_embeddings / count_embeddings are reused here, so any values
# previously stored under them are overwritten.
x_test = df_test['Text']
tfidf_embeddings, count_embeddings = findWordEmbeddings(x_test)

- When we have a very large dataset, particularly when using TF-IDF vectorization, the resulting matrix becomes sparse, meaning that most of the values are zeros. 
- When visualizing this matrix using PCA (Principal Component Analysis) with two components, the points representing the words are scattered all over the 2D plane of the graph. This makes it difficult to understand the relationships between the words.

- To address this issue and improve the visualization, t-SNE (t-Distributed Stochastic Neighbor Embedding) is often used. t-SNE helps arrange the points in a way that groups similar words together on the graph. 
- In other words, words that are clustered together on the graph are more likely to have similarities based on the selected features. 
- These similarities can be interpreted as semantic similarities, indicating that the words are related in meaning or context. 
- By using t-SNE, the visualization becomes more informative and facilitates the identification of word groups and patterns.

# Generate embeddings using CBOW and Skip-gram on the text using three different window sizes, display the embeddings using a visualization method.

In [None]:
# !pip install gensim
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
# Tokenise the clean text column in Train set
df_train['tokenized_text_clean'] = df_train['Text'].apply(word_tokenize)
print ("Tokenized Clean Text for Train dataset: \n")
df_train['tokenized_text_clean'].head()

In [None]:
df_train.head()

In [None]:
# Tokenise the clean text column in Test set
df_test['tokenized_text_clean'] = df_test['Text'].apply(word_tokenize)
print ("Tokenized Clean Text for Test dataset: \n")
df_test['tokenized_text_clean'].head()

In [None]:
from sklearn.manifold import TSNE

# Train a CBOW/Skip-gram Word2Vec model and plot its vocabulary using PCA followed by t-SNE
def getEmbeddings(corpus:pd.Series, window, title, skipgram:bool):
    """Train a Word2Vec model on ``corpus`` and scatter-plot its word vectors.

    The word vectors are first projected to two dimensions with PCA and that
    2-D projection is then passed through t-SNE before plotting.

    Args:
        corpus: Series whose values are token lists (one per document).
        window: context window size given to Word2Vec.
        title: title shown on the plot.
        skipgram: truthy for Skip-gram (sg=1), otherwise CBOW (sg=0).

    Returns:
        The trained Word2Vec model.
    """
    print("Creating SkipGram model" if skipgram else "Creating CBOW model")
    sg = 1 if skipgram else 0

    sentences = corpus.tolist()
    embedding_model = Word2Vec(sentences=sentences, window=window, sg=sg, min_count=2)
    vocab = list(embedding_model.wv.key_to_index.keys())
    vectors = [embedding_model.wv.get_vector(token) for token in vocab]

    # PCA runs first; t-SNE is then fitted on the 2-D PCA output.
    reduced = PCA(n_components=2).fit_transform(vectors)
    coords = TSNE(n_components=2, perplexity=10, random_state=42).fit_transform(reduced)

    plt.figure(figsize=(16, 9))
    xs, ys = coords[:, 0], coords[:, 1]
    plt.scatter(xs, ys, c='b', alpha=0.6)
    # Label every point with the word it represents
    for token, px, py in zip(vocab, xs, ys):
        plt.annotate(token, alpha=0.5, xy=(px, py), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom', size=4)
    plt.title(title)
    plt.show()
    return embedding_model

In [None]:
# Train a CBOW/Skip-gram Word2Vec model and plot its vocabulary using PCA only
def get_pca_Embeddings(corpus:pd.Series, window, title, skipgram:bool):
    """Train a Word2Vec model on ``corpus`` and plot a 2-D PCA of its word vectors.

    Args:
        corpus: Series whose values are token lists (one per document).
        window: context window size given to Word2Vec.
        title: title shown on the plot.
        skipgram: truthy for Skip-gram (sg=1), otherwise CBOW (sg=0).

    Returns:
        The trained Word2Vec model.
    """
    print("Creating SkipGram model" if skipgram else "Creating CBOW model")
    sg = 1 if skipgram else 0

    sentences = corpus.tolist()
    embedding_model = Word2Vec(sentences=sentences, window=window, sg=sg, min_count=2)
    vocab = list(embedding_model.wv.key_to_index.keys())
    vectors = [embedding_model.wv.get_vector(token) for token in vocab]

    coords = PCA(n_components=2).fit_transform(vectors)

    plt.figure(figsize=(16, 9))
    xs, ys = coords[:, 0], coords[:, 1]
    plt.scatter(xs, ys, c='b', alpha=0.6)
    # Label every point with the word it represents
    for token, px, py in zip(vocab, xs, ys):
        plt.annotate(token, alpha=0.5, xy=(px, py), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom', size=4)
    plt.title(title)
    plt.show()
    return embedding_model

In [None]:
# Generating SkipGram models for window sizes 5, 7 and 9; getEmbeddings plots each one using PCA followed by t-SNE
# This cell must run: later cells read `sg_models`, and with the code
# commented out they fail with a NameError.
corpus = df_train['tokenized_text_clean']
sg_models = []
for i in range(5,10,2):
    model = getEmbeddings(corpus, window=i, title=f"SkipGram - windowSize {i}", skipgram=True)
    sg_models.append(model)

In [None]:
# Generating CBOW models for window sizes 5, 7 and 9; getEmbeddings plots each one using PCA followed by t-SNE
corpus = df_train['tokenized_text_clean']
cbow_models = []
# range(5,10,2) yields the window sizes 5, 7 and 9
for i in range(5,10,2):
    model = getEmbeddings(corpus, window=i, title=f"CBOW - windowSize {i}", skipgram=False)
    cbow_models.append(model)

In [None]:
# Generating CBOW models for window sizes 5, 7 and 9; get_pca_Embeddings plots each one using PCA only
corpus = df_train['tokenized_text_clean']
cbow_models_pca = []
# range(5,10,2) yields the window sizes 5, 7 and 9
for i in range(5,10,2):
    model = get_pca_Embeddings(corpus, window=i, title=f"CBOW - windowSize {i}", skipgram=False)
    cbow_models_pca.append(model)

In [None]:
# Generating SkipGram models for window sizes 5, 7 and 9; get_pca_Embeddings plots each one using PCA only
# This cell must run: later cells read `sg_models_pca`, and with the code
# commented out they fail with a NameError.
corpus = df_train['tokenized_text_clean']
sg_models_pca = []
for i in range(5,10,2):
    model = get_pca_Embeddings(corpus, window=i, title=f"SkipGram - windowSize {i}", skipgram=True)
    sg_models_pca.append(model)

In [None]:
# Printing the word vector of every vocabulary word for each CBOW model
# Here `i` is the model object itself (not an index), so "Model {i}" prints the model's string form.
for i in cbow_models:
    words = list(i.wv.key_to_index.keys())
    word_vectors = [i.wv.get_vector(word) for word in words]
    print(f"Model {i}",word_vectors)

In [None]:
# Printing the word vector of every vocabulary word for each SkipGram model
# Requires `sg_models` from the SkipGram generation cell.
# Here `i` is the model object itself (not an index), so "Model {i}" prints the model's string form.
for i in sg_models:
    words = list(i.wv.key_to_index.keys())
    word_vectors = [i.wv.get_vector(word) for word in words]
    print(f"Model {i}",word_vectors)

## CBOW (Continuous Bag of Words Model)

In [None]:
# Building the vocabulary and training each CBOW model from `cbow_models`
# (the models whose plots used PCA followed by t-SNE; the projection itself does not change the model).
# NOTE(review): these models were created with `sentences=` in getEmbeddings, so the
# constructor has presumably already built the vocabulary and trained them; confirm
# that rebuilding and retraining here is intended.
for i, model in enumerate(cbow_models):
    model.build_vocab(df_train['tokenized_text_clean'])
    model.train(df_train['tokenized_text_clean'], 
                total_examples=model.corpus_count, 
                epochs=5)
    print(f"Vocabulary for CBOW model - {i}",model.wv.index_to_key)


In [None]:
# Building the vocabulary and training each CBOW model from `cbow_models_pca`
# (the models whose plots used PCA only; the projection itself does not change the model).
# NOTE(review): these models were created with `sentences=` in get_pca_Embeddings, so the
# constructor has presumably already built the vocabulary and trained them; confirm
# that rebuilding and retraining here is intended.
for i, model in enumerate(cbow_models_pca):
    model.build_vocab(df_train['tokenized_text_clean'])
    model.train(df_train['tokenized_text_clean'], 
                total_examples=model.corpus_count, 
                epochs=5)
    print(f"Vocabulary for CBOW model - {i}",model.wv.index_to_key)

In [None]:
# Building the vocabulary and training each SkipGram model from `sg_models`
# (the models whose plots used PCA followed by t-SNE; the projection itself does not change the model).
# Requires `sg_models` from the SkipGram generation cell.
# NOTE(review): models created with `sentences=` have presumably already been trained
# by the constructor; confirm that rebuilding and retraining here is intended.
for i, model in enumerate(sg_models):
    model.build_vocab(df_train['tokenized_text_clean'])
    model.train(df_train['tokenized_text_clean'], 
                total_examples=model.corpus_count, 
                epochs=5)
    print(f"Vocabulary for SkipGram model - {i}",model.wv.index_to_key)

In [None]:
# Building the vocabulary and training each SkipGram model from `sg_models_pca`
# (the models whose plots used PCA only; the projection itself does not change the model).
# Requires `sg_models_pca` from the SkipGram (PCA) generation cell.
# NOTE(review): models created with `sentences=` have presumably already been trained
# by the constructor; confirm that rebuilding and retraining here is intended.
for i, model in enumerate(sg_models_pca):
    model.build_vocab(df_train['tokenized_text_clean'])
    model.train(df_train['tokenized_text_clean'], 
                total_examples=model.corpus_count, 
                epochs=5)
    print(f"Vocabulary for SkipGram model - {i}",model.wv.index_to_key)

# Displaying the embeddings and interpreting the results for predicting similar words to the most frequent word, using the embeddings from the different window sizes used in the above task.

In [None]:
# Finding the words most similar to a reference word in each CBOW model from `cbow_models`
# (PCA/t-SNE was only used for plotting these models; similarity is computed on the model's own vectors).
# Reference word: the first vocabulary entry of the first model
# (presumably the most frequent word; confirm gensim's vocabulary ordering).
most_frequent_word = cbow_models[0].wv.index_to_key[0]
similar_words = []
# Must stay in step with the window sizes used when the models were built
window_sizes = [5,7,9]
for model in cbow_models:
    similar_words.append(model.wv.most_similar(most_frequent_word))

# similar_words[i] holds the (word, similarity) pairs for the i-th model
for i, window_size in enumerate(window_sizes):
    print(f"Embeddings with Window Size {window_size}:")
    for word, similarity in similar_words[i]:
        print(f"- {word}: {similarity}")
    print('\n')

- The output of CBOW with TSNE and PCA shows that the words with the highest similarities are clustered together, regardless of the window size. For example, in the window size 5 output, the words "wwwniceorguk", "mid", and "blog" are all clustered together, as are the words "del", "carcinomatosis", and "kisqali". This suggests that these words are semantically related, even though they may not be adjacent to each other in the text.

- The window size does have some impact on the clustering, however. With a larger window size, more words are included in the context window, which can lead to more subtle relationships being captured. For example, in the window size 9 output, the words "genovese", "wwwniceorguk", and "display" are clustered together, suggesting that they are all related to clinical trials.

- Overall, the output of CBOW with TSNE and PCA shows that the model is able to learn meaningful relationships between words, even when those words are not adjacent to each other in the text. This can be useful for a variety of tasks, such as natural language understanding and machine translation.

- Here are some additional inferences that can be made from the output:

- The words "wwwniceorguk" and "mid" are often used in the context of healthcare, as they are both related to the National Institute for Health and Care Excellence (NICE).
- The words "blog", "strength", and "postpartum" are often used in the context of women's health.
- The words "del", "carcinomatosis", and "kisqali" are often used in the context of cancer treatment.
- The words "genovese", "wwwniceorguk", and "display" are often used in the context of clinical trials.

- These inferences can be used to improve the performance of natural language processing tasks, such as text classification and information retrieval. For example, if a text is about healthcare, the model can be more likely to classify it as such if it contains the words "wwwniceorguk" or "mid".

In [None]:
# Finding the words most similar to a reference word in each CBOW model from `cbow_models_pca`
# (PCA was only used for plotting these models; similarity is computed on the model's own vectors).
# Reference word: the first vocabulary entry of the first model
# (presumably the most frequent word; confirm gensim's vocabulary ordering).
most_frequent_word = cbow_models_pca[0].wv.index_to_key[0]
similar_words = []
# Must stay in step with the window sizes used when the models were built
window_sizes = [5,7,9]
for model in cbow_models_pca:
    similar_words.append(model.wv.most_similar(most_frequent_word))

# similar_words[i] holds the (word, similarity) pairs for the i-th model
for i, window_size in enumerate(window_sizes):
    print(f"Embeddings with Window Size {window_size}:")
    for word, similarity in similar_words[i]:
        print(f"- {word}: {similarity}")
    print('\n')

- The output of the most similar words using CBOW with PCA shows that the words with the highest similarities are clustered together, regardless of the window size. For example, in the window size 5 output, the words "count", "strength", and "result" are all clustered together, as are the words "healthcare", "nivolumab", and "atazanavir". This suggests that these words are semantically related, even though they may not be adjacent to each other in the text.

- The window size does have some impact on the clustering, however. With a larger window size, more words are included in the context window, which can lead to more subtle relationships being captured. For example, in the window size 9 output, the words "intravitreous", "result", and "compulsory" are clustered together, suggesting that they are all related to the medical field.

- Overall, the output of the most similar words using CBOW with PCA shows that the model is able to learn meaningful relationships between words, even when those words are not adjacent to each other in the text. This can be useful for a variety of tasks, such as natural language understanding and machine translation.

- Here are some additional inferences that can be made from the output:

- The words "count", "strength", and "result" are often used in the context of clinical trials, as they are all related to the process of measuring the effectiveness of a treatment.
- The words "healthcare", "nivolumab", and "atazanavir" are often used in the context of cancer treatment, as they are all related to the treatment of cancer.
- The words "intravitreous", "result", and "compulsory" are often used in the context of eye care, as they are all related to the treatment of eye diseases.

- These inferences can be used to improve the performance of natural language processing tasks, such as text classification and information retrieval. For example, if a text is about clinical trials, the model can be more likely to classify it as such if it contains the words "count", "strength", or "result".

In [None]:
# Finding the words most similar to a reference word in each SkipGram model from `sg_models`
# (PCA/t-SNE was only used for plotting these models; similarity is computed on the model's own vectors).
# Requires `sg_models` from the SkipGram generation cell.
# Reference word: the first vocabulary entry of the first model
# (presumably the most frequent word; confirm gensim's vocabulary ordering).
most_frequent_word = sg_models[0].wv.index_to_key[0]
similar_words = []
# Must stay in step with the window sizes used when the models were built
window_sizes = [5,7,9]
for model in sg_models:
    similar_words.append(model.wv.most_similar(most_frequent_word))

# similar_words[i] holds the (word, similarity) pairs for the i-th model
for i, window_size in enumerate(window_sizes):
    print(f"Embeddings with Window Size {window_size}:")
    for word, similarity in similar_words[i]:
        print(f"- {word}: {similarity}")
    print()

- The output of the Skipgram with TSNE and PCA shows that the words with the highest similarities are clustered together, regardless of the window size. For example, in the window size 5 output, the words "meeting", "blocks", and "healthcare" are all clustered together, as are the words "care", "past", and "count". This suggests that these words are semantically related, even though they may not be adjacent to each other in the text.

- The window size does have some impact on the clustering, however. With a larger window size, more words are included in the context window, which can lead to more subtle relationships being captured. For example, in the window size 9 output, the words "actually", "past", and "slowing" are clustered together, suggesting that they are all related to the concept of time.

- Overall, the output of the Skipgram with TSNE and PCA shows that the model is able to learn meaningful relationships between words, even when those words are not adjacent to each other in the text. This can be useful for a variety of tasks, such as natural language understanding and machine translation.

- Here are some additional inferences that can be made from the output:

- The words "meeting", "blocks", and "healthcare" are often used in the context of medical research or clinical trials, as they are all related to the process of conducting research.
- The words "care", "past", and "count" are often used in the context of patient care, as they are all related to the process of providing medical services.
- The words "actually", "past", and "slowing" are often used in the context of time, as they are all related to the concept of time.

- These inferences can be used to improve the performance of natural language processing tasks, such as text classification and information retrieval. For example, if a text is about medical research, the model can be more likely to classify it as such if it contains the words "meeting", "blocks", or "healthcare".

In [None]:
# Finding the words most similar to a reference word in each SkipGram model from `sg_models_pca`
# (PCA was only used for plotting these models; similarity is computed on the model's own vectors).
# Requires `sg_models_pca` from the SkipGram (PCA) generation cell.
# Reference word: the first vocabulary entry of the first model
# (presumably the most frequent word; confirm gensim's vocabulary ordering).
most_frequent_word = sg_models_pca[0].wv.index_to_key[0]
similar_words = []
# Must stay in step with the window sizes used when the models were built
window_sizes = [5,7,9]
for model in sg_models_pca:
    similar_words.append(model.wv.most_similar(most_frequent_word))

# similar_words[i] holds the (word, similarity) pairs for the i-th model
for i, window_size in enumerate(window_sizes):
    print(f"Embeddings with Window Size {window_size}:")
    for word, similarity in similar_words[i]:
        print(f"- {word}: {similarity}")
    print()

- The output of the most similar words using Skipgram with PCA shows that the words with the highest similarities are clustered together, regardless of the window size. For example, in the window size 5 output, the words "result", "count", and "meeting" are all clustered together, as are the words "greater", "means", and "care". This suggests that these words are semantically related, even though they may not be adjacent to each other in the text.

- The window size does have some impact on the clustering, however. With a larger window size, more words are included in the context window, which can lead to more subtle relationships being captured. For example, in the window size 9 output, the words "individual", "past", and "diagnosis" are clustered together, suggesting that they are all related to the healthcare domain.

- Overall, the output of the most similar words using Skipgram with PCA shows that the model is able to learn meaningful relationships between words, even when those words are not adjacent to each other in the text. This can be useful for a variety of tasks, such as natural language understanding and machine translation.

- Here are some additional inferences that can be made from the output:

- The words "result", "count", and "meeting" are often used in the context of medical research, as they are all related to the process of collecting and analyzing data.
- The words "greater", "means", and "care" are often used in the context of healthcare, as they are all related to the provision of medical services.
- The words "individual", "past", and "diagnosis" are often used in the context of patient care, as they are all related to the process of understanding and treating a patient's condition.

- These inferences can be used to improve the performance of natural language processing tasks, such as text classification and information retrieval. For example, if a text is about medical research, the model can be more likely to classify it as such if it contains the words "result", "count", or "meeting".