<a href="https://colab.research.google.com/github/shriya-tiwari/NLP-B_Tech_Project/blob/main/Embeddings_%26_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Continuous Bag of Words (CBOW)
CBOW predicts the current (target) word from the context words that surround it within a fixed-size window.

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Read the raw corpus. The context manager closes the file handle as soon as
# the text is in memory (a bare open() would leave it open for the session).
with open("/content/sample_data/sample_text.txt") as sample:
    s = sample.read().replace("\n", " ")

# Drop every character that is neither a word character nor whitespace.
# NOTE(review): this also removes sentence-ending punctuation, so a later
# sent_tokenize(s) call probably sees the whole text as a single sentence --
# confirm whether per-sentence training data was intended.
s = re.sub(r'[^\w\s]', '', s)

sw = set(stopwords.words("english"))  # set for O(1) stop-word membership tests
lemmatizer = WordNetLemmatizer()

In [None]:
# Build the Word2Vec training corpus: one list of normalised tokens per
# sentence of `s`.
data = []
for sentence in sent_tokenize(s):
    tokens = []
    for word in word_tokenize(sentence):
        word = word.lower()
        # Filter on the surface form first: lemmatizing before the stop-word
        # check lets stop words escape the filter, because the default
        # (noun) lemmatizer rewrites e.g. 'was' -> 'wa' and 'has' -> 'ha',
        # which are no longer in the stop-word list.
        if word in sw:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Keep the original post-lemmatization check too, so anything that
        # only becomes a stop word after lemmatizing is still dropped.
        if lemma not in sw:
            tokens.append(lemma)

    data.append(tokens)

In [None]:
# Show a bounded preview instead of dumping the entire corpus into the
# notebook output.
print(f"{len(data)} sentence(s) in corpus; first 25 tokens of up to 3 sentences:")
for sentence_tokens in data[:3]:
    print(sentence_tokens[:25])

[['dynamic', 'realm', 'technological', 'advancement', 'innovation', 'stand', 'driving', 'force', 'intertwining', 'technology', 'shape', 'landscape', 'progress', 'symbiotic', 'relationship', 'technology', 'innovation', 'evident', 'every', 'facet', 'modern', 'existence', 'witness', 'evolution', 'smartphones', 'artificial', 'intelligence', 'internet', 'thing', 'becomes', 'clear', 'technology', 'serf', 'canvas', 'upon', 'innovation', 'paint', 'profound', 'stroke', 'breakthrough', 'technology', 'often', 'herald', 'leap', 'innovation', 'vice', 'versa', 'creating', 'continuous', 'loop', 'advancement', 'consider', 'transformative', 'impact', 'technology', 'healthcare', 'innovation', 'medical', 'device', 'telemedicine', 'data', 'analytics', 'revolutionizing', 'patient', 'care', 'technology', 'becomes', 'enabler', 'innovation', 'fostering', 'novel', 'approach', 'diagnosis', 'treatment', 'healthcare', 'delivery', 'business', 'landscape', 'fusion', 'technology', 'innovation', 'driving', 'factor', 

In [None]:
# CBOW model (sg=0) with vector_size = 150 and window size = 5.
# min_count=1 keeps every token, including words that occur only once.
# seed + workers=1 make the training run repeatable; per the gensim docs,
# full determinism additionally needs a fixed PYTHONHASHSEED.
model_CBOW = gensim.models.Word2Vec(
    data,
    min_count=1,
    vector_size=150,
    window=5,
    sg=0,
    seed=42,
    workers=1,
)

In [None]:
# Ten nearest neighbours of 'education' in the CBOW embedding space
# (topn=10 is the library default, spelled out here for the reader).
model_CBOW.wv.most_similar('education', topn=10)

[('knowledge', 0.22922463715076447),
 ('educational', 0.20184476673603058),
 ('yet', 0.1762729287147522),
 ('imparted', 0.1481236219406128),
 ('factor', 0.14173272252082825),
 ('advancement', 0.1357717216014862),
 ('reality', 0.13518062233924866),
 ('way', 0.1329987347126007),
 ('often', 0.1326533406972885),
 ('summary', 0.13243314623832703)]

In [None]:
# Report how close the two words sit in the CBOW embedding space.
cbow_similarity = model_CBOW.wv.similarity('advancement', 'education')
print("Cosine similarity between 'advancement' and 'education' - CBOW : ", cbow_similarity)

Cosine similarity between 'advancement' and 'education' - CBOW :  0.13577172


# Skip Gram Model
Skip-gram predicts the surrounding context words within a fixed-size window, given the current word.

In [None]:
# Skip-gram model (sg=1) with vector_size = 100 and window size = 5.
# seed + workers=1 make the training run repeatable; per the gensim docs,
# full determinism additionally needs a fixed PYTHONHASHSEED.
model_SG = gensim.models.Word2Vec(
    data,
    min_count=1,
    vector_size=100,
    window=5,
    sg=1,
    seed=42,
    workers=1,
)

print("Cosine similarity between 'advancement' " +  "and 'education' - Skip Gram : ", model_SG.wv.similarity('advancement', 'education'))


Cosine similarity between 'advancement' and 'education' - Skip Gram :  0.07274586


In [None]:
# Ten nearest neighbours of 'education' in the skip-gram embedding space
# (topn=10 is the library default, spelled out here for the reader).
model_SG.wv.most_similar('education', topn=10)

[('provides', 0.20022395253181458),
 ('word', 0.1918536126613617),
 ('digital', 0.16173812747001648),
 ('collaboration', 0.1552039533853531),
 ('thing', 0.15253496170043945),
 ('virtual', 0.15200799703598022),
 ('method', 0.14484526216983795),
 ('embrace', 0.13881103694438934),
 ('becomes', 0.1368793249130249),
 ('serf', 0.1344384402036667)]

## Sentiment Analysis using TF-IDF and a Neural Network

In [None]:
import random
import seaborn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Plot styling: seaborn 'whitegrid' style with the 'talk' plotting context.
seaborn.set(style='whitegrid'); seaborn.set_context('talk')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Request the 'retina' figure format from the inline backend.
%config InlineBackend.figure_format = 'retina'

In [None]:
import pandas
from pandas.plotting import scatter_matrix

# Load the labelled sentences; later cells read the 'statement' and
# 'sentiment' columns from this frame.
dataset = pandas.read_csv(filepath_or_buffer='/content/amazon_cells_labelled.csv')

In [None]:
# Model input is the 'statement' column; the target is the 'sentiment' column.
X, y = dataset['statement'], dataset['sentiment']
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Peek at the first five training statements.
print(X_train.head(n=5))

29                                  Doesn't hold charge.
535    All it took was one drop from about 6 inches a...
695           Do NOT buy if you want to use the holster.
557    I have purchased these for both family and fri...
836                        Horrible, horrible protector.
Name: statement, dtype: object


In [None]:
# TF-IDF features, with the vocabulary capped at 3000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
# Fit on the training split only, then reuse the fitted vectorizer for the
# test split so no test-set statistics leak into the features.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Convert the TF-IDF matrices to dense arrays for the NumPy-based MLP.
X_train_final, X_test_final = X_train_tfidf.toarray(), X_test_tfidf.toarray()

In [None]:
# Wrap both dense feature matrices in DataFrames (train first, then test).
X_train_final, X_test_final = map(pd.DataFrame, (X_train_final, X_test_final))

In [None]:
class MLP:
    """Single-hidden-layer perceptron with sigmoid activations.

    Trained with full-batch gradient descent on the squared error between the
    sigmoid outputs and the targets.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    hidden_dim : int
        Number of hidden units.
    output_dim : int
        Number of output units (1 for binary classification).
    learning_rate : float
        Step size applied to the gradients, which are summed (not averaged)
        over all training rows.
    epochs : int
        Number of full passes over the training data.
    seed : int or None, optional
        When given, the initial weights are drawn from a private
        ``np.random.RandomState(seed)`` so construction is repeatable.
        When None (default) the global NumPy RNG is used, as before.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, learning_rate, epochs, seed=None):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.learning_rate = learning_rate
        self.epochs = epochs

        # Both the global `np.random` module and a RandomState expose `randn`,
        # so the same initialisation code serves the seeded and unseeded case.
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Initialize weights and biases.
        # NOTE(review): unit-variance weights combined with gradients summed
        # over the whole batch may saturate the sigmoids for wide layers --
        # consider fan-in scaling / averaged gradients if accuracy is poor.
        self.weights_input_hidden = rng.randn(self.input_dim, self.hidden_dim)
        self.bias_hidden = np.zeros((1, self.hidden_dim))
        self.weights_hidden_output = rng.randn(self.hidden_dim, self.output_dim)
        self.bias_output = np.zeros((1, self.output_dim))

    def sigmoid(self, x):
        """Return the logistic function 1 / (1 + exp(-x)) element-wise.

        Written as exp(-log(1 + exp(-x))) via ``np.logaddexp`` so that large
        negative inputs do not overflow ``np.exp`` and emit RuntimeWarnings.
        """
        return np.exp(-np.logaddexp(0, -x))

    def sigmoid_derivative(self, x):
        """Return the sigmoid derivative given the sigmoid *output* ``x``.

        Note that ``x`` must already be an activation (a value returned by
        ``sigmoid``), not a pre-activation.
        """
        return x * (1 - x)

    def _forward(self, X):
        """Run one forward pass; return (hidden activations, output activations)."""
        hidden_input = np.dot(X, self.weights_input_hidden) + self.bias_hidden
        hidden_output = self.sigmoid(hidden_input)
        output_input = np.dot(hidden_output, self.weights_hidden_output) + self.bias_output
        output = self.sigmoid(output_input)
        return hidden_output, output

    def train(self, X, y):
        """Fit the network to ``X`` / ``y`` for ``self.epochs`` full-batch steps.

        Parameters
        ----------
        X : array-like, shape (n_samples, input_dim)
            Feature matrix (ndarray, DataFrame, nested list, ...).
        y : array-like, shape (n_samples,) or (n_samples, output_dim)
            Targets; reshaped internally to (n_samples, output_dim).

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        # Work on plain float ndarrays so every product and in-place weight
        # update stays an ndarray regardless of the container passed in.
        X = np.asarray(X, dtype=float)
        # A 1-D y would broadcast against the (n, output_dim) output into an
        # (n, n) error matrix; force the column layout instead.
        y = np.asarray(y, dtype=float).reshape(-1, self.output_dim)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.shape[0]}"
            )

        for _ in range(self.epochs):
            # Forward pass
            hidden_output, output = self._forward(X)

            # Backpropagation
            error = y - output
            d_output = error * self.sigmoid_derivative(output)
            error_hidden = d_output.dot(self.weights_hidden_output.T)
            d_hidden = error_hidden * self.sigmoid_derivative(hidden_output)

            # Update weights and biases (`error` is target - output, so the
            # updates are added).
            self.weights_hidden_output += hidden_output.T.dot(d_output) * self.learning_rate
            self.bias_output += np.sum(d_output, axis=0, keepdims=True) * self.learning_rate
            self.weights_input_hidden += X.T.dot(d_hidden) * self.learning_rate
            self.bias_hidden += np.sum(d_hidden, axis=0, keepdims=True) * self.learning_rate

    def predict(self, X):
        """Return the sigmoid output activations for ``X``.

        The result has one row per sample and ``output_dim`` columns, with
        values in [0, 1]; threshold them to obtain class labels.
        """
        X = np.asarray(X, dtype=float)
        _, output = self._forward(X)
        return output

# Initialize and train the MLP
# The MLP draws its initial weights from NumPy's global RNG; seed it so the
# reported accuracy is the same on every Restart & Run All.
np.random.seed(42)

input_dim = X_train_tfidf.shape[1]  # Number of TF-IDF features
hidden_dim = 500
output_dim = 1  # Binary classification
learning_rate = 0.1
epochs = 1000

mlp = MLP(input_dim, hidden_dim, output_dim, learning_rate, epochs)
# Targets are reshaped to a column so they line up with the (n, 1) output.
mlp.train(X_train_final, y_train.values.reshape(-1, 1))

# Predict on the test data
y_pred = mlp.predict(X_test_final)

# Evaluate the model: threshold the sigmoid output at 0.5 and compare it
# element-wise with the true labels.
accuracy = np.mean((y_pred > 0.5) == y_test.values.reshape(-1, 1))
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.54
