**# CBOW AND SKIPGRAM MODEL:**



DESCRIPTION: In the CBOW model, the distributed representations of context (or surrounding words) are combined to predict the word in the middle. While in the Skip-gram model, the distributed representation of the input word is used to predict the context.

In [None]:
# For Data Preprocessing
import pandas as pd

import gensim
from gensim.models import Word2Vec,KeyedVectors

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv("/content/drive/MyDrive/NLP/revpre.csv")
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Review,Rating,label,Positive Feedback Count
0,0,0,'absolutely wonderful silky sexy comfortable ',4,1,0
1,1,1,'love dress sooo pretty happened find store im...,5,1,4
2,2,2,'high hopes dress really wanted work initially...,3,0,0
3,3,3,'love love love jumpsuit fun flirty fabulous e...,5,1,0
4,4,4,'this shirt flattering due adjustable front ti...,5,1,6


In [None]:
df['Review']= df['Review'].astype(str)

# Train CBOW Word2Vec Model



In [None]:
sentences = []
for review in df.Review.values:
    sentences.append(review.split())

In [None]:
num_features=100
model_cbow = Word2Vec(sentences, sg=0, min_count=10, workers=4, window =3, epochs = 20, vector_size=num_features)

In [None]:
import numpy as np
def make_feature_vec(words, model,num_features):
    # Function to average all of the word vectors in a given paragraph
    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in model.wv.key_to_index:
            feature_vec = np.add(feature_vec, model.wv.get_vector(word))
            nwords += 1
    if nwords > 0:
        feature_vec = np.divide(feature_vec, nwords)
    return feature_vec

def get_avg_feature_vecs(reviews, model, num_features):
    # Function to generate vectors for all movie reviews in a dataset
    counter = 0
    review_feature_vecs = np.zeros((len(reviews), num_features), dtype="float32")
    for review in reviews:
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
        counter += 1
    return review_feature_vecs

# Convert the training and test data into fixed-length feature vectors
data_vecs = get_avg_feature_vecs(sentences, model_cbow, num_features)

In [None]:
model_cbow.wv.most_similar("good")

[('great', 0.763328492641449),
 ('nice', 0.6237483620643616),
 ('excellent', 0.5457696318626404),
 ('decent', 0.542405903339386),
 ('fantastic', 0.5358585119247437),
 ('awesome', 0.5292093753814697),
 ('ok', 0.5275862216949463),
 ('okay', 0.5243579149246216),
 ('best', 0.5119372606277466),
 ('amazing', 0.5114009380340576)]

In [None]:
def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []
    
    for word in model.wv.key_to_index:
        
        tokens.append(model.wv[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_vecs, df.label, test_size=0.2)

Training ml models

In [None]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)


clf = GaussianNB()
clf.fit(scaled_train_embed, y_train)
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.38      0.79      0.51       834
           1       0.94      0.72      0.82      3864

    accuracy                           0.73      4698
   macro avg       0.66      0.76      0.67      4698
weighted avg       0.84      0.73      0.76      4698



In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(scaled_train_embed, y_train)
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.38      0.52       834
           1       0.88      0.98      0.93      3864

    accuracy                           0.87      4698
   macro avg       0.84      0.68      0.72      4698
weighted avg       0.87      0.87      0.85      4698



In [None]:
from sklearn.svm import SVC 
classifier = SVC(kernel='poly', random_state=0)  
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.60      0.61       834
           1       0.91      0.92      0.92      3864

    accuracy                           0.87      4698
   macro avg       0.77      0.76      0.76      4698
weighted avg       0.86      0.87      0.86      4698



In [None]:
classifier = SVC(kernel='linear', random_state=0)  
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.55      0.63       834
           1       0.91      0.96      0.93      3864

    accuracy                           0.89      4698
   macro avg       0.83      0.76      0.78      4698
weighted avg       0.88      0.89      0.88      4698



In [None]:
classifier = SVC(kernel='rbf', random_state=0)  
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.56      0.64       834
           1       0.91      0.96      0.93      3864

    accuracy                           0.89      4698
   macro avg       0.83      0.76      0.79      4698
weighted avg       0.88      0.89      0.88      4698



In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(scaled_train_embed, y_train)

In [None]:
y_pred = knn.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.36      0.48       834
           1       0.88      0.97      0.92      3864

    accuracy                           0.86      4698
   macro avg       0.79      0.66      0.70      4698
weighted avg       0.84      0.86      0.84      4698



# Train Skip-Gram Word2Vec Model

In [None]:
num_features=100
model_skipgram = Word2Vec(sentences, sg=1, min_count=10, workers=4, window =3, epochs = 20, vector_size=num_features)

In [None]:
model_skipgram.wv.most_similar("good")

[('great', 0.7387205362319946),
 ("'good", 0.6631464958190918),
 ('nice', 0.6092362403869629),
 ('bad', 0.5412139892578125),
 ('okay', 0.5135904550552368),
 ('terrific', 0.5058111548423767),
 ('fantastic', 0.5010529160499573),
 ('awesome', 0.49837467074394226),
 ('odd', 0.49555808305740356),
 ("'nice", 0.4855171740055084)]

In [None]:
import numpy as np

In [None]:
import numpy as np
def make_feature_vec(words, model,num_features):
    # Function to average all of the word vectors in a given paragraph
    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in model.wv.key_to_index:
            feature_vec = np.add(feature_vec, model.wv.get_vector(word))
            nwords += 1
    if nwords > 0:
        feature_vec = np.divide(feature_vec, nwords)
    return feature_vec

def get_avg_feature_vecs(reviews, model, num_features):
    # Function to generate vectors for all movie reviews in a dataset
    counter = 0
    review_feature_vecs = np.zeros((len(reviews), num_features), dtype="float32")
    for review in reviews:
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
        counter += 1
    return review_feature_vecs

# Convert the training and test data into fixed-length feature vectors
data_vecs_sg = get_avg_feature_vecs(sentences, model_cbow, num_features)

In [None]:
df['Review']=str(df['Review'])

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Review, df.label, test_size=0.2)

In [None]:
tsne_plot(model_cbow)

AttributeError: ignored

In [None]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)


clf = GaussianNB()
clf.fit(scaled_train_embed, y_train)
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

ValueError: ignored

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(scaled_train_embed, y_train)
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       841
           1       0.82      1.00      0.90      3857

    accuracy                           0.82      4698
   macro avg       0.41      0.50      0.45      4698
weighted avg       0.67      0.82      0.74      4698



In [None]:
from sklearn.svm import SVC 
classifier = SVC(kernel='poly', random_state=0)  
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.16      0.07      0.10       841
           1       0.82      0.92      0.87      3857

    accuracy                           0.77      4698
   macro avg       0.49      0.50      0.48      4698
weighted avg       0.70      0.77      0.73      4698



In [None]:
classifier = SVC(kernel='linear', random_state=0)  
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       841
           1       0.82      1.00      0.90      3857

    accuracy                           0.82      4698
   macro avg       0.41      0.50      0.45      4698
weighted avg       0.67      0.82      0.74      4698



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
classifier = SVC(kernel='rbf', random_state=0)  
classifier.fit(scaled_train_embed, y_train)
y_pred = classifier.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       841
           1       0.82      1.00      0.90      3857

    accuracy                           0.82      4698
   macro avg       0.41      0.50      0.45      4698
weighted avg       0.67      0.82      0.74      4698



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(scaled_train_embed, y_train)

In [None]:
y_pred = knn.predict(scaled_test_embed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.16      0.04      0.06       841
           1       0.82      0.96      0.88      3857

    accuracy                           0.79      4698
   macro avg       0.49      0.50      0.47      4698
weighted avg       0.70      0.79      0.74      4698

