In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup
import re
import nltk

# Fetch only the NLTK resources this notebook uses (the English stop-word list
# and the WordNet data behind WordNetLemmatizer) instead of the whole "all"
# collection, which is far larger than needed and slows every fresh run.
for nltk_resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource, quiet=True)

In [None]:
!pip install beautifulsoup4

# Unzip folders

In [None]:
!unzip /kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip
!unzip /kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip
!unzip  /kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip

# Read the data

In [None]:
# Tab-separated file whose first row holds the column names; quoting=3 keeps
# quote characters as ordinary text instead of treating them as field quoting.
train = pd.read_csv(
    "./labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

Here, "header=0" (which is also the pandas default) indicates that the first line of the file contains column names, "delimiter=\t" indicates that the fields are separated by tabs, and "quoting=3" tells pandas to treat double quotes as ordinary characters rather than as field quoting; otherwise you may encounter errors trying to read the file.

In [None]:
train.head()

In [None]:
#Sample review
print(train['review'][0])

# Text Preprocessing Pipeline

In [None]:
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus object
# to the English stop-word collection it returns; clean_tweets takes that
# collection as the default value of its `stopwords` parameter.
stopwords=stopwords.words("english")

from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance, referenced by name inside clean_tweets.
wordnet_lemmatizer = WordNetLemmatizer()



In [None]:
def clean_tweets(raw_text,stopwords=stopwords):
    '''Clean one raw review and return it as a single space-joined string.

    Steps, in order: strip HTML markup, replace every non-letter character
    with a space, lowercase and split on whitespace, drop stop words, and
    lemmatize each remaining token as a verb.

    Parameters
    ----------
    raw_text : str
        Raw review text, possibly containing HTML markup.
    stopwords : iterable of str, optional
        Words to discard. Defaults to the module-level stop-word list.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces. One-character tokens are
        not filtered out.
    '''
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning about guessing.
    html_removed_text = BeautifulSoup(raw_text, "html.parser").get_text()

    # Keep ASCII letters only; everything else becomes a separator.
    character_only_text = re.sub("[^a-zA-Z]", " ", html_removed_text)

    lower_text = character_only_text.lower().split()

    # A set gives constant-time membership tests for every token.
    stop_set = set(stopwords)
    stop_remove_text = [token for token in lower_text if token not in stop_set]

    # 'v' asks the lemmatizer to treat every token as a verb.
    lemma_removed_text = [wordnet_lemmatizer.lemmatize(token, 'v') for token in stop_remove_text]

    return " ".join(lemma_removed_text)
    

In [None]:
# check on sample
train.loc[:1,"review"].apply(clean_tweets)[0]

In [None]:
# original review
train.loc[0,"review"]

In [None]:
train['clean_review']=train['review'].apply(clean_tweets)

In [None]:
train.head()

In [None]:
from collections import Counter
word_counter=Counter(" ".join(train['clean_review'].tolist()).split())

In [None]:
word_counter.most_common(4)

In [None]:
# In this dataset sentiment == 1 marks a positive review and sentiment == 0 a
# negative one, so each counter must filter on the matching label.

#Top words in negative reviews
negative_word_counter=Counter(" ".join(train.loc[train['sentiment']==0,"clean_review"].tolist()).split())

#Top words in positive reviews
positive_word_counter=Counter(" ".join(train.loc[train['sentiment']==1,"clean_review"].tolist()).split())

In [None]:
negative_word_counter.most_common(10)

In [None]:
positive_word_counter.most_common(10)

In [None]:
# Baseline Model
# If you see a high overlap in unigrams between the two categories (here, positive and negative),
# then the next thing to try is looking at bigrams or trigrams.

# Bag of Words - Model

## Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

## Split the data

In [None]:
X=train['clean_review'] #Predictors
y=train['sentiment'] #Target

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [None]:
def create_vector(vectorizer,data):
    '''Turn `data` into a dense feature matrix using an already-fitted vectorizer.

    `data` must expose `.tolist()`; the resulting list of documents is passed
    to `vectorizer.transform`, and the result's `.toarray()` is returned.
    '''
    documents = data.tolist()
    sparse_features = vectorizer.transform(documents)
    dense_features = sparse_features.toarray()
    return dense_features
    

In [None]:
vectorizer = CountVectorizer(max_features=1000)
vectorizer.fit(X_train.tolist())

In [None]:
X_train_vector=create_vector(vectorizer,X_train)
X_test_vector=create_vector(vectorizer,X_test)

In [None]:
X_test_vector.shape, X_train_vector.shape

# Create ML Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# TRY with XGBOOST,SVM

In [None]:
# Seed the forest so the fitted model, and the report printed below, are the
# same on every run (42 matches the seed used for the train/test split).
forest=RandomForestClassifier(random_state=42)
forest.fit(X_train_vector,y_train)


y_pred=forest.predict(X_test_vector)
print(classification_report(y_test,y_pred))

# Lets make a submission

In [None]:
# Load the test set and clean its reviews with the same function used for the training reviews.
# NOTE(review): unlike the labeled and unlabeled reads, this call does not pass quoting=3 —
# confirm the file parses to the expected rows under pandas' default quote handling.
test=pd.read_csv("./testData.tsv",delimiter="\t")
test['clean_review']=test['review'].apply(clean_tweets)

In [None]:
# Vectorize the cleaned test reviews with the vectorizer fitted on X_train,
# then predict with the forest trained on the count vectors.
test_feature_vector=create_vector(vectorizer,test['clean_review'])
test_predictions=forest.predict(test_feature_vector)

# Store the predictions on the frame and write only the id/sentiment columns to the submission file.
test['sentiment']=test_predictions
test[['id','sentiment']].to_csv("submission_file_rf_count.csv",index=False)

## TFIDF Vectorizer

In [None]:
# ----- PLEASE TRY THIS ------

# Word2Vec Model

In [None]:
from gensim.models import Word2Vec

# Load the unlabeled reviews (same read options as the labeled training file)
# and clean them with clean_tweets so they can join the Word2Vec corpus.
train_unlabelled=pd.read_csv("./unlabeledTrainData.tsv",delimiter="\t",quoting=3)
train_unlabelled['clean_review']=train_unlabelled['review'].apply(clean_tweets)

In [None]:
# Pool the cleaned reviews of the labeled, test and unlabeled sets, in that
# order, into one flat list of strings.
sentences = [
    *train['clean_review'],
    *test['clean_review'],
    *train_unlabelled['clean_review'],
]

In [None]:
#remove duplicate sentences, if any
# dict.fromkeys keeps the first occurrence of each sentence in its original
# position. A plain set() would return the strings in an order that changes
# between kernel starts, so the corpus order fed to Word2Vec would differ from
# run to run.
sentences=list(dict.fromkeys(sentences))

In [None]:
len(sentences)

# Train a custom word2vec model

## The parameters:

* `min_count` <font color='purple'>=</font> <font color='green'>int</font> - Ignores all words with total absolute frequency lower than this - (2, 100)


* `window` <font color='purple'>=</font> <font color='green'>int</font> - The maximum distance between the current and predicted word within a sentence. E.g. `window` words on the left and `window` words on the right of our target - (2, 10)


* `vector_size` <font color='purple'>=</font> <font color='green'>int</font> - Dimensionality of the feature vectors (this parameter was named `size` in gensim versions before 4.0). - (50, 300)


* `sample` <font color='purple'>=</font> <font color='green'>float</font> - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential.  - (0, 1e-5)


* `alpha` <font color='purple'>=</font> <font color='green'>float</font> - The initial learning rate - (0.01, 0.05)


* `min_alpha` <font color='purple'>=</font> <font color='green'>float</font> - Learning rate will linearly drop to `min_alpha` as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00


* `negative` <font color='purple'>=</font> <font color='green'>int</font> - If > 0, negative sampling will be used; the int for negative specifies how many "noise words" should be drawn. If set to 0, no negative sampling is used. - (5, 20)


* `workers` <font color='purple'>=</font> <font color='green'>int</font> - Use these many worker threads to train the model (=faster training with multicore machines)

In [None]:
# Tokenize each sentence into a list of words. Entries that are already lists
# are passed through, so re-running this cell does not fail on the tokenized
# output of a previous run.
sentences=[entry.split() if isinstance(entry, str) else entry for entry in sentences]

In [None]:
sentences[0]

In [None]:
#Beginner
import multiprocessing

# No `del w2v_model` here: on a fresh kernel the name does not exist yet, so
# deleting it raises NameError; plain assignment replaces any earlier model.
# `workers` must be a positive thread count — with a non-positive value no
# worker threads are started and the vectors are never actually trained.
w2v_model = Word2Vec(sentences=sentences,min_count=20,
                     window=2,
                     vector_size=100,
                     workers=max(1, multiprocessing.cpu_count()-1))
w2v_model.wv.most_similar("great")

In [None]:
# For advanced users, the model is created in three steps:
# construct it here, then build the vocabulary, then train.
import multiprocessing
cores = multiprocessing.cpu_count()

w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     # Leave one core free, but never drop to zero workers on
                     # a single-core machine.
                     workers=max(1, cores-1))

In [None]:

# w2v_model = Word2Vec(min_count=20,
#                      window=2,
#                      vector_size=300,
#                      sample=6e-5, 
#                      alpha=0.03, 
#                      min_alpha=0.0007, 
# #                      negative=20,
#                      workers=-1)

## Building the Vocabulary Table:
Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):

In [None]:
from time import time
# Start timestamp for the elapsed-time message printed at the end of the cell.
t = time()

# Build the model's vocabulary from the tokenized corpus.
w2v_model.build_vocab(sentences, progress_per=10000)

# Elapsed wall-clock time in minutes, rounded to two decimals.
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

## Training of the model:
_Parameters of the training:_
* `total_examples` <font color='purple'>=</font> <font color='green'>int</font> - Count of sentences;
* `epochs` <font color='purple'>=</font> <font color='green'>int</font> - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [None]:
# Restart the timer for the training step.
t = time()

# Train for 10 epochs over the same corpus; total_examples reuses the sentence
# count the model stored as corpus_count.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

# Elapsed wall-clock time in minutes, rounded to two decimals.
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
w2v_model.wv.most_similar("leave")

In [None]:
w2v_model.wv.most_similar(positive=["home"])

In [None]:
w2v_model.wv.similarity("stupid", 'worse')

In [None]:
w2v_model.wv.doesnt_match(['great', 'stupid', 'good'])

### t-SNE visualizations:
t-SNE is a non-linear dimensionality reduction algorithm that attempts to represent high-dimensional data and the underlying relationships between vectors in a lower-dimensional space.<br>
Here is a good tutorial on it: https://medium.com/@luckylwk/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

Our goal in this section is to plot our 300-dimensional vectors in 2-dimensional graphs and see if we can spot interesting patterns.<br>
For that we are going to use t-SNE implementation from scikit-learn.

To make the visualizations more relevant, we will look at the relationships between a query word (in <font color='red'>**red**</font>), its most similar words in the model (in <font color="blue">**blue**</font>), and other words from the vocabulary (in <font color='green'>**green**</font>).

In [None]:
def tsnescatterplot(model, word, list_names):
    """Plot a 2-D t-SNE projection of a query word and two groups of other words.

    The query word is drawn in red, the words returned by
    ``model.wv.most_similar([word])`` in blue, and the words of ``list_names``
    in green. Vectors are first reduced with PCA and then embedded with t-SNE.

    Parameters
    ----------
    model : object exposing ``.wv``
        Trained word-vector model; ``model.wv`` must support list lookups and
        ``most_similar``. Any vector dimensionality is accepted.
    word : str
        Query word. Must be present in the model's vocabulary.
    list_names : iterable of str
        Additional words to plot. Each must be present in the vocabulary.

    Returns
    -------
    None
        The plot is drawn on a newly created matplotlib figure.
    """
    # most_similar yields (word, score) pairs; only the words are needed.
    close_words = [similar for similar, _score in model.wv.most_similar([word])]
    other_words = list(list_names)

    word_labels = [word] + close_words + other_words
    color_list = ['red'] + ['blue'] * len(close_words) + ['green'] * len(other_words)

    # One list lookup returns one row per word, so the array width follows the
    # model's own vector size instead of a hard-coded 300.
    arrays = np.asarray(model.wv[word_labels], dtype='f')
    n_samples, n_features = arrays.shape

    # Reduce to at most 14 dimensions with PCA; a full SVD cannot keep more
    # components than there are samples or features.
    n_components = min(14, n_samples, n_features)
    reduc = PCA(n_components=n_components, svd_solver='full').fit_transform(arrays)

    np.set_printoptions(suppress=True)

    # t-SNE needs a perplexity below the number of samples.
    perplexity = min(15, max(1, n_samples - 1))
    Y = TSNE(n_components=2, random_state=0, perplexity=perplexity).fit_transform(reduc)

    # Collect coordinates, labels and colours for plotting.
    df = pd.DataFrame({'x': Y[:, 0],
                       'y': Y[:, 1],
                       'words': word_labels,
                       'color': color_list})

    _, ax = plt.subplots(figsize=(9, 9))

    # Scatter only (no regression line), coloured per word group.
    sns.regplot(data=df,
                x="x",
                y="y",
                fit_reg=False,
                marker="o",
                scatter_kws={'s': 40,
                             'facecolors': df['color']},
                ax=ax)

    # Label every point with its (title-cased) word.
    for row in df.itertuples(index=False):
        ax.text(row.x,
                row.y,
                '  ' + row.words.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=row.color,
                weight='normal')

    # Pad the axes so labels near the edges stay inside the figure.
    ax.set_xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    ax.set_ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    ax.set_title('t-SNE visualization for {}'.format(word.title()))
    

In [None]:
tsnescatterplot(w2v_model, 'good',  [i[0] for i in w2v_model.wv.most_similar(positive=["bad"])])

In [None]:
num_features=300



In [None]:
def get_vectors(model,sentence):
    '''Return the mean word vector of a whitespace-separated sentence.

    Parameters
    ----------
    model : object exposing ``.wv``
        ``model.wv[token]`` must return the token's vector and raise KeyError
        for unknown tokens; ``model.wv.vector_size`` gives the vector length.
    sentence : str
        Text whose tokens are separated by whitespace.

    Returns
    -------
    numpy.ndarray
        Element-wise average of the vectors of all in-vocabulary tokens. When
        no token is in the vocabulary (including an empty sentence) a zero
        vector of length ``model.wv.vector_size`` is returned, so callers never
        receive NaN.
    '''
    vectors=[]
    for token in sentence.split():
        try:
            vectors.append(model.wv[token])
        except KeyError:
            # Out-of-vocabulary token: skip it. Other errors are not hidden.
            continue

    if not vectors:
        # Averaging an empty list would yield NaN and poison the feature matrix.
        return np.zeros(model.wv.vector_size, dtype="float32")

    return np.average(vectors,axis=0)
            
        

In [None]:
get_vectors(w2v_model,"this is good today okay thats fine")

In [None]:
# Looking up a word that is not in the vocabulary raises KeyError, which would
# stop a "Run All"; catch it and show the message instead.
try:
    display(w2v_model.wv['aayush'])
except KeyError as err:
    print("KeyError:", err)

In [None]:
get_vectors(w2v_model,"aayush is a good actor and prove his skills")

In [None]:
def get_doc_vectors(model,documents,num_features=300):
    '''Build a (len(documents), num_features) float32 matrix of sentence vectors.

    Row i holds the result of get_vectors(model, documents[i]) for the i-th
    document in iteration order. A status line is printed every 1000 documents.
    '''
    total = len(documents)

    # Allocate the whole result up front and fill it row by row.
    feature_matrix = np.zeros((total, num_features), dtype="float32")

    for row, sentence in enumerate(documents):
        if row % 1000 == 0:
            print("Review %d of %d" % (row, total))
        feature_matrix[row] = get_vectors(model, sentence)

    return feature_matrix
    

In [None]:
documents=["hey","hey this"]

In [None]:
#sample
get_doc_vectors(w2v_model,documents,num_features=300).shape

In [None]:
X_train_w2v_vectors=get_doc_vectors(w2v_model,X_train,num_features=300)
X_test_w2v_vectors=get_doc_vectors(w2v_model,X_test,num_features=300)

In [None]:
X_train_w2v_vectors.shape, X_test_w2v_vectors.shape

In [None]:
# Seed the forest so the report below is reproducible for a given set of vectors.
forest=RandomForestClassifier(random_state=42)
forest.fit(X_train_w2v_vectors,y_train)
y_pred=forest.predict(X_test_w2v_vectors)
print(classification_report(y_test,y_pred))

# Pretrained Models

1. 1. 1. 

In [None]:
import gensim.downloader
print(gensim.downloader.info()['models'].keys())

In [None]:
glove_vectors = gensim.downloader.load('glove-twitter-25')

In [None]:
def get_pretrained_vectors(model,sentence):
    '''Return the mean pretrained word vector of a whitespace-separated sentence.

    Parameters
    ----------
    model : keyed-vector object
        ``model[token]`` must return the token's vector and raise KeyError for
        unknown tokens; ``model.vector_size`` gives the vector length.
    sentence : str
        Text whose tokens are separated by whitespace.

    Returns
    -------
    numpy.ndarray
        Element-wise average of the vectors of all known tokens. When no token
        is known (including an empty sentence) a zero vector of length
        ``model.vector_size`` is returned, so callers never receive NaN.
    '''
    vectors=[]
    for token in sentence.split():
        try:
            vectors.append(model[token])
        except KeyError:
            # Token missing from the pretrained vocabulary: skip it.
            continue

    if not vectors:
        # Averaging an empty list would yield NaN and poison the feature matrix.
        return np.zeros(model.vector_size, dtype="float32")

    return np.average(vectors,axis=0)
            
    
def get_pretrained_models(model,documents,num_features=25):
    '''Build a (len(documents), num_features) float32 matrix of sentence vectors.

    Row i holds the result of get_pretrained_vectors(model, documents[i]) for
    the i-th document in iteration order. A status line is printed every 1000
    documents.
    '''
    total = len(documents)

    # Allocate the whole result up front and fill it row by row.
    feature_matrix = np.zeros((total, num_features), dtype="float32")

    for row, sentence in enumerate(documents):
        if row % 1000 == 0:
            print("Review %d of %d" % (row, total))
        feature_matrix[row] = get_pretrained_vectors(model, sentence)

    return feature_matrix
    

In [None]:
# Average 25-dimensional GloVe vectors per review for the train and test splits.
X_train_w2v_vectors=get_pretrained_models(glove_vectors,X_train,num_features=25)
X_test_w2v_vectors=get_pretrained_models(glove_vectors,X_test,num_features=25)

# Seed the forest so the report below is reproducible.
forest=RandomForestClassifier(random_state=42)
forest.fit(X_train_w2v_vectors,y_train)
y_pred=forest.predict(X_test_w2v_vectors)
print(classification_report(y_test,y_pred))

# Reference
https://www.kaggle.com/aayushkubba/twitter-sentiment-analysis-word2vec-doc2vec
https://www.kaggle.com/aayushkubba/nlp-word2vec