## Natural Language Processing - Word Embedding

In [None]:
import warnings
warnings.filterwarnings('ignore')
import os

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from bs4 import BeautifulSoup

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.downloader import download
import re

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from keras.layers import Embedding

from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

## Different Approaches

## Training own word2vec Model

In [None]:
# Toy corpus: six short comments about social media; the first three carry
# class 1 and the last three carry class 0.
data = {
    'comments': [
        'Social media can help people connect with others, learn, and be creative',
        ' It can also be a way to manage social anxiety and access support',
        'Social media can also be a way for businesses to promote their products',
        'Social media can lead to addiction, isolation, and poor mental health',
        'It can also expose people to inappropriate content, cyberbullying, and privacy and data breaches',
        'Social media can also lead to unhealthy comparisons and unrealistic body image',
    ],
    'class': [1, 1, 1, 0, 0, 0],
}
# Column order follows the dict's insertion order: 'comments', then 'class'.
df = pd.DataFrame(data, columns=list(data))
df

In [None]:
stop_words = set(stopwords.words('english'))
lemm = WordNetLemmatizer()


def clean_data(x):
    """Turn one raw comment into a list of lemmatized, stop-word-free tokens.

    Steps: strip HTML markup, lowercase, replace every non-letter character
    with a space, tokenize with NLTK, drop English stop words, lemmatize.

    Parameters
    ----------
    x : str
        Raw text, possibly containing HTML markup.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed (and warns), so output could vary between machines.
    x_text = BeautifulSoup(x, 'html.parser').get_text()
    x_text = re.sub('[^A-Za-z]', ' ', x_text.lower())
    x_tokens = word_tokenize(x_text)
    # Stop words are filtered on the surface form, before lemmatization.
    return [lemm.lemmatize(word) for word in x_tokens if word not in stop_words]

In [None]:
# Run clean_data on every comment and keep the resulting token lists in a new 'token' column.
df['token'] = df['comments'].apply(clean_data)
df

In [None]:
# Train a Word2Vec model on the token lists of the toy corpus.
# min_count=1 is passed explicitly; presumably so that words seen only once are kept — the corpus has just six sentences.
w2v_model = Word2Vec(df['token'],min_count=1)
print(w2v_model)
print(w2v_model.wv.key_to_index.keys()) #vocabulary

In [None]:
# Words whose learned vectors are most similar to the vector for 'social'.
w2v_model.wv.most_similar('social')

In [None]:
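# The call below raises KeyError: 'hello' never appears in the training comments,
# so the model has no vector for it.
#w2v_model.wv.most_similar('hello')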

## Create Embedding using Keras

In [None]:
# Number of distinct lowercase, whitespace-separated tokens across all comments (punctuation stays attached to words).
len(set(df['comments'].str.cat(sep=' ').lower().split()))

In [None]:
# Vocabulary-size estimate: distinct lowercase, whitespace-delimited tokens
# in the concatenation of all comments.
joined_comments = df['comments'].str.cat(sep=' ')
n = len(set(joined_comments.lower().split()))

# Encode every comment as a list of integers with Keras' one_hot, using n as the vocabulary size.
encoded_ss = [one_hot(comment, n) for comment in df['comments']]
encoded_ss

In [None]:
# Pad every sequence to the length of the longest encoded sentence.
# max() on a list of lists compares them element by element (lexicographically),
# not by size, so the lengths themselves have to be compared.
length = max(len(seq) for seq in encoded_ss)
# padding='pre' puts the zeros in front of the shorter sequences.
padding = pad_sequences(encoded_ss, maxlen=length, padding='pre')
padding

In [None]:
# Embedding -> Flatten -> one sigmoid unit: a minimal binary classifier over the padded sequences.
seq_model = Sequential([
    Embedding(n, 8, input_length=length),  # n index slots, 8-dimensional vector per index
    Flatten(),
    Dense(1, activation='sigmoid'),
])
seq_model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is tracked as an extra metric.
seq_model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs on the padded toy sentences and their class labels.
seq_model.fit(padding,df['class'],epochs=10)

In [None]:
# NOTE: this evaluates on the same inputs and labels that were passed to fit(),
# so the numbers describe how well the model fits the training data, not how it generalizes.
loss, accuracy = seq_model.evaluate(padding,df['class'])

In [None]:
# Predict on the training sentences and threshold the sigmoid outputs at 0.5 to get 0/1 labels.
predictions = seq_model.predict(padding)
y_pred = (predictions>0.5).astype(int)
y_pred

In [None]:
# Two unseen sentences to score with the trained model.
new_sentences = [
    'Building Relationships and connect, also to promote the products',
    'Social media can cause sleeplessness,security breach',
]

# Encode them with the same vocabulary size n that was used for the training comments.
new_encoded_ss = [one_hot(text, n) for text in new_sentences]
new_encoded_ss

In [None]:
# Pad (or truncate) the new sequences to the same `length` used for the training sequences.
new_padding = pad_sequences(new_encoded_ss ,maxlen=length,padding='pre')
new_padding

In [None]:
# Score the new sentences; `predictions` and `y_pred` from the earlier cell are overwritten here.
predictions = seq_model.predict(new_padding)
y_pred = (predictions>0.5).astype(int)
y_pred

## Using Pretrained Model

## Import Data

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Column names are supplied via names=cols, i.e. the CSV is read as having no header row.
cols = ['id','entity','target','tweets']
data = pd.read_csv('/kaggle/input/twitter_training.csv', names=cols)
# NOTE(review): read_csv already returns a DataFrame, so this wrap looks redundant.
# From here on `df` refers to the tweet training data, not the toy corpus defined earlier.
df = pd.DataFrame(data)
df.head(5)

In [None]:
# Load the validation file with the same column names as the training file.
cols = ['id','entity','target','tweets']
data = pd.read_csv('/kaggle/input/twitter_validation.csv', names=cols)
# NOTE(review): read_csv already returns a DataFrame, so this wrap looks redundant.
df_test = pd.DataFrame(data)
df_test.head(5)

In [None]:
# info() prints its report and returns None, so call it once per frame rather than
# building a tuple whose displayed value would only be (None, None).
df.info()
df_test.info()

In [None]:
# Missing-value count per column, for the training frame and the validation frame.
df.isna().sum(),df_test.isna().sum()

In [None]:
# Drop rows with any missing value from both frames. The calls are separate
# statements because dropna(inplace=True) returns None; packing them into a tuple
# only displays (None, None).
df.dropna(inplace=True)
df_test.dropna(inplace=True)
# Show what is left so the effect of the drop is visible.
df.shape, df_test.shape

In [None]:
stop_words = set(stopwords.words('english'))
# NOTE(review): word_lemm is not used by preprocess_data (no lemmatization happens
# here); it is kept defined in case another cell relies on it.
word_lemm = WordNetLemmatizer()


def preprocess_data(x):
    """Turn one raw text into a list of lowercase, stop-word-free tokens.

    Steps: strip HTML markup, lowercase, replace every non-letter character
    with a space, tokenize with NLTK, drop English stop words.

    Parameters
    ----------
    x : str
        Raw text, possibly containing HTML markup.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed (and warns), so output could vary between machines.
    x_text = BeautifulSoup(x, 'html.parser').get_text()
    x_spl = re.sub("[^a-zA-Z]", " ", x_text.lower())
    x_tokens = word_tokenize(x_spl)
    return [w for w in x_tokens if w not in stop_words]

In [None]:
def preprocess_sent(data):
    """Split a text into sentences and tokenize each one with preprocess_data.

    Parameters
    ----------
    data : str
        Raw text; leading and trailing whitespace is stripped before splitting.

    Returns
    -------
    list of list of str
        One token list per non-empty sentence found by nltk.sent_tokenize.
    """
    sentences = []
    for raw in nltk.sent_tokenize(data.strip()):
        if len(raw) > 0:
            sentences.append(preprocess_data(raw))
    return sentences

In [None]:
# Flatten all tweets into a single list of tokenized sentences; this list is the
# corpus handed to Word2Vec further down.
sentence = [
    tokens
    for tweet in df['tweets']
    for tokens in preprocess_sent(tweet)
]

Worker threads: the number of parallel threads used for training. Note that, unlike scikit-learn, gensim does not accept -1 as a shortcut for "use all processors".

Downsampling of frequent words : According to the Google documentation, values between 0.00001 and 0.001 would suffice.

Context : How many words around the target word will be used?

Minimum word count: This helps limit the size of the vocabulary to meaningful words. Any word that does not occur at least this many times across all documents is ignored. Reasonable values could be between 10 and 100. The reason for choosing 40 is that each movie has 30 reviews, which repeat its title 30 times; to avoid attaching too much importance to individual movie titles, the minimum count is set to 40.

In [None]:
# Word2Vec hyperparameters described in the notes above.
# NOTE(review): the training cell further down calls Word2Vec(sentence) without
# passing any of these, so as written they have no effect on the model.
num_features = 250    # meant for vector_size
min_count = 40        # meant for min_count
num_processor = 4     # meant for workers
context = 10          # meant for window
downsampling = 0.001  # meant for sample

In [None]:
# workers = num_processor, 
# vector_size = num_features, min_count = min_count,
# window = context, sample = downsampling

In [None]:
# Train Word2Vec on the tokenized tweet sentences. No hyperparameters are passed,
# so the library defaults apply rather than the values configured above.
w2v_model = Word2Vec(sentence)

In [None]:
# L2-normalize every word vector in place. Word2Vec.init_sims(replace=True) is
# deprecated in gensim 4 (the API the rest of this notebook uses, e.g. key_to_index);
# KeyedVectors.unit_normalize_all() is the supported call with the same effect.
# The model cannot sensibly be trained further after this step.
w2v_model.wv.unit_normalize_all()

In [None]:
# Save the trained model to a file in the current working directory.
# NOTE(review): the file name advertises 250 features / min count 40 / context 20, but
# the model was built with Word2Vec(sentence) and `context` is set to 10 — confirm the intended name.
model_name = '250features_40minwords_20context'
w2v_model.save(model_name)

In [None]:
# Words whose vectors are most similar to 'borders'; this presumably raises KeyError if 'borders' is not in the vocabulary.
w2v_model.wv.most_similar('borders')

In [None]:
# Show two example tweets (index labels 0 and 6) next to their preprocessed form.
for header, row_label in (('Sample1', 0), ('\nSample2', 6)):
    sample = df['tweets'][row_label]
    print(header)
    print('Original:', sample)
    print('\nProcessed Sentence:', preprocess_sent(sample))

In [None]:
# Tokenize each whole tweet (no sentence splitting here) into a 'words' column, for both frames.
df['words'] = df['tweets'].apply(preprocess_data)
df_test['words'] = df_test['tweets'].apply(preprocess_data)

In [None]:
def getavgvec(words, model, vector_dim):
    """Average the word vectors of the in-vocabulary tokens in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    model : object
        Trained Word2Vec model; only ``model.wv.key_to_index`` and
        ``model.wv.get_vector`` are used.
    vector_dim : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(vector_dim,)``: the mean of the vectors of all tokens
        found in the vocabulary. If no token is in the vocabulary (including an
        empty ``words``), the all-zero vector is returned instead of NaNs.
    """
    featureVec = np.zeros((vector_dim,), dtype='float32')
    number_of_words = 0.

    # key_to_index is a dict, so membership tests are O(1) and the vocabulary
    # does not have to be copied into a fresh set on every call.
    vocabulary = model.wv.key_to_index

    for word in words:
        if word in vocabulary:
            number_of_words += 1.
            featureVec = np.add(featureVec, model.wv.get_vector(word))

    # Guard against 0/0: with no known words there is nothing to average.
    if number_of_words == 0:
        return featureVec
    return np.divide(featureVec, number_of_words)

In [None]:
# Represent every tweet by the average of its word vectors, for train and test alike.
for frame in (df, df_test):
    frame['avg_vec'] = frame['words'].apply(
        lambda tokens: getavgvec(words=tokens, model=w2v_model, vector_dim=w2v_model.vector_size)
    )

In [None]:
# Preview the first rows, now including the 'words' and 'avg_vec' columns.
df.head(5)

In [None]:
# Stack the per-tweet average vectors into 2-D feature matrices and replace any
# NaN (a tweet with no in-vocabulary word) with 0.
avg_feature_vec = np.nan_to_num(np.array(list(df['avg_vec'])))
test_avg_feature_vec = np.nan_to_num(np.array(list(df_test['avg_vec'])))

# A bare expression in the middle of a cell is never displayed, so the NaN checks
# are assertions: the cell fails loudly if a NaN survives.
assert not np.isnan(avg_feature_vec).any()
assert not np.isnan(test_avg_feature_vec).any()

avg_feature_vec.shape, test_avg_feature_vec.shape

In [None]:
# Fit the label encoder on the training targets only, then reuse the same mapping for the validation targets.
encoder = LabelEncoder()
df['target_encoded'] = encoder.fit_transform( df['target'])
df_test['target_encoded'] = encoder.transform( df_test['target'])

In [None]:
# 5-fold stratified cross-validation over a random-forest hyperparameter grid.
kfold = StratifiedKFold(n_splits=5)
rfc = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200, 400, 800],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 8, 16, 32],
    'criterion': ['gini', 'entropy'],
}

# The plain 'roc_auc' scorer only supports binary targets; on a multiclass target
# every fit scores NaN and the search result is meaningless. The sentiment target
# here appears to have more than two classes (TODO confirm), so use the
# one-vs-rest variant, which also works for a binary target.
gs_rfc = GridSearchCV(
    rfc,
    param_grid=param_grid,
    verbose=1,
    cv=kfold,
    n_jobs=-1,
    scoring='roc_auc_ovr',
)
gs_rfc.fit(avg_feature_vec, df['target_encoded'])
gs_rfc_best = gs_rfc.best_estimator_
print(gs_rfc.best_params_)

In [None]:
# Keep the held-out predictions: a bare call in the middle of a cell is neither
# displayed nor stored.
test_pred = gs_rfc.predict(test_avg_feature_vec)

# best_score_ is the mean cross-validated score of the best parameter setting on
# the training data, not a score on the validation file.
print(gs_rfc.best_score_)
# Fraction of validation tweets whose predicted label matches the encoded target.
print('Test accuracy:', np.mean(test_pred == df_test['target_encoded'].to_numpy()))