In [None]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from collections import Counter
from nltk.util import ngrams

In [None]:
# Read the training CSV (expected in the working directory) as UTF-8 text.
data = pd.read_csv('WikiLarge_Train.csv', encoding="utf-8")
# Preview the first rows to check the loaded columns.
data.head()

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1


In [None]:
def clean(data):
    """Add a normalised ``cleaned_text`` column derived from ``original_text``.

    Steps, in order: lower-case the text; delete the literal bracket
    placeholders ``-lrb-`` and ``-rrb-``; drop every non-ASCII character;
    collapse runs of two or more whitespace characters into a single space;
    strip leading and trailing whitespace. Punctuation and digits are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a string column named ``original_text``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. It is modified in place: the
        ``cleaned_text`` column is added, or overwritten if already present.
    """
    text = data["original_text"].str.lower()

    # The placeholders are literal tokens, so match them literally.
    for placeholder in ("-lrb-", "-rrb-"):
        text = text.str.replace(placeholder, "", regex=False)

    # Encoding with errors="ignore" silently drops anything outside ASCII.
    text = text.str.encode("ascii", "ignore").str.decode("ascii")

    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave repeated spaces untouched.
    text = text.str.replace(r"\s{2,}", " ", regex=True)

    # Remove whitespace at the beginning and the end of each sentence.
    data["cleaned_text"] = text.str.strip()
    return data

In [None]:
# Add the `cleaned_text` column. clean() modifies the frame it is given and
# returns that same frame, so `data` still refers to one object.
data = clean(data)
# Preview the first rows, now including `cleaned_text`.
data.head()

Unnamed: 0,original_text,label,cleaned_text
0,There is manuscript evidence that Austen conti...,1,there is manuscript evidence that austen conti...
1,"In a remarkable comparative analysis , Mandaea...",1,"in a remarkable comparative analysis , mandaea..."
2,"Before Persephone was released to Hermes , who...",1,"before persephone was released to hermes , who..."
3,Cogeneration plants are commonly found in dist...,1,cogeneration plants are commonly found in dist...
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"geneva , ; , ; , ; ; is the second-most-populo..."


In [None]:
# nltk.word_tokenize(data.iloc[1,2])

In [None]:
# Tokenise each cleaned sentence with NLTK; every cell of `token_text`
# holds the list of tokens for that sentence.
data['token_text'] = data["cleaned_text"].apply(nltk.word_tokenize)
data.head()

Unnamed: 0,original_text,label,cleaned_text,token_text
0,There is manuscript evidence that Austen conti...,1,there is manuscript evidence that austen conti...,"[there, is, manuscript, evidence, that, austen..."
1,"In a remarkable comparative analysis , Mandaea...",1,"in a remarkable comparative analysis , mandaea...","[in, a, remarkable, comparative, analysis, ,, ..."
2,"Before Persephone was released to Hermes , who...",1,"before persephone was released to hermes , who...","[before, persephone, was, released, to, hermes..."
3,Cogeneration plants are commonly found in dist...,1,cogeneration plants are commonly found in dist...,"[cogeneration, plants, are, commonly, found, i..."
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"geneva , ; , ; , ; ; is the second-most-populo...","[geneva, ,, ;, ,, ;, ,, ;, ;, is, the, second-..."


In [None]:
_LEMMATIZATION_CACHE = {}


def _lemmatization_resources():
    """Build the English stop-word set and the lemmatizer once, then reuse them."""
    if not _LEMMATIZATION_CACHE:
        # make sure nltk.download('stopwords') and nltk.download('wordnet') work
        _LEMMATIZATION_CACHE["stop"] = frozenset(stopwords.words('english'))
        _LEMMATIZATION_CACHE["lemmatizer"] = WordNetLemmatizer()
    return _LEMMATIZATION_CACHE["stop"], _LEMMATIZATION_CACHE["lemmatizer"]


def lemmatization(row):
    """Tokenise ``row``, drop English stop words and lemmatise what is left.

    Parameters
    ----------
    row : str
        One sentence of text.

    Returns
    -------
    str
        The lemmas in their original order, each followed by a single
        space (so a non-empty result ends with a space). An empty string is
        returned when no token survives the stop-word filter.
    """
    # The stop-word set and lemmatizer do not depend on `row`; building them
    # on every call is wasted work when this is applied to a whole column,
    # and a set gives constant-time membership tests.
    stop, lemmatizer = _lemmatization_resources()

    lemmas = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(row)
        if word not in stop
    ]
    return "".join(lemma + " " for lemma in lemmas)

In [None]:
# Number of tokens left in each sentence after stop-word removal and
# lemmatisation. split() with no argument returns [] for an empty string, so a
# sentence with no remaining tokens gets length 0; split(" ") would return ['']
# and report 1.
data["length"] = data['cleaned_text'].apply(lambda x: len(lemmatization(x).split()))

In [None]:
# Preview the first rows, now including the `length` column.
data.head()

Unnamed: 0,original_text,label,cleaned_text,token_text,length
0,There is manuscript evidence that Austen conti...,1,there is manuscript evidence that austen conti...,"[there, is, manuscript, evidence, that, austen...",25
1,"In a remarkable comparative analysis , Mandaea...",1,"in a remarkable comparative analysis , mandaea...","[in, a, remarkable, comparative, analysis, ,, ...",17
2,"Before Persephone was released to Hermes , who...",1,"before persephone was released to hermes , who...","[before, persephone, was, released, to, hermes...",23
3,Cogeneration plants are commonly found in dist...,1,cogeneration plants are commonly found in dist...,"[cogeneration, plants, are, commonly, found, i...",34
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"geneva , ; , ; , ; ; is the second-most-populo...","[geneva, ,, ;, ,, ;, ,, ;, ;, is, the, second-...",19


In [None]:
def create_ngram(n, token_row):
    """Return every contiguous n-gram of ``token_row`` as a list.

    ``n`` is the n-gram order and ``token_row`` is the token sequence; the
    n-grams are produced by ``nltk.util.ngrams`` and materialised into a
    new list.
    """
    return list(ngrams(token_row, n))

In [None]:
# Build the list of bigrams (n = 2) for every tokenised sentence.
data['bigram'] = data['token_text'].map(lambda tokens: create_ngram(2, tokens))

In [None]:
# data.iloc[:3,3].apply(lambda x: x.extend(create_ngram(2,x)))

In [None]:
# Preview the first rows, now including the `bigram` column.
data.head()

Unnamed: 0,original_text,label,cleaned_text,token_text,length,bigram
0,There is manuscript evidence that Austen conti...,1,there is manuscript evidence that austen conti...,"[there, is, manuscript, evidence, that, austen...",25,"[(there, is), (is, manuscript), (manuscript, e..."
1,"In a remarkable comparative analysis , Mandaea...",1,"in a remarkable comparative analysis , mandaea...","[in, a, remarkable, comparative, analysis, ,, ...",17,"[(in, a), (a, remarkable), (remarkable, compar..."
2,"Before Persephone was released to Hermes , who...",1,"before persephone was released to hermes , who...","[before, persephone, was, released, to, hermes...",23,"[(before, persephone), (persephone, was), (was..."
3,Cogeneration plants are commonly found in dist...,1,cogeneration plants are commonly found in dist...,"[cogeneration, plants, are, commonly, found, i...",34,"[(cogeneration, plants), (plants, are), (are, ..."
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"geneva , ; , ; , ; ; is the second-most-populo...","[geneva, ,, ;, ,, ;, ,, ;, ;, is, the, second-...",19,"[(geneva, ,), (,, ;), (;, ,), (,, ;), (;, ,), ..."


In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle so the split is the same on every run.
X = data[['token_text', 'bigram', "length"]]
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

(333414, 3)

In [None]:
# Display the training features; the index keeps the original row labels
# from `data`, in shuffled order.
X_train

Unnamed: 0,token_text,bigram,length
304501,"[1979-80, buffalo, sabres, nhl, 32, 1880, 74, ...","[(1979-80, buffalo), (buffalo, sabres), (sabre...",15
162313,"[diseases, lentils, in, culture, lentils, are,...","[(diseases, lentils), (lentils, in), (in, cult...",25
336845,"[railroads, ,, like, the, lehigh, valley, rail...","[(railroads, ,), (,, like), (like, the), (the,...",19
150625,"[an, example, of, this, would, be, an, individ...","[(an, example), (example, of), (of, this), (th...",21
40240,"[both, the, matanuska, and, susitna, rivers, h...","[(both, the), (the, matanuska), (matanuska, an...",8
...,...,...,...
259178,"[after, the, germans, invaded, norway, in, apr...","[(after, the), (the, germans), (germans, invad...",12
365838,"[july, 28, -, henry, bennet, ,, 1st, earl, of,...","[(july, 28), (28, -), (-, henry), (henry, benn...",14
131932,"[pancake, restaurants, are, popular, family, r...","[(pancake, restaurants), (restaurants, are), (...",15
146867,"[a, cycling, domestique]","[(a, cycling), (cycling, domestique)]",2


In [None]:
# Both columns hold Python lists, so `+` concatenates them row by row: each
# training "sentence" becomes its unigram tokens followed by its bigram tuples.
combined_X_train = X_train["token_text"] + X_train['bigram']
# Preview the first combined rows.
combined_X_train.head()

304501    [1979-80, buffalo, sabres, nhl, 32, 1880, 74, ...
162313    [diseases, lentils, in, culture, lentils, are,...
336845    [railroads, ,, like, the, lehigh, valley, rail...
150625    [an, example, of, this, would, be, an, individ...
40240     [both, the, matanuska, and, susitna, rivers, h...
dtype: object

In [None]:
# Word2Vec hyper-parameters (each is passed to the keyword noted beside it).
num_features = 250      # vector_size
min_word_count = 3      # min_count
num_workers = 2         # workers
context = 5             # window
downsampling = 1e-4     # sample

# Train on the combined unigram + bigram sequences of the training split only.
model = Word2Vec(
    combined_X_train,
    vector_size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
    sg=1,
    hs=0,
    negative=5,
)
print(model)

Word2Vec(vocab=435448, vector_size=250, alpha=0.025)


In [None]:
# List of every key the trained model holds a vector for.
vocab = model.wv.index_to_key
# Number of keys in the vocabulary.
print(len(vocab))

435448


In [None]:
# vocab

In [None]:
# ((model.wv[('in', 'the')][0] + model.wv[('in', 'the')][1])/2).shape

In [None]:
# model.vector_size

In [None]:
def sen_to_vect(row, model):
    """Average the vectors of the tokens in ``row`` that the model knows.

    Parameters
    ----------
    row : iterable
        Tokens of one sentence.
    model : object
        Trained embedding model exposing ``vector_size`` and ``wv``, where
        ``wv.key_to_index`` maps known keys and ``wv[token]`` returns the
        token's vector (the gensim 4 ``Word2Vec`` interface).

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``model.vector_size``; all zeros when none
        of the tokens is in the vocabulary.
    """
    # key_to_index is a dict, so each membership test is a hash lookup.
    # Testing against the index_to_key list scans the whole vocabulary for
    # every token, which is prohibitively slow on a large vocabulary.
    known_keys = model.wv.key_to_index

    sent_vector = np.zeros(model.vector_size, dtype="float32")
    nwords = 0
    # Sum the vectors of all tokens known to the model.
    for word in row:
        if word in known_keys:
            sent_vector += model.wv[word]
            nwords += 1

    # Turn the sum into a mean; skipped when nothing matched to avoid 0/0.
    if nwords > 0:
        sent_vector /= nwords
    return sent_vector

In [None]:
# Smoke-test on the first training row. .iloc[0] is positional; indexing with
# [0] looks up the row *label* 0, which raises KeyError whenever the shuffled
# split placed that row in the test set.
sen_to_vect(X_train["token_text"].iloc[0], model).shape

(250,)

In [None]:
def sen_to_vect_ngram(row, model):
    """Average one vector per known bigram in ``row``.

    Parameters
    ----------
    row : iterable
        Bigrams of one sentence, each a 2-tuple of tokens.
    model : object
        Trained embedding model exposing ``vector_size`` and ``wv``, where
        ``wv.key_to_index`` maps known keys (the gensim 4 ``Word2Vec``
        interface).

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``model.vector_size``; all zeros when none
        of the bigrams is in the vocabulary.
    """
    # key_to_index is a dict, so each membership test is a hash lookup
    # instead of a scan over the whole index_to_key list.
    known_keys = model.wv.key_to_index

    sent_vector = np.zeros(model.vector_size, dtype="float32")
    nwords = 0
    # Sum one vector for every bigram known to the model.
    for word in row:
        if word in known_keys:
            # Look the bigram up once and average the first two rows of
            # the result.
            # NOTE(review): with gensim, indexing `wv` with a tuple appears
            # to return the stacked vectors of the two component words, so
            # this is the mean of the unigram vectors, not the vector
            # trained for the bigram key itself - confirm that is intended.
            pair = model.wv[word]
            bi = (pair[0] + pair[1]) / 2
            sent_vector += bi
            nwords += 1

    # Turn the sum into a mean; skipped when nothing matched to avoid 0/0.
    if nwords > 0:
        sent_vector /= nwords
    return sent_vector

In [None]:
# Smoke-test on the first training row. .iloc[0] is positional; indexing with
# [0] looks up the row *label* 0, which raises KeyError whenever the shuffled
# split placed that row in the test set.
sen_to_vect_ngram(X_train["bigram"].iloc[0], model).shape

(250,)

In [None]:
# One averaged sentence vector per training row, stored as a single column
# that keeps the row labels of X_train.
w2v_df = (
    X_train["token_text"]
    .apply(lambda tokens: sen_to_vect(tokens, model))
    .to_frame('sentence_vectors')
)
# w2v_df['sentence_vectors_bigram'] = X_train["bigram"].apply(lambda x: sen_to_vect_ngram(x, model))

KeyboardInterrupt: 

In [None]:
# One averaged vector per training row built from its bigrams; each cell of
# this column holds a whole array.
w2v_df['sentence_vectors_bigram'] = X_train["bigram"].apply(lambda x: sen_to_vect_ngram(x, model))

In [None]:
# Total number of cells (rows x columns) in the frame.
w2v_df.size

In [None]:
# Preview the first rows of the sentence-vector frame.
w2v_df.head()

In [None]:
# Expand each sentence vector into one numeric column per embedding dimension.
index = [f'w2v_{i}' for i in range(model.vector_size)]

# Stack all vectors into one 2-D array and attach the columns in a single
# concat, instead of running one apply() and one column insert per dimension.
w2v_matrix = np.vstack(w2v_df['sentence_vectors'].tolist())
w2v_columns = pd.DataFrame(w2v_matrix, index=w2v_df.index, columns=index)

# Drop w2v_* columns left by an earlier run of this cell so that re-executing
# it replaces them rather than duplicating them.
w2v_df = pd.concat([w2v_df.drop(columns=index, errors='ignore'), w2v_columns], axis=1)

In [None]:
# w2v_df.head()

In [None]:
# (rows, columns) after the per-dimension columns were added.
w2v_df.shape

In [None]:
# Keep only the numeric per-dimension columns (w2v_0, w2v_1, ...) as model
# features. Selecting by name rather than by position leaves out every
# array-valued helper column, including `sentence_vectors_bigram`, which a
# positional `iloc[:, 1:]` slice would keep and the classifier cannot fit on.
w2v_df_train = w2v_df.filter(regex=r"^w2v_\d+$")
w2v_df_train.head()

In [None]:
# Process the test data: one averaged sentence vector per held-out row,
# computed with the model trained on the training split.
w2v_df_test = (
    X_test["token_text"]
    .apply(lambda tokens: sen_to_vect(tokens, model))
    .to_frame('sentence_vectors')
)

In [None]:
# Expand each test sentence vector into one numeric column per dimension.
index = [f'w2v_{i}' for i in range(model.vector_size)]

# Stack all vectors into one 2-D array and attach the columns in a single
# concat, instead of running one apply() and one column insert per dimension.
w2v_matrix_test = np.vstack(w2v_df_test['sentence_vectors'].tolist())
w2v_columns_test = pd.DataFrame(w2v_matrix_test, index=w2v_df_test.index, columns=index)

# Drop w2v_* columns left by an earlier run of this cell so that re-executing
# it replaces them rather than duplicating them.
w2v_df_test = pd.concat(
    [w2v_df_test.drop(columns=index, errors='ignore'), w2v_columns_test],
    axis=1,
)

In [None]:
# (rows, columns) of the test frame after the per-dimension columns were added.
w2v_df_test.shape

In [None]:
# Keep only the numeric per-dimension columns (w2v_0, w2v_1, ...) as model
# features. Selecting by name makes this cell safe to re-run: a positional
# `iloc[:, 1:]` on the reassigned frame would drop one more real feature
# column each time the cell is executed.
w2v_df_test = w2v_df_test.filter(regex=r"^w2v_\d+$")
w2v_df_test.head()

In [None]:
# Fit logistic regression on the training feature matrix and measure accuracy
# on the held-out rows.
lr = LogisticRegression(max_iter=10000).fit(w2v_df_train, y_train)
y_lr_pred_text = lr.predict(w2v_df_test)

# NOTE(review): the variable name says "tfidf", but the features used in this
# cell come from the Word2Vec sentence vectors.
lr_tfidf_text = accuracy_score(y_true=y_test, y_pred=y_lr_pred_text)
lr_tfidf_text

In [None]:
# X_train["length"]

In [None]:
# X_test["length"].shape

In [None]:
# w2v_df_train["length"] = X_train["length"].values

In [None]:
# w2v_df_test["length"] = X_test["length"].values

In [None]:
# w2v_df_train.head()

In [None]:
# lr = LogisticRegression(max_iter=10000)
# lr.fit(w2v_df_train, y_train)
# y_lr_pred_text = lr.predict(w2v_df_test)
# lr_tfidf_text = accuracy_score(y_test, y_lr_pred_text)
# lr_tfidf_text

In [None]:
# svc = LinearSVC(max_iter=10000)
# svc.fit(w2v_df_train, y_train)
# y_svc_pred_svc = svc.predict(w2v_df_test)
# svc_w2v_text = accuracy_score(y_test, y_svc_pred_svc)
# svc_w2v_text

In [None]:
# Three short example phrases used by the n-gram experiment cell that follows.
text = ['cant railway station','citadel hotel',' police stn']

In [None]:
# Collect the bigrams of every example phrase in `text`.
bigram = []
for line in text:
    # Tokenise the phrase being iterated, so each phrase contributes its own
    # bigrams rather than the same dataset row being added once per phrase.
    token = nltk.word_tokenize(line)
    bigram.extend(ngrams(token, 2))

In [None]:
# Show the collected bigrams.
bigram

In [None]:
# define training data
# sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],['this', 'is', 'the', 'second', 'sentence'],
#             ['yet', 'another', 'sentence'], ['one', 'more', 'sentence'],['and', 'the', 'final', 'sentence']]

In [None]:
# train model
# model1 = Word2Vec(sentences, min_count=1)
# summarize the loaded model
# print(model1)

In [None]:
# summarize vocabulary
# model_voc = model1.wv.index_to_key
# print(model_voc)

In [None]:
# access vector for one word
# print(model1.wv['the'].size)
# print(model1.wv['the'])