
#### Mounting Google Drive to access data




In [2]:
# Colab-only step: mount Google Drive at /content/drive so the project files
# stored there can be read and written through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Loading the data

In [3]:
import os
import pandas as pd
import numpy as np

# Folder on the mounted Drive that holds the DBpedia CSV files.
data_dir = '/content/drive/My Drive/AML_Project/dbpedia_csv'

train_data_path = os.path.join(data_dir, 'train.csv')
test_data_path = os.path.join(data_dir, 'test.csv')

# header=None: the first row is treated as data, so column names are supplied here.
train_df = pd.read_csv(train_data_path, names=['class', 'title', 'text'], header=None)
# Preview the first rows, then report the (rows, columns) shape.
display(train_df.head(), train_df.shape)

test_df = pd.read_csv(test_data_path, names=['class', 'title', 'text'], header=None)
display(test_df.head(), test_df.shape)

Unnamed: 0,class,title,text
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


(560000, 3)

Unnamed: 0,class,title,text
0,1,TY KU,TY KU /taɪkuː/ is an American alcoholic bever...
1,1,Odd Lot Entertainment,OddLot Entertainment founded in 2001 by longt...
2,1,Henkel,Henkel AG & Company KGaA operates worldwide w...
3,1,GOAT Store,The GOAT Store (Games Of All Type Store) LLC ...
4,1,RagWing Aircraft Designs,RagWing Aircraft Designs (also called the Rag...


(70000, 3)

#### Preprocessing text

In [4]:
import nltk
# Fetch the NLTK corpora used by the preprocessing step:
# WordNet for lemmatization and the stop-word lists for filtering.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 

# Tokenizer, lemmatizer and stop-word set shared by all preprocess_text calls.
# Built lazily on first use so the NLTK corpora are only touched once.
_PREPROCESS_RESOURCES = None


def _get_preprocess_resources():
    """Return the cached (tokenizer, lemmatizer, stop_words) triple, building it on first use."""
    global _PREPROCESS_RESOURCES
    if _PREPROCESS_RESOURCES is None:
        _PREPROCESS_RESOURCES = (
            RegexpTokenizer(r'\w+'),
            WordNetLemmatizer(),
            # frozenset: constant-time membership instead of scanning a list
            # that would otherwise be re-read from the corpus for every token.
            frozenset(stopwords.words('english')),
        )
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Turn a raw text string into a list of cleaned keyword tokens.

    Steps, in order: strip ASCII digit runs, strip ``http...`` URLs, tokenize on
    ``\\w+`` (which drops punctuation), lowercase and lemmatize each token as a
    verb, drop English stop words, and drop tokens of two characters or fewer.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (e.g. a missing value read
        as NaN) is treated as an empty document.

    Returns
    -------
    list of str
        The remaining lemmas, in their original order.
    """
    # Missing / non-text cells would make re.sub raise; treat them as empty.
    if not isinstance(text, str):
        return []

    tokenizer, lemmatizer, stop_words = _get_preprocess_resources()

    # removing numbers
    text = re.sub('[0-9]+', '', text)

    # removing urls
    text = re.sub(r'http\S+', '', text)

    # removing punctuation and special characters
    tokens = tokenizer.tokenize(text)

    # convert to lowercase and lemmatize
    lemmas = (lemmatizer.lemmatize(token.lower(), pos='v') for token in tokens)

    # remove stop words and small words (length <= 2)
    return [lemma for lemma in lemmas if lemma not in stop_words and len(lemma) > 2]

# Store the cleaned token list of every document in a new 'preprocess_text' column.
train_df['preprocess_text'] = train_df.text.apply(preprocess_text)
test_df['preprocess_text'] = test_df.text.apply(preprocess_text)

# Only the last bare expression of a cell is rendered, so use display() to
# actually show the preview of both splits.
display(train_df.head())
display(test_df.head())

#### Generating word embeddings

In [None]:
from gensim.models import Word2Vec

# Training corpus: the token lists of the train split followed by the test split.
# NOTE(review): the embeddings therefore see test-set text (labels are not used) — confirm this is intended.
sentences = pd.concat([train_df.preprocess_text, test_df.preprocess_text], axis=0)

# The model is bound to `w2v_model`, the name the save step reads; the previous
# `w2c_model` spelling left `w2v_model` undefined and the save raised NameError.
# min_count=1 keeps every token in the vocabulary.
# NOTE(review): sg=1 is understood to select skip-gram, and `size` is the gensim 3.x
# spelling of the embedding dimension (gensim 4 renamed it `vector_size`) — confirm
# both against the installed gensim version.
w2v_model = Word2Vec(sentences=sentences, size=300, min_count=1, window=5, workers=4, sg=1)
w2v_model.wv.vectors.shape

#### Saving the word embeddings

In [None]:
# Persist the model's `wv` attribute (the word vectors) to Drive so later cells can reload it from disk.
w2v_model.wv.save('/content/drive/My Drive/AML_Project/dbpedia.wordvectors')

#### Loading word embeddings

In [None]:
from gensim.models import KeyedVectors

# Reload the saved vectors from Drive.
# NOTE(review): mmap='r' is understood to memory-map the stored arrays read-only
# instead of loading them fully into RAM — confirm against the gensim docs.
word_vectors = KeyedVectors.load('/content/drive/My Drive/AML_Project/dbpedia.wordvectors', mmap='r')

#### Generating text vectors by averaging word embeddings

In [None]:
def vectorize_text(text, wv):
    """Average the embeddings of a token list into one row vector.

    Parameters
    ----------
    text : sequence of str
        Tokens of a single document.
    wv : object
        Embedding lookup exposing ``get_vector(word)``. Its ``vector_size``
        attribute, when present, sets the output dimension; otherwise the
        dimension defaults to 300.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, dim)`` holding the mean of the token vectors.
        An empty ``text`` yields an all-zero vector rather than NaNs.

    Raises
    ------
    Whatever ``wv.get_vector`` raises for a token it cannot resolve.
    """
    dim = getattr(wv, 'vector_size', 300)
    vec = np.zeros((1, dim))

    # No tokens left after preprocessing: avoid 0/0, which would produce NaNs.
    if len(text) == 0:
        return vec

    for w in text:
        vec += wv.get_vector(w)

    return vec / len(text)

# Embed every document of both splits with the same loaded vectors.
# The test split previously passed an undefined name `wv`, which raised NameError.
train_df['text_vec'] = train_df.preprocess_text.apply(vectorize_text, args=(word_vectors,))
test_df['text_vec'] = test_df.preprocess_text.apply(vectorize_text, args=(word_vectors,))

In [None]:
# Inspect the per-document vectors computed for the training split.
train_df.text_vec

In [None]:
# Inspect the per-document vectors computed for the test split.
test_df.text_vec

In [None]:
# Write each split's 'text_vec' column to Drive as a pickled single-column DataFrame.
train_df.text_vec.to_frame().to_pickle('/content/drive/My Drive/AML_Project/dbpedia_train_wv.pkl')
test_df.text_vec.to_frame().to_pickle('/content/drive/My Drive/AML_Project/dbpedia_test_wv.pkl')