## Mounting Google Drive to access data




In [2]:
# Colab-only step: mount Google Drive at /content/drive so the project
# folders referenced by DATA_DIR and BIN_DIR are reachable as normal paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Loading the data

In [3]:
# Project folders on the mounted Drive.
# DATA_DIR: CSV files (raw train/test, reduced subsets, label-split CSVs).
# BIN_DIR: binary artefacts (pickled frames and the saved word embeddings).
DATA_DIR = '/content/drive/My Drive/AML_Project/dbpedia_csv'
BIN_DIR = '/content/drive/My Drive/AML_Project/binaries'

In [4]:
import os
import pandas as pd
import numpy as np

# Both raw files are read with header=None, so the column names are supplied
# here once and shared by the two reads.
raw_columns = ['label', 'title', 'text']

train_data_path = os.path.join(DATA_DIR, 'train.csv')
test_data_path = os.path.join(DATA_DIR, 'test.csv')

train_df = pd.read_csv(train_data_path, header=None, names=raw_columns)
print(train_df.shape)

test_df = pd.read_csv(test_data_path, header=None, names=raw_columns)
print(test_df.shape)

(560000, 3)
(70000, 3)


# Reducing the size of the data
We reduce the dataset size to keep the compute cost of the algorithms used below manageable.
* Number of classes = 3 (reduced from 14 in the entire dataset)
* Training size = 7.5k data points (2.5k for each class)
* Testing size = 2.25k data points (750 for each class)


In [None]:
# Size of the reduced dataset: keep the first `num_classes` label values and
# draw a fixed number of rows per class for the train and test subsets.
num_classes = 3
train_size_per_class = 2500
test_size_per_class = 750

# Columns carried over into the reduced frames.
cols = ['label', 'text']

def reduce_dataframe(df, num_per_class, seed=None):
    """
    Build a class-balanced random subset of ``df``.

    The first ``num_classes`` values of ``np.unique(df.label)`` (i.e. the
    smallest label values) are kept, and ``num_per_class`` rows are sampled
    without replacement for each of them. Only the columns listed in ``cols``
    are retained; the original row index of every sampled row is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least a ``label`` column and the columns in ``cols``.
    num_per_class : int
        Number of rows to draw for each retained label.
    seed : int or None, optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        subset is reproducible. When ``None`` (default) the global ``random``
        state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, grouped by label in ascending label order. An empty
        frame with columns ``cols`` is returned when ``df`` has no labels.

    Raises
    ------
    ValueError
        If a retained label has fewer than ``num_per_class`` rows
        (raised by ``random.sample``).
    """
    # Local import: this cell runs before the notebook's later `import random`,
    # so relying on the global name fails on a fresh kernel.
    import random

    sampler = random if seed is None else random.Random(seed)

    sampled_frames = []
    for label in np.unique(df.label)[:num_classes]:
        # Positional (iloc) indices of the rows carrying this label.
        label_positions = np.where(df.label == label)[0].tolist()
        chosen_positions = sampler.sample(label_positions, num_per_class)
        sampled_frames.append(df[cols].iloc[chosen_positions])

    if not sampled_frames:
        return pd.DataFrame(columns=cols)

    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and copies the accumulated frame on every iteration.
    # Concatenating only the sampled frames also keeps the source dtypes.
    return pd.concat(sampled_frames)

# Draw the class-balanced train and test subsets.
small_train_df = reduce_dataframe(train_df, train_size_per_class)
small_test_df = reduce_dataframe(test_df, test_size_per_class)

# Persist both subsets as headerless, index-free CSVs in DATA_DIR.
for subset_df, csv_name in (
    (small_train_df, 'small_train.csv'),
    (small_test_df, 'small_test.csv'),
):
    subset_df.to_csv(os.path.join(DATA_DIR, csv_name), index=False, header=False)

# Preprocessing text

In [None]:
import nltk
# Fetch the NLTK corpora used by preprocess_text: 'wordnet' backs the
# WordNetLemmatizer and 'stopwords' provides the English stop-word list.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 

def preprocess_text(text):
    """
    Turn a raw document string into a list of normalised keyword tokens.

    Steps, in order: strip digit runs, strip ``http...`` URLs, tokenise on
    ``\\w+`` (which drops punctuation and other non-word characters),
    lowercase and lemmatise every token as a verb (``pos='v'``), drop English
    stop words, and finally drop tokens of length 2 or less.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving tokens in their original order (may be empty).
    """
    # Load the stop-word list once per document and keep it as a set.
    # Calling stopwords.words('english') inside the filter re-reads the corpus
    # and does a linear list scan for every single token.
    stop_words = set(stopwords.words('english'))

    # removing numbers
    text = re.sub('[0-9]+', '', text)

    # removing urls
    text = re.sub(r'http\S+', '', text)

    # removing punctuation and special characters
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # convert to lowercase and lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token.lower(), pos='v') for token in tokens]

    # remove stop words and small words (length <= 2) in a single pass;
    # the two filters are independent, so the result matches filtering twice
    return [lemma for lemma in lemmas if lemma not in stop_words and len(lemma) > 2]

# Add a 'preprocess_text' column (token list per document) to both subsets.
for subset_df in (small_train_df, small_test_df):
    subset_df['preprocess_text'] = subset_df['text'].apply(preprocess_text)

Saving the preprocessed text to pkl file

In [None]:
# Pickle the frames (including the token-list column) so later sections can
# reload them without re-running the preprocessing.
for subset_df, pkl_name in (
    (small_train_df, 'small_train_preprocessed.pkl'),
    (small_test_df, 'small_test_preprocessed.pkl'),
):
    subset_df.to_pickle(os.path.join(BIN_DIR, pkl_name))

# Generating word embeddings

In [None]:
from gensim.models import Word2Vec

import inspect

# The embeddings are trained on the train and test token lists together;
# with min_count=1 every token that occurs gets a vector.
all_text = pd.concat([small_train_df.preprocess_text, small_test_df.preprocess_text], axis=0)

# gensim 4 renamed Word2Vec's `size` argument to `vector_size`; passing the
# old name there raises TypeError. Pick whichever keyword the installed
# version accepts so the cell runs on both gensim 3.x and 4.x.
dim_kwarg = 'vector_size' if 'vector_size' in inspect.signature(Word2Vec.__init__).parameters else 'size'

# sg=1 selects the skip-gram training algorithm.
w2v_model = Word2Vec(sentences=all_text, min_count=1, window=5, workers=4, sg=1, **{dim_kwarg: 300})
w2v_model.wv.vectors.shape

(37424, 300)

Saving the word embeddings

In [None]:
# Save only the trained KeyedVectors (w2v_model.wv), not the full model;
# the next section reloads this file with KeyedVectors.load.
w2v_model.wv.save(os.path.join(BIN_DIR, 'small_dbpedia.wordembeddings'))

# Generating word vectors for text
Loading word embeddings

In [5]:
import os
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors

# Reload the preprocessed frames written earlier in this notebook.
# NOTE: read_pickle can execute arbitrary code; only load files you created.
small_train_df = pd.read_pickle(os.path.join(BIN_DIR, 'small_train_preprocessed.pkl'))
small_test_df = pd.read_pickle(os.path.join(BIN_DIR, 'small_test_preprocessed.pkl'))

# mmap='r' memory-maps the saved vectors read-only instead of loading them eagerly.
word_vectors = KeyedVectors.load(os.path.join(BIN_DIR, 'small_dbpedia.wordembeddings'), mmap='r')

def vectorize_text(text, wv):
    """
    Represent a tokenised document as the mean of its word vectors.

    Parameters
    ----------
    text : sequence of str
        Tokens of one document.
    wv : object with ``get_vector(word)``
        Word-embedding lookup, e.g. gensim ``KeyedVectors``. Its
        ``vector_size`` attribute is used for the output width; 300 is
        assumed when the attribute is missing.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, dim)`` holding the average vector. A document
        with no tokens yields all zeros instead of NaNs from a 0/0 division.

    Raises
    ------
    KeyError
        Propagated from ``wv.get_vector`` for a token without a vector
        (this is what gensim raises for out-of-vocabulary words).
    """
    dim = getattr(wv, 'vector_size', 300)
    vec = np.zeros((1, dim))

    # Nothing to average: return the zero vector rather than dividing by 0.
    if len(text) == 0:
        return vec

    for w in text:
        vec += wv.get_vector(w)

    return vec / len(text)

# Add a 'text_vec' column holding the averaged word vector of each document.
for subset_df in (small_train_df, small_test_df):
    subset_df['text_vec'] = subset_df['preprocess_text'].apply(vectorize_text, wv=word_vectors)

In [6]:
# Each 'text_vec' cell is a (1, dim) array; explode() unwraps the outer axis
# so every document contributes one 1-D vector, which becomes one row of the
# feature matrix.
train_rows = list(small_train_df['text_vec'].explode())
test_rows = list(small_test_df['text_vec'].explode())

train_vec = pd.DataFrame(train_rows)
test_vec = pd.DataFrame(test_rows)

In [22]:
# Features-only and labels-only pickles for the full reduced training set.
# This cell adds a 'label' column to train_vec further down, so on a re-run
# train_vec already carries it; drop it here so the x-pickle never contains
# the target.
train_vec.drop(columns=['label'], errors='ignore').to_pickle(os.path.join(BIN_DIR, 'dbpedia_train_all_x.pkl'))
pd.DataFrame(small_train_df.label).to_pickle(os.path.join(BIN_DIR, 'dbpedia_train_all_y.pkl'))

# Combined feature + label frames. Labels are attached by position (.values)
# because train_vec/test_vec use a fresh 0..n-1 index.
train_vec['label'] = small_train_df.label.values
test_vec['label'] = small_test_df.label.values
train_vec.to_pickle(os.path.join(BIN_DIR, 'dbpedia_train_wv.pkl'))
test_vec.to_pickle(os.path.join(BIN_DIR, 'dbpedia_test_wv.pkl'))

# Generating label splits
We're generating multiple datasets with different proportions of the training data that is labelled and unlabelled. We will be using these splits to benchmark the performance of our semi-supervised learning model:
* Label Split 1: 125 data pts / class (5% of dataset)
* Label Split 2: 75 data pts / class (3% of dataset)

In [None]:
# Labelled examples drawn per class for each label split
# (125 and 75 out of the 2500 training rows kept per class).
num_per_class_split1 = 125
num_per_class_split2 = 75

In [None]:
import random

def generate_label_splits_pkl(train_df, train_vec, seed=None):
    """
    Sample two class-balanced labelled subsets and pickle them to BIN_DIR.

    For every distinct value of ``train_df.label`` this draws
    ``num_per_class_split1`` row positions for split 1 and, independently,
    ``num_per_class_split2`` row positions for split 2 (the two splits may
    overlap). The feature rows are taken from ``train_vec`` and the labels
    from ``train_df`` at the same positions, so both inputs must be in the
    same row order.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with a ``label`` column.
    train_vec : pandas.DataFrame
        Feature matrix, positionally aligned with ``train_df``.
    seed : int or None, optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        splits are reproducible. ``None`` (default) uses the global
        ``random`` state, as before.

    Writes
    ------
    ``dbpedia_train_x_split1.pkl``, ``dbpedia_train_y_split1.pkl``,
    ``dbpedia_train_x_split2.pkl``, ``dbpedia_train_y_split2.pkl`` in BIN_DIR.

    Raises
    ------
    ValueError
        If a class has fewer rows than the requested split size
        (raised by ``random.sample``).
    """
    sampler = random if seed is None else random.Random(seed)

    # Per-class pieces are collected in lists and concatenated once.
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    x_split1_parts, y_split1_parts = [], []
    x_split2_parts, y_split2_parts = [], []

    for label in np.unique(train_df.label):
        # Positional (iloc) indices of the rows carrying this label.
        label_positions = np.where(train_df.label == label)[0].tolist()

        # generating split 1
        split1_idx = sampler.sample(label_positions, num_per_class_split1)
        x_split1_parts.append(train_vec.iloc[split1_idx])
        y_split1_parts.append(pd.DataFrame(train_df.label.iloc[split1_idx]))

        # generating split 2
        split2_idx = sampler.sample(label_positions, num_per_class_split2)
        x_split2_parts.append(train_vec.iloc[split2_idx])
        y_split2_parts.append(pd.DataFrame(train_df.label.iloc[split2_idx]))

    outputs = {
        'dbpedia_train_x_split1.pkl': x_split1_parts,
        'dbpedia_train_y_split1.pkl': y_split1_parts,
        'dbpedia_train_x_split2.pkl': x_split2_parts,
        'dbpedia_train_y_split2.pkl': y_split2_parts,
    }
    for pkl_name, parts in outputs.items():
        # With no labels at all, fall back to an empty frame as before.
        split_df = pd.concat(parts) if parts else pd.DataFrame()
        split_df.to_pickle(os.path.join(BIN_DIR, pkl_name))


In [None]:
# Reload labels from the headerless CSV and features from the features-only
# pickle, then write the pickled label splits.
# NOTE(review): this rebinds small_train_df to a frame with only 'label' and
# 'text'; the splits are matched by row position, so the CSV and the pickle
# must come from the same run of the reduction step - confirm before re-running.
small_train_df = pd.read_csv(os.path.join(DATA_DIR, 'small_train.csv'), names=['label', 'text'])
train_vec = pd.read_pickle(os.path.join(BIN_DIR, 'dbpedia_train_all_x.pkl'))
generate_label_splits_pkl(small_train_df, train_vec)

In [None]:
import random
import numpy as np
from sklearn.model_selection import train_test_split

def generate_label_splits_csv(df, seed=None):
    """
    Sample two labelled subsets per class, split each into train/validation,
    and write the results as headerless CSVs to DATA_DIR.

    For every distinct value of ``df.label`` this draws
    ``num_per_class_split1`` rows (split 1) and, independently,
    ``num_per_class_split2`` rows (split 2). Each per-class sample is divided
    with ``train_test_split`` using its default proportions, so class balance
    is kept in both the train and the validation file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``label`` and ``text`` columns.
    seed : int or None, optional
        When given, row sampling uses ``random.Random(seed)`` and the same
        value is passed as ``random_state`` to ``train_test_split``, making
        the output reproducible. ``None`` (default) keeps the previous
        behaviour of using the global random states.

    Writes
    ------
    ``dbpedia_train_split1.csv``, ``dbpedia_validation_split1.csv``,
    ``dbpedia_train_split2.csv``, ``dbpedia_validation_split2.csv`` in
    DATA_DIR (no index, no header).

    Raises
    ------
    ValueError
        If a class has fewer rows than the requested split size
        (raised by ``random.sample``).
    """
    cols = ['label', 'text']
    sampler = random if seed is None else random.Random(seed)

    # Per-class pieces are collected in lists and concatenated once.
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    train_split1_parts, validation_split1_parts = [], []
    train_split2_parts, validation_split2_parts = [], []

    for label in np.unique(df.label):
        # Positional (iloc) indices of the rows carrying this label.
        label_positions = np.where(df.label == label)[0].tolist()

        # generating split 1
        split1_idx = sampler.sample(label_positions, num_per_class_split1)
        train, validation = train_test_split(df[cols].iloc[split1_idx], random_state=seed)
        train_split1_parts.append(train)
        validation_split1_parts.append(validation)

        # generating split 2
        split2_idx = sampler.sample(label_positions, num_per_class_split2)
        train, validation = train_test_split(df[cols].iloc[split2_idx], random_state=seed)
        train_split2_parts.append(train)
        validation_split2_parts.append(validation)

    outputs = {
        'dbpedia_train_split1.csv': train_split1_parts,
        'dbpedia_validation_split1.csv': validation_split1_parts,
        'dbpedia_train_split2.csv': train_split2_parts,
        'dbpedia_validation_split2.csv': validation_split2_parts,
    }
    for csv_name, parts in outputs.items():
        # With no labels at all, fall back to an empty frame as before.
        split_df = pd.concat(parts) if parts else pd.DataFrame(columns=cols)
        split_df.to_csv(os.path.join(DATA_DIR, csv_name), index=False, header=False)



In [None]:
# Reload the reduced training set from its headerless CSV and write the
# train/validation CSVs for both label splits.
small_train_df = pd.read_csv(os.path.join(DATA_DIR, 'small_train.csv'), header=None, names=['label', 'text'])
generate_label_splits_csv(small_train_df)