In [1]:
import pickle
import pandas as pd
import numpy as np
import pickle
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import itertools
from collections import Counter
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.linear_model import LogisticRegression
import os
import string

import os
import numpy as np
from gensim.models import word2vec
import re


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Directory prefix for the submission CSV written later in this notebook.
# It is joined to the file name by plain string concatenation, so the trailing
# slash is required.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
data_path = "/home/jovyan/git/machine_learning/"

In [3]:
def _read_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files you created yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the preprocessed train / test DataFrames from the working directory.
df_train = _read_pickle('df_train_preprocessed.pkl')
df_test = _read_pickle('df_test_preprocessed.pkl')

In [4]:
def build_vocab(sentences):
    """Build frequency-ranked vocabulary lookups from tokenized sentences.

    Args:
        sentences: iterable of token sequences (one sequence per document).

    Returns:
        A ``(word_counts, vocabulary, vocabulary_inv)`` tuple where
        ``word_counts`` is a ``Counter`` of token occurrences,
        ``vocabulary`` maps each token to its index in ``vocabulary_inv``, and
        ``vocabulary_inv`` lists the tokens from most to least frequent.
    """
    word_counts = Counter(itertools.chain.from_iterable(sentences))
    vocabulary_inv = [token for token, _count in word_counts.most_common()]
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return word_counts, vocabulary, vocabulary_inv

In [5]:
def get_embeddings(inp_data, vocabulary_inv, size_features=150,
                   mode='skipgram',
                   min_word_count=2,
                   context=9):
    """Train Word2Vec on index-encoded sentences and return an embedding matrix.

    Args:
        inp_data: iterable of sentences, each a sequence of integer indices
            into ``vocabulary_inv``.
        vocabulary_inv: list mapping index -> word.
        size_features: dimensionality of the word vectors (``vector_size``).
        mode: ``'skipgram'`` (sg=1) or ``'cbow'`` (sg=0).
        min_word_count: passed to Word2Vec as ``min_count``.
        context: passed to Word2Vec as ``window``.

    Returns:
        A numpy array of shape ``(len(vocabulary_inv), size_features)`` whose
        row ``i`` is the vector for ``vocabulary_inv[i]``. Words the trained
        model holds no vector for receive a vector drawn uniformly from
        [-0.25, 0.25) using numpy's global random state (unseeded here).

    Raises:
        ValueError: if ``mode`` is neither ``'skipgram'`` nor ``'cbow'``.
    """
    # Validate up front: previously an unknown mode left `sg` unassigned and
    # only failed later with an UnboundLocalError.
    if mode == 'skipgram':
        sg, mode_banner = 1, 'Model: skip-gram'
    elif mode == 'cbow':
        sg, mode_banner = 0, 'Model: CBOW'
    else:
        raise ValueError(
            "mode must be 'skipgram' or 'cbow', got {!r}".format(mode))

    model_name = "embedding"
    num_workers = 15  # Number of threads to run in parallel
    downsampling = 1e-3  # Downsample setting for frequent words
    print('Training Word2Vec model...')
    # Word2Vec expects token strings, so map the indices back to words.
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    print(mode_banner)
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        sg=sg,
                                        vector_size=size_features,
                                        min_count=min_word_count,
                                        window=context,
                                        sample=downsampling)
    # NOTE(review): nothing is written to disk in this function; the message is
    # kept only so the notebook's printed output stays unchanged.
    print("Saving Word2Vec model {}".format(model_name))

    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i, word in enumerate(vocabulary_inv):
        if word in embedding_model.wv:
            embedding_weights[i] = embedding_model.wv[word]
        else:
            # No trained vector for this word: fall back to small random noise.
            embedding_weights[i] = np.random.uniform(-0.25, 0.25,
                                                     embedding_model.vector_size)
    return embedding_weights

In [6]:
# Split every training document into word tokens.
tagged_data = [word_tokenize(text) for text in df_train["text"]]
# Derive token counts and the word <-> index lookups from the tokenized corpus.
word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_data)
# Encode each document as the list of vocabulary indices of its tokens.
inp_data = [[vocabulary[token] for token in tokens] for tokens in tagged_data]

In [7]:
# Train Word2Vec with get_embeddings' default hyper-parameters; the result has
# one embedding row per entry of vocabulary_inv.
embedding_weights_2vec = get_embeddings(inp_data=inp_data,
                                        vocabulary_inv=vocabulary_inv)

Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding


In [8]:
import random

SPLIT_SEED = 42       # fixed seed so the train/validation split is reproducible
TRAIN_FRACTION = 0.8  # share of the labelled documents used for fitting


def _mean_doc_vector(tokens, vocabulary, embedding_weights):
    """Return the average embedding of the in-vocabulary tokens of one document.

    Tokens missing from ``vocabulary`` are skipped. A document with no known
    tokens (including an empty document) yields an all-zero vector instead of
    raising ZeroDivisionError.
    """
    indices = [vocabulary[token] for token in tokens if token in vocabulary]
    if not indices:
        return np.zeros(embedding_weights.shape[1])
    return embedding_weights[indices].mean(axis=0)


tagged_train_data = [word_tokenize(text) for text in df_train["text"]]
tagged_test_data = [word_tokenize(text) for text in df_test["text"]]

# One averaged word-vector per training document.
train_vec = [
    _mean_doc_vector(doc, vocabulary, embedding_weights_2vec)
    for doc in tagged_train_data
]

labels = df_train["label"].tolist()
combined = list(zip(train_vec, labels))

# Shuffle vectors and labels together with a private, seeded RNG so the split
# is stable across runs and does not disturb the global `random` state.
random.Random(SPLIT_SEED).shuffle(combined)
shuffled_train_vec, shuffled_labels = zip(*combined)
split_size = int(len(shuffled_train_vec) * TRAIN_FRACTION)
train_vec1 = shuffled_train_vec[:split_size]
valid_vec1 = shuffled_train_vec[split_size:]

train_labels1 = shuffled_labels[:split_size]
valid_labels1 = shuffled_labels[split_size:]

# Test documents may contain tokens never seen in training; those are skipped
# explicitly rather than through a bare `except`.
test_vec = [
    _mean_doc_vector(doc, vocabulary, embedding_weights_2vec)
    for doc in tagged_test_data
]


In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardize the averaged embeddings, then fit an L1-regularized logistic
# regression. The 'saga' solver shuffles the data internally, so random_state
# is pinned to keep results reproducible (the XGBoost model is seeded the same way).
logistic_regression_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic_regression', LogisticRegression(
        penalty='l1',
        C=1,
        solver='saga',
        max_iter=10000,
        random_state=42,
    )),
])

# Fit on the training split and predict labels for the validation split.
clf = logistic_regression_pipeline.fit(train_vec1, train_labels1)
preds_valid = clf.predict(valid_vec1)

In [19]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

# Hyper-parameters for the multi-class gradient-boosted classifier.
xgboost_params = dict(
    objective='multi:softmax',
    num_class=10,             # NOTE(review): hard-coded — presumably the number of distinct labels; verify
    eval_metric='mlogloss',
    use_label_encoder=False,  # NOTE(review): may be deprecated/ignored by the installed XGBoost — confirm
    learning_rate=0.1,
    n_estimators=100,
    max_depth=6,
    seed=42,
)

# Single-step pipeline wrapping the classifier.
xgboost_pipeline = Pipeline(steps=[
    ('xgboost', XGBClassifier(**xgboost_params)),
])

In [20]:
from sklearn.preprocessing import LabelEncoder

# The XGBoost pipeline is trained on encoded labels: learn the mapping from the
# training split, then apply it to both splits.
label_encoder = LabelEncoder().fit(train_labels1)
train_labels_encoded = label_encoder.transform(train_labels1)
valid_labels_encoded = label_encoder.transform(valid_labels1)

# NOTE: this rebinds `clf` and `preds_valid`, which the logistic-regression
# cell also assigns; later cells see whichever of the two ran last.
clf = xgboost_pipeline.fit(train_vec1, train_labels_encoded)
preds_valid_encoded = clf.predict(valid_vec1)

# Map the encoded predictions back to the original label values.
preds_valid = label_encoder.inverse_transform(preds_valid_encoded)


In [27]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# Accuracy of the current `preds_valid` against the held-out labels.
accuracy = accuracy_score(valid_labels1, preds_valid)
print(f'Accuracy: {accuracy}')

# One-vs-rest AUC computed from the class probabilities of the current `clf`.
pred_probs = clf.predict_proba(valid_vec1)
auc_score = roc_auc_score(valid_labels1, pred_probs, multi_class='ovr')
print(f'AUC Score: {auc_score}')

# F1 under the three averaging schemes, in the order macro, micro, weighted.
f1_macro, f1_micro, f1_weighted = (
    f1_score(valid_labels1, preds_valid, average=averaging)
    for averaging in ('macro', 'micro', 'weighted')
)

print(f'F1 Score (Macro): {f1_macro}')
print(f'F1 Score (Micro): {f1_micro}')
print(f'F1 Score (Weighted): {f1_weighted}')


Accuracy: 0.7923164701407379
AUC Score: 0.9630866553140578
F1 Score (Macro): 0.7268754524809894
F1 Score (Micro): 0.7923164701407379
F1 Score (Weighted): 0.783039664084486


Trial 1: Using size_features=150 and C=100, the validation accuracy is 0.779.

Trial 2: Using size_features=150 and C=1, the validation accuracy is 0.784.


# Fit on the whole training set and predict the test set

In [26]:
# Refit the logistic-regression pipeline on every labelled training document,
# then predict labels for the test documents.
logistic_regression_pipeline.fit(train_vec, df_train["label"])
clf = logistic_regression_pipeline  # Pipeline.fit returns the pipeline itself
preds = clf.predict(test_vec)

In [27]:
# Submission file: one row per test prediction, with its positional Id and the
# predicted label, written without the DataFrame index.
dic = {
    "Id": list(range(len(preds))),
    "Predicted": list(preds),
}

dic_df = pd.DataFrame.from_dict(dic)
dic_df.to_csv(data_path + "predicted_Cl2saga_3.csv", index=False)