In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install ipython-autotime
%load_ext autotime

In [None]:
# Read the topic dataset from the attached Kaggle input and display it.
DATASET_CSV = '/kaggle/input/topic-balaned-dataset/topic_balanced_aug.csv'
df = pd.read_csv(DATASET_CSV)
df

In [None]:
# Per-topic row counts, to check that the classes are balanced.
topic_counts = df['topic'].value_counts()
topic_counts

### Data preprocessing

In [None]:
import string
from gensim.parsing.preprocessing import remove_stopwords
def preprocess_sentence(sentence):
    """Normalise one raw title string.

    The text is lower-cased, passed through gensim's ``remove_stopwords``,
    stripped of every character listed in ``string.punctuation`` and finally
    trimmed of leading/trailing whitespace.

    Punctuation characters are deleted rather than replaced by a space, and
    the deletion happens after the stopword pass.
    """
    without_stopwords = remove_stopwords(sentence.lower())
    punctuation_table = str.maketrans('', '', string.punctuation)
    return without_stopwords.translate(punctuation_table).strip()

In [None]:
# Visual sanity check of preprocess_sentence on two hand-written titles.
dic = {
    'title': [
        "Kareem-buggy sentence . , large full of punct?u@ations",
        'another given?.,*^% sentence! is am are',
    ]
}
pd.DataFrame(dic)['title'].map(preprocess_sentence)

In [None]:
# Clean every title with preprocess_sentence and lower-case the topic labels.
df['title'] = df['title'].apply(preprocess_sentence)
# Use the vectorised string accessor instead of apply(str.lower): identical
# result for string labels, while a missing label stays NaN instead of
# raising a TypeError.
df['topic'] = df['topic'].str.lower()
df.head()

### Doc2vec embeddings

In [None]:
import nltk
# word_tokenize needs the Punkt tokenizer data. Newer NLTK releases (3.9+) load
# it from the 'punkt_tab' resource instead of the pickled 'punkt' one, so fetch
# both: an id unknown to an older NLTK is only reported, it does not raise.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)
from nltk.tokenize import word_tokenize
def cosine(u, v):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    u, v : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(u, v) / (norm(u) * norm(v))``. When either vector has zero norm
        the similarity is undefined; ``0.0`` is returned instead of the NaN
        (plus RuntimeWarning) that the raw division would produce.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        # A zero vector has no direction, so treat it as similar to nothing.
        return 0.0
    return np.dot(u, v) / norm_product

In [None]:
# Split every preprocessed title into a list of word tokens.
sentences = df['title'].tolist()
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
tokenized_sentences[0]

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Wrap each token list in a TaggedDocument whose single tag is its position
# in tokenized_sentences.
tagged_data = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(tokenized_sentences)
]
tagged_data[:3]

In [None]:
# ## Train doc2vec model
# model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100, workers=4)

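# '''
# vector_size = Dimensionality of the feature vectors.
# window = The maximum distance between the current and predicted word within a sentence.
# min_count = Ignores all words with total frequency lower than this.
# alpha = The initial learning rate (not passed above, so the gensim default applies).
# epochs = Number of iterations (epochs) over the corpus.
# workers = Number of worker threads used to train the model.
# '''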


In [None]:
# model.save('topic_doc2vec.model')

In [None]:
# Load the pretrained Doc2Vec model that an older notebook produced with the
# (now commented-out) training cells above, instead of retraining it here.
PRETRAINED_DOC2VEC_PATH = '/kaggle/input/news-topic-classification-doc2vec-model/topic_doc2vec.model'
model = Doc2Vec.load(PRETRAINED_DOC2VEC_PATH)

In [None]:
# Infer one embedding per tokenized title with the loaded model.
train_vectors = [model.infer_vector(tokens) for tokens in tokenized_sentences]

In [None]:
# Peek at the first three inferred vectors.
train_vectors[:3]

In [None]:
# One row per title: the embedding components as columns, plus the label
# column taken from df (aligned on the row index).
embeddings = pd.DataFrame(train_vectors).assign(topic=df['topic'])
embeddings

In [None]:
# Persist the labelled embeddings to the working directory for later reuse.
EMBEDDINGS_CSV = 'topic_doc2vec_embeddings.csv'
embeddings.to_csv(EMBEDDINGS_CSV, header=True, index=False)

In [None]:
from sklearn import linear_model, datasets
# Fit a default logistic-regression classifier on the inferred embeddings and
# report its accuracy on the very data it was fitted on (no held-out split).
topic_labels = df['topic']
logreg = linear_model.LogisticRegression()
logreg.fit(train_vectors, topic_labels)
train_accuracy_percent = logreg.score(train_vectors, topic_labels) * 100
print(
    "Logistic Regression classification accuracy on training data:\n",
    train_accuracy_percent, "%")