In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import our data

In [None]:
import pandas as pd

# Read the vaccination tweets CSV into a DataFrame and preview its first rows.
tweets_csv = '/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv'
data = pd.read_csv(tweets_csv)
data.head()

In [None]:
# Split the raw data by row position:
# the first 100 rows are held out as `test`, positions 100 through 3081 become `train`.
test = data.iloc[:100]
train = data.iloc[100:3082]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk

# Shared resources for `preprocessor`, filled in on first use.
_PREPROCESSOR_RESOURCES = {}


def _stemmer_and_stopwords():
    """Return the shared Porter stemmer and English stop-word set, building them once."""
    if not _PREPROCESSOR_RESOURCES:
        stemmer = nltk.stem.porter.PorterStemmer()
        # A frozenset gives constant-time membership tests instead of scanning a list.
        stopwords = frozenset(nltk.corpus.stopwords.words('english'))
        # Store both only after both lookups succeeded, so a failed corpus
        # lookup is simply retried on the next call.
        _PREPROCESSOR_RESOURCES.update(stemmer=stemmer, stopwords=stopwords)
    return _PREPROCESSOR_RESOURCES['stemmer'], _PREPROCESSOR_RESOURCES['stopwords']


def preprocessor(content):
    """Normalise one document before TF-IDF vectorisation.

    The text is lower-cased, every non-word character is replaced by a space,
    the result is tokenised with NLTK, English stop words are dropped and the
    remaining tokens are Porter-stemmed.

    Parameters
    ----------
    content : str
        Raw document text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the stop-word filter.
    """
    # The stemmer and stop-word set do not depend on the document, so they are
    # built once and reused instead of being recreated for every document.
    stemmer, stopwords = _stemmer_and_stopwords()
    content = content.lower()
    content = re.sub(r'[^\w]', ' ', content)
    tokens = nltk.tokenize.word_tokenize(content)
    return ' '.join(stemmer.stem(tok) for tok in tokens if tok not in stopwords)

def transform(data, vectorizer=None):
    """Convert raw documents into a TF-IDF matrix.

    Parameters
    ----------
    data
        Documents to vectorise (the notebook passes the ``text`` column of a
        DataFrame).
    vectorizer : optional
        A vectorizer to reuse. When ``None`` (the default) a new
        ``TfidfVectorizer`` is created with ``preprocessor`` and NLTK's
        ``word_tokenize`` and fitted on ``data``; otherwise the given
        vectorizer is only applied via its ``transform`` method.

    Returns
    -------
    tuple
        ``(processedData, vectorizer)``: the vectorised documents and the
        vectorizer that produced them.
    """
    # Compare against None explicitly: a truthiness test would discard any
    # supplied vectorizer that happens to evaluate as falsy (for example an
    # object defining __len__ that returns 0) and silently fit a new one.
    if vectorizer is None:
        vectorizer = TfidfVectorizer(preprocessor=preprocessor, tokenizer=nltk.tokenize.word_tokenize)
        processedData = vectorizer.fit_transform(data)
    else:
        processedData = vectorizer.transform(data)
    return processedData, vectorizer

# Fit a new vectorizer on the training text and keep it for later reuse.
train_text = train['text']
train_data, vectorizer = transform(train_text)

# LSA Model

In [None]:
from sklearn.decomposition import TruncatedSVD

# our LSA model: truncated SVD of the TF-IDF matrix with 15 latent components.
# TruncatedSVD uses a randomized solver by default, so the seed is pinned to
# keep the extracted topics identical across "Restart & Run All".
lsa_model = TruncatedSVD(n_components=15, random_state=42)
lsa_model.fit(train_data)

In [None]:
# Show important terms of each latent topic
print('important terms of each latent topic')
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa_model.components_):
    # Pair every vocabulary term with its weight in this component and keep
    # the 15 terms with the largest weights.
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:15]
    print("Topic "+str(i)+": ", end="")
    for t in sorted_terms:
        print(t[0], end=" ")
    print("")

In [None]:
#Show 5 documents of each latent topic
def print_samples(y, data, n=5, random_state=None):
    """Print up to ``n`` example documents for every topic label.

    Parameters
    ----------
    y : array-like
        One topic label per row of ``data``, in row order.
    data : pandas.DataFrame
        Frame holding the documents. The ``text`` column is used when present,
        otherwise the ``content`` column.
    n : int, default 5
        Maximum number of documents printed per topic. Topics with ``n`` or
        fewer documents are printed in full.
    random_state : optional
        Passed to ``DataFrame.sample`` to make the selection repeatable.

    Returns
    -------
    None
        Output is written with ``print``.
    """
    labels = np.asarray(y)
    # Pick the document column explicitly instead of relying on a bare
    # `except:` that would also hide unrelated errors.
    column = 'text' if 'text' in data.columns else 'content'
    for level in np.unique(labels):
        print('Topic :', level)
        members = data.iloc[labels == level]
        # sample() raises when asked for more rows than exist, so only sample
        # when the topic really has more than n documents.
        if len(members) > n:
            members = members.sample(n=n, random_state=random_state)
        for sample in members[column]:
            print('\t-', sample)
        print('-----------------------------\n\n')

In [None]:
# Project the training documents into the latent space, then label each
# document with the index of its largest component value.
topic_matrix = lsa_model.transform(train_data)
y = topic_matrix.argmax(axis=1)

print('5 training sample documents from each latent topic')
print_samples(y, train)