In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD
# If nltk stop word is not downloaded
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vaishnavi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# List of documents
a1 = " Peace to all politics"
a2 = "Pray for the world."
a3 = "Way to go President Trump."
a4="the dog is too lazy"
a5 = "Keep those Dems guessing how you do such a great job.."
a6 = "Great job."
a7 = "Now if only those God forsaken protesters."


df = pd.DataFrame()
df["documents"] = [a1,a2,a3,a4,a5,a6,a7]
df.head()

Unnamed: 0,documents
0,Peace to all politics
1,Pray for the world.
2,Way to go President Trump.
3,the dog is too lazy
4,Keep those Dems guessing how you do such a gre...


In [40]:
# Preprocessing
df['clean_documents'] = df['documents'].str.replace("[^a-zA-Z#]", " ")
df['clean_documents'] = df['clean_documents'].fillna('').apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
df['clean_documents'] = df['clean_documents'].fillna('').apply(lambda x: x.lower())

df.head()

Unnamed: 0,documents,clean_documents
0,Peace to all politics,peace all politics
1,Pray for the world.,pray for the world
2,Way to go President Trump.,way president trump
3,the dog is too lazy,the dog too lazy
4,Keep those Dems guessing how you do such a gre...,keep those dems guessing how you such great job


In [28]:
# tokenization
tokenized_doc = df['clean_documents'].fillna('').apply(lambda x: x.split())

# remove stop-words
itemstop_words= stopwords.words('english')
tokenized_doc = tokenized_doc.apply(lambda x: [itemstop_words for item in x if item not in stop_words])
#df['clean_documents']=df['clean_documents'].apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization
detokenized_doc = []
for i in range(len(df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

df['clean_documents'] = detokenized_doc


TypeError: sequence item 0: expected str instance, list found

In [43]:
# tokenization
tokenized_doc = df['clean_documents'].fillna('').apply(lambda x: x.split())

# remove stop-words
stop_words = stopwords.words('english')
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization
detokenized_doc = []
for i in range(len(df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

df['clean_documents'] = detokenized_doc


In [44]:
df.head()

Unnamed: 0,documents,clean_documents
0,Peace to all politics,peace politics
1,Pray for the world.,pray world
2,Way to go President Trump.,way president trump
3,the dog is too lazy,dog lazy
4,Keep those Dems guessing how you do such a gre...,keep dems guessing great job


In [45]:
# TF-IDF vector
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
X = vectorizer.fit_transform(df['clean_documents'])
X.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.70710678, 0.70710678,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.70710678],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.57735027, 0.        , 0.57735027, 0.57735027,
        0.        ],
       [0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.54408243, 0.        , 0.        , 0.        , 0.45163515,
        0.54408243, 0.45163515, 

In [47]:
# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=100, random_state=122)
lsa = svd_model.fit_transform(X)

In [48]:
#Documents - Topic vector
pd.options.display.float_format = '{:,.16f}'.format
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2"])
topic_encoded_df["documents"] = df['clean_documents']
display(topic_encoded_df[["documents", "topic_1", "topic_2"]])

Unnamed: 0,documents,topic_1,topic_2
0,peace politics,0.0,-0.0
1,pray world,1e-16,0.3012476650293705
2,way president trump,-3e-16,-3.123474518e-07
3,dog lazy,8e-16,0.6790259979763361
4,keep dems guessing great job,0.9051819011547404,-5e-16
5,great job,0.90518190115474,-7e-16
6,god forsaken protesters,8e-16,0.6694576449533584


In [49]:
# Features or words used as features 
dictionary = vectorizer.get_feature_names()

In [50]:
dictionary

['dems',
 'dog',
 'forsaken',
 'god',
 'great',
 'guessing',
 'job',
 'lazy',
 'peace',
 'politics',
 'pray',
 'president',
 'protesters',
 'trump',
 'way',
 'world']

In [51]:
# Term-Topic matrix
encoding_matrix = pd.DataFrame(svd_model.components_, index = ["topic_1","topic_2"], columns = (dictionary)).T

In [52]:
encoding_matrix

Unnamed: 0,topic_1,topic_2
dems,0.3005376231205431,-2e-16
dog,5e-16,0.48014388777103
forsaken,4e-16,0.3865115515248743
god,4e-16,0.3865115515248742
great,0.640060260513848,-7e-16
guessing,0.3005376231205429,-2e-16
job,0.640060260513848,-7e-16
lazy,5e-16,0.48014388777103
peace,-1e-16,0.0
politics,-1e-16,0.0
