In [1]:
import pandas as pd
import multiprocessing as mp
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
# Load the pre-cleaned Quora questions file (expected in the working directory)
# and report its dimensions as (rows, columns).
df = pd.read_csv('cleaned_quora_data.csv')
row_count, column_count = df.shape
print((row_count, column_count))

(1306122, 3)


In [40]:
# Drop every row that contains at least one missing value
# (this removes the rows; it does not merely check for nulls).
df = df.dropna(axis=0, how='any')

In [41]:
# Preview the first five rows of the questions frame
df.head(n=5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,quebec nationalist see province nation s,0
1,000032939017120e6e44,adopted dog would encourage people adopt shop,0
2,0000412ca6e4628ce2cf,velocity affect time velocity affect space geo...,0
3,000042bf85aa498cd78e,otto von guericke used magdeburg hemisphere,0
4,0000455dfa3e01eae3af,convert montra helicon mountain bike changing ...,0


In [42]:
# Dimensions after dropping rows with missing values; in the recorded run this
# is 218 rows fewer than the raw load (1306122 -> 1305904).
df.shape

(1305904, 3)

In [43]:
# Plain Python list of the cleaned question strings, one entry per row of df
corpus = df['question_text'].to_list()

In [44]:
# Peek at the first ten questions
corpus[0:10]

['quebec nationalist see province nation s',
 'adopted dog would encourage people adopt shop',
 'velocity affect time velocity affect space geometry',
 'otto von guericke used magdeburg hemisphere',
 'convert montra helicon mountain bike changing tyre',
 'gaza slowly becoming auschwitz dachau treblinka palestinian',
 'quora automatically ban conservative opinion reported liberal view',
 'crazy wash wipe grocery germ everywhere',
 'thing dressing moderately different dressing modestly',
 'ever phase wherein became ignorant people loved completely disregarding feeling life get something go way feel temporarily ease thing change']

In [47]:
# Learn the TF-IDF vocabulary, keeping only terms that appear in at least
# 100 questions (min_df=100), and vectorize every question in the same pass.
vectorizer = TfidfVectorizer(min_df=100)
question_texts = df['question_text']
train_text = vectorizer.fit_transform(question_texts)

In [48]:
# get top tf-idf words across all documents
weights = np.asarray(train_text.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': vectorizer.get_feature_names(), 'weight': weights})

In [52]:
# Show the 100 terms with the highest mean TF-IDF weight
terms_by_weight = weights_df.sort_values(by="weight", ascending=False)
terms_by_weight.head(100)

Unnamed: 0,term,weight
761,best,0.013084
3018,get,0.011954
5242,people,0.010481
7868,would,0.010465
4163,like,0.009783
...,...,...
4235,look,0.002371
7406,type,0.002365
7031,system,0.002351
7703,war,0.002320


In [53]:
# Persist the term/weight table (without the integer index) next to the notebook
weights_df.to_csv(path_or_buf='weights.csv', index=False)

In [45]:
# Vocabulary terms as a plain Python list, in the vectorizer's feature order
tokens = weights_df['term'].to_list()

In [46]:
# Peek at the first ten vocabulary terms
tokens[0:10]

['aa',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaabbbcccbaac',
 'aaadhar',
 'aaak',
 'aaaq',
 'aaat',
 'aab']

In [20]:
# Inspect the TF-IDF vector of the first document only.
first_vector_tfidfvectorizer = train_text[0]

# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
_feature_names_fn = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)

# Store the per-term scores under their own name: reusing `df` here would
# overwrite the questions DataFrame and break any re-run of the cells that
# read df['question_text'].
first_doc_tfidf = pd.DataFrame(
    first_vector_tfidfvectorizer.T.todense(),
    index=list(_feature_names_fn()),
    columns=["tf_idf"],
)
first_doc_tfidf.sort_values(by=["tf_idf"], ascending=False)

Unnamed: 0,tf_idf
quebec,0.522078
province,0.496858
nationalist,0.492266
nation,0.378241
see,0.308505
...,...
gagan,0.000000
gaganbawda,0.000000
gagaran,0.000000
gagarin,0.000000


In [None]:
# Reduce the TF-IDF matrix to 6074 latent components with truncated SVD (LSI).
# With algorithm='arpack', n_components must be strictly smaller than the
# number of features (i.e. at most train_text.shape[1] - 1).
# NOTE(review): the dense result has one row per document and 6074 columns;
# for the ~1.3M rows loaded above that is roughly 60 GB if stored as float64
# -- confirm this fits in memory before running.
svd = TruncatedSVD(
    n_components=6074,
    algorithm='arpack',
    random_state=42,
)
svd_transform = svd.fit_transform(train_text)

**Extract topics and terms.** After performing SVD, we need to extract the topics from the component matrix. Let’s see the example below:

In [None]:
# Collect the ten highest-loading terms of every SVD component ("topic").
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
_feature_names_fn = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
terms = list(_feature_names_fn())
_term_array = np.asarray(terms, dtype=object)

topic_terms = []
for component in svd.components_:
    # Stable descending order, so tied loadings keep vocabulary order
    top_idx = np.argsort(-component, kind='stable')[:10]
    topic_terms.append(_term_array[top_idx].tolist())

# Show only the first few topics; printing every component would flood the output.
for index, top_terms_list in enumerate(topic_terms[:10]):
    print("Topic " + str(index) + ": ", top_terms_list)

In [None]:
# Total fraction of variance captured by the retained components
total_explained_variance = svd.explained_variance_ratio_.sum()
print(total_explained_variance)

In [None]:
# Per-component explained-variance ratios of the fitted SVD, kept under a
# short name for the select_n_components calls that follow
svd_var_ratios = svd.explained_variance_ratio_

In [None]:
def select_n_components(var_ratio, goal_var):
    """Return how many leading components are needed to reach a variance goal.

    Parameters
    ----------
    var_ratio : array-like of float
        Explained-variance ratio of each component, in component order.
    goal_var : float
        Target cumulative explained-variance ratio, e.g. 0.95.

    Returns
    -------
    int
        The smallest number of leading components whose cumulative ratio is
        at least ``goal_var``; ``len(var_ratio)`` if the goal is never reached.
    """
    cumulative = np.cumsum(np.asarray(var_ratio, dtype=float))
    reached = np.flatnonzero(cumulative >= goal_var)
    return int(reached[0]) + 1 if reached.size else int(cumulative.size)


# Components needed to retain 90% and 95% of the variance
for goal in (0.90, 0.95):
    print(goal, select_n_components(svd_var_ratios, goal))

In the LSI method, the original terms for the collection are those individual words whose term frequency in the document is more than two. The original term weight of an individual word in a document is set to the corresponding term frequency of that individual word in that document. Then, SVD is used to decompose the original term-document matrix. Next, we retain a certain percentage of the singular values in R of Eq. (8) to produce the approximation matrix, which has lower dimensions than the original term-document matrix.

In the next section, we review Latent Semantic Indexing (LSI) (2; 1), which uses the truncated singular value decomposition (SVD) as a low-rank approximation of A. Although the truncated SVD provides the closest approximation to A in the Frobenius or L2 norm, LSI ignores the cluster structure while reducing the dimension of the data.

LSI is based on the assumption that there is some underlying latent semantic structure in the term-document matrix that is corrupted by the wide variety of words used in documents and queries. This is referred to as the problem of polysemy and synonymy (6). The basic idea is that if two document vectors represent the same topic, they will share many associated words with a keyword, and they will have very close semantic structures after dimension reduction via SVD. Thus LSI/SVD breaks the original relationship of the data into linearly independent components (6), where the original term vectors are represented by left singular vectors and document vectors by right singular vectors. That is, if $l \le \operatorname{rank}(A)$, then $A \approx A_l = U_l \Sigma_l V_l^T$, where $U_l$ and $V_l$ hold the first $l$ left and right singular vectors of $A$ and $\Sigma_l$ is the diagonal matrix of its $l$ largest singular values.

The most representative feature extraction algorithm is Latent Semantic Indexing (LSI), which is an automatic method that transforms the original textual data to a smaller semantic space by taking advantage of some of the implicit higher-order structure in associations of words with text objects [1,2]. The transformation is computed by applying truncated singular value decomposition (SVD) to the term-by-document matrix. After SVD, terms which are used in similar contexts will be merged together. Thus, documents using different terminology to talk about the same concept can be positioned near each other in the new space [17].