In [1]:
import pandas as pd

In [130]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [131]:
# Load the filtered Quora questions; the CSV's stored index column is read
# back as a regular column named 'Unnamed: 0'.
df = pd.read_csv('./dataset/quora_questions_filtered.csv')

In [132]:
df.head()

Unnamed: 0,Question,doclen
0,"Like everyone else (here in U.S), I work with ...",125
1,Hello dear's people. i have a fictief research...,80
2,A lady buys goods worth 200 bucks from a shop ...,77
3,I am turning 25 in about a month and am curren...,73
4,I'm a 34 years old married to a woman. I had a...,73


In [133]:
df.describe()

Unnamed: 0,doclen
count,30735.0
mean,24.733594
std,5.379956
min,20.0
25%,21.0
50%,23.0
75%,27.0
max,125.0


In [135]:
# Lower-case every question with the vectorised string accessor. Unlike
# `.apply(lambda x: x.lower())`, this does not raise on missing values
# (they are propagated as NaN).
df['Question'] = df['Question'].str.lower()


In [136]:
# Plain Python list of the (lower-cased) question strings, in row order.
documents = df['Question'].tolist()

In [137]:
# Bag-of-words vectoriser: drop English stop words, terms that occur in more
# than 95% of the documents, and terms that occur in fewer than 2 documents.
cv = CountVectorizer(
    max_df=0.95,
    min_df=2,
    stop_words='english',
)
# Document-term matrix: one row per question, one column per kept term.
dtm = cv.fit_transform(documents)

In [138]:
# Number of topics to extract; adjust as needed.
n_topics = 6
lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,  # fixed seed so repeated fits give the same topics
)
lda.fit(dtm)

In [139]:
lda.components_


array([[  0.33280941,  12.33900197,   0.16701633, ...,   0.16666712,
          3.16624711,   3.16624711],
       [  1.26250787, 118.60180646,  15.6082387 , ...,   0.16666698,
          0.16666697,   0.16666697],
       [  0.1667941 ,   0.1672116 ,   0.16691145, ...,   0.16666712,
          0.1666671 ,   0.1666671 ],
       [ 10.78295148,  23.32235184,   1.25711401, ...,   0.16666696,
          0.16708481,   0.16708481],
       [  0.16685604,  13.29800542,   1.294228  , ...,   0.16666712,
          0.1666671 ,   0.1666671 ],
       [  1.28808109,   3.27162271,   1.5064915 , ...,   2.1666647 ,
          0.16666692,   0.16666692]])

In [129]:
# NOTE(review): `filtered_corpus` is not assigned in any visible cell, and this
# cell's execution count (129) predates the cells around it. It is presumably
# left over from an earlier session; on a fresh kernel this line would raise
# NameError unless the name is defined elsewhere. TODO confirm, then define
# it above or remove this cell.
filtered_corpus

[['like',
  'else',
  'us',
  'work',
  'group',
  'people',
  'see',
  'every',
  'day',
  'human',
  'views',
  'work',
  'place',
  'want',
  'know',
  'state',
  'someone',
  'next',
  'happens',
  '12',
  'year',
  'old',
  'going',
  'wrong',
  'say'],
 ['people',
  'research',
  'someone',
  'away',
  'desert',
  'point',
  'one',
  'say',
  'chance',
  'could',
  'yes',
  'much',
  'chance',
  'answer',
  'im',
  'looking',
  'long',
  'time'],
 ['1000', 'change', 'next', 'next', '1000', 'note', 'money', 'back', 'much'],
 ['month',
  'currently',
  'life',
  'degree',
  'college',
  'currently',
  'company',
  'pay',
  'would',
  'start',
  'family',
  'get',
  'married',
  'job',
  'go',
  'back',
  'school'],
 ['im',
  'years',
  'old',
  'married',
  'woman',
  '12',
  'years',
  'friend',
  'also',
  'married',
  'woman',
  'says',
  'always',
  'made',
  'wants',
  'really',
  'love',
  'take',
  'another'],
 ['told',
  'make',
  'like',
  'get',
  'offer',
  'us',
  'base

In [140]:
# Per-term weights of the first topic (row 0 of the fitted components).
topic1 = lda.components_[0]


In [141]:
topic1.argsort()


array([ 6306,  2567, 13645, ...,    14,   403,  9501], dtype=int64)

In [142]:
# Fetch the vocabulary once; calling get_feature_names_out() inside the loop
# rebuilds the whole array for every printed word.
feature_names = cv.get_feature_names_out()
# argsort() is ascending, so the last 30 positions are the 30 heaviest terms.
for index in topic1.argsort()[-30:]:
    print(feature_names[index])

did
note
possible
gobi
temperatures
modi
use
decision
number
does
cold
2000
government
india
ones
indian
compare
math
rupee
phone
average
new
card
rs
desert
money
black
1000
500
notes


In [143]:
def PrintTopics(model,cv,n_words):
    """Print the ``n_words`` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` is treated as one topic's per-term weights.
    cv : object with a ``get_feature_names_out()`` method
        Supplies the vocabulary; position ``i`` names column ``i`` of
        ``model.components_``.
    n_words : int
        Number of terms to print per topic. Values <= 0 print an empty list.

    Terms are listed in ascending weight order (heaviest term last).
    """
    # Build the vocabulary once instead of once per printed word.
    feature_names = cv.get_feature_names_out()
    for topic_id, weights in enumerate(model.components_):
        ranked = weights.argsort()
        # `ranked[-0:]` would select the whole vocabulary, so treat
        # non-positive requests as "no words".
        top_ids = ranked[-n_words:] if n_words > 0 else ranked[:0]
        print(f"The top {n_words} words for topic #{topic_id}")
        # Convert to plain `str` so the printed list reads 'word' rather than
        # np.str_('word') on newer NumPy versions.
        print([str(feature_names[term_id]) for term_id in top_ids])
        print('\n')

In [144]:
PrintTopics(lda,cv,20)


The top 20 words for topic #0
['cold', '2000', 'government', 'india', 'ones', 'indian', 'compare', 'math', 'rupee', 'phone', 'average', 'new', 'card', 'rs', 'desert', 'money', 'black', '1000', '500', 'notes']


The top 20 words for topic #1
['working', 'know', 'data', 'good', 'online', 'does', 'buy', 'india', 'work', 'learn', 'make', 'people', 'company', 'best', 'need', 'use', 'business', 'start', 'want', 'like']


The top 20 words for topic #2
['mean', 'number', 'like', 'password', 'email', 'facebook', 'harassment', 'couples', 'unmarried', 'moral', 'staff', 'major', 'compare', 'safe', 'earthquake', 'account', 'effects', 'does', 'hotel', 'police']


The top 20 words for topic #3
['friend', 'guy', 'make', 'life', 'know', 'home', 'people', 'year', 'want', 'don', 'love', 'just', 'feel', 'good', 'girl', 'like', 'old', 'years', 'does', 'time']


The top 20 words for topic #4
['space', 'potential', 'water', 'speed', 'vacuum', 'united', 'states', 'light', 'earth', 'somme', 'contrast', 'signif

In [145]:
def SaveTopicsToFile(model, cv, n_words, file_name,title):
    """Write the top ``n_words`` terms of every topic to a text file.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` is treated as one topic's per-term weights.
    cv : object with a ``get_feature_names_out()`` method
        Supplies the vocabulary; position ``i`` names column ``i`` of
        ``model.components_``.
    n_words : int
        Number of terms written per topic. Values <= 0 write an empty line.
    file_name : str or path-like
        Destination file; it is overwritten if it already exists.
    title : str
        First line of the file, followed by a blank line.

    Terms are written comma-separated in ascending weight order (heaviest
    term last). A confirmation message is printed after the file is closed.
    """
    # Build the vocabulary once instead of once per written word.
    feature_names = cv.get_feature_names_out()
    # Explicit UTF-8 so non-ASCII tokens do not depend on the platform's
    # default encoding.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(title + '\n\n')
        for index, topic in enumerate(model.components_):
            ranked = topic.argsort()
            # `ranked[-0:]` would select the whole vocabulary, so treat
            # non-positive requests as "no words".
            top_ids = ranked[-n_words:] if n_words > 0 else ranked[:0]
            f.write(f"The top {n_words} words for topic #{index}:\n")
            f.write(', '.join(str(feature_names[i]) for i in top_ids) + '\n\n')
    print(f"Topics have been successfully saved to {file_name}")

In [146]:
# SaveTopicsToFile(lda, cv, 20, 'topics.txt','updated code and labels for topics')


In [147]:
import pickle

def SaveModel(model, file_name):
    """Pickle ``model`` into ``file_name`` and print a confirmation line.

    The target file is opened in binary write mode, so an existing file at
    that path is replaced.
    """
    with open(file_name, mode='wb') as sink:
        pickle.dump(model, sink)
    confirmation = f"Model saved to {file_name}"
    print(confirmation)

In [113]:
def PrintTopics(model,cv,n_words):
    """Print, for each topic in ``model``, its ``n_words`` heaviest terms.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterated row by row; each row holds one topic's per-term weights.
    cv : object with a ``get_feature_names_out()`` method
        Supplies the term for each column position of ``components_``.
    n_words : int
        How many terms to show per topic; non-positive values show none.

    Each topic's terms are printed as a list in ascending weight order.
    """
    # One vocabulary lookup for the whole call, not one per printed term.
    vocabulary = cv.get_feature_names_out()
    for topic_number, term_weights in enumerate(model.components_):
        ascending = term_weights.argsort()
        if n_words > 0:
            selected = ascending[-n_words:]
        else:
            # A `[-0:]` slice would return every term, not zero terms.
            selected = ascending[:0]
        print(f"The top {n_words} words for topic #{topic_number}")
        # Plain `str` keeps the printed list free of NumPy scalar reprs.
        print([str(vocabulary[position]) for position in selected])
        print('\n')

In [114]:
PrintTopics(lda,cv,20)

The top 20 words for topic #0
['cold', '2000', 'government', 'india', 'ones', 'indian', 'compare', 'math', 'rupee', 'phone', 'average', 'new', 'card', 'rs', 'desert', 'money', 'black', '1000', '500', 'notes']


The top 20 words for topic #1
['working', 'know', 'data', 'good', 'online', 'does', 'buy', 'india', 'work', 'learn', 'make', 'people', 'company', 'best', 'need', 'use', 'business', 'start', 'want', 'like']


The top 20 words for topic #2
['mean', 'number', 'like', 'password', 'email', 'facebook', 'harassment', 'couples', 'unmarried', 'moral', 'staff', 'major', 'compare', 'safe', 'earthquake', 'account', 'effects', 'does', 'hotel', 'police']


The top 20 words for topic #3
['friend', 'guy', 'make', 'life', 'know', 'home', 'people', 'year', 'want', 'don', 'love', 'just', 'feel', 'good', 'girl', 'like', 'old', 'years', 'does', 'time']


The top 20 words for topic #4
['space', 'potential', 'water', 'speed', 'vacuum', 'united', 'states', 'light', 'earth', 'somme', 'contrast', 'signif

In [115]:
def SaveTopicsToFile(model, cv, n_words, file_name,title):
    """Save each topic's ``n_words`` heaviest terms to ``file_name``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterated row by row; each row holds one topic's per-term weights.
    cv : object with a ``get_feature_names_out()`` method
        Supplies the term for each column position of ``components_``.
    n_words : int
        How many terms to write per topic; non-positive values write none.
    file_name : str or path-like
        Output path, opened for writing (existing content is replaced).
    title : str
        Heading written on the first line, followed by a blank line.

    Terms are written comma-separated in ascending weight order, and a
    confirmation message is printed once the file has been written.
    """
    # One vocabulary lookup for the whole call, not one per written term.
    vocabulary = cv.get_feature_names_out()
    # UTF-8 is requested explicitly; the platform default may not be able to
    # encode every token.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(title + '\n\n')
        for index, topic in enumerate(model.components_):
            ascending = topic.argsort()
            if n_words > 0:
                selected = ascending[-n_words:]
            else:
                # A `[-0:]` slice would return every term, not zero terms.
                selected = ascending[:0]
            top_words = [str(vocabulary[position]) for position in selected]
            f.write(f"The top {n_words} words for topic #{index}:\n")
            f.write(', '.join(top_words) + '\n\n')
    print(f"Topics have been successfully saved to {file_name}")


In [None]:
# NOTE(review): this cell has no execution count, and the recorded output
# below reports 'topics.txt' rather than 'topics_10.txt', so the saved output
# appears to come from an earlier call with different arguments. Re-run the
# cell to refresh it.
SaveTopicsToFile(lda, cv, 20, 'topics_10.txt','preprocessed data with data length greter than 20 and total topics 6')

Topics have been successfully saved to topics.txt


In [117]:
import pickle

def SaveModel(model, file_name):
    """Serialise ``model`` with pickle to ``file_name``, then report the path.

    The file is opened for binary writing (replacing any existing file) and
    is closed before the confirmation message is printed.
    """
    handle = open(file_name, 'wb')
    try:
        pickle.dump(model, handle)
    finally:
        # Close the file whether or not pickling succeeded.
        handle.close()
    print(f"Model saved to {file_name}")


In [148]:
# SaveModel(lda, 'lda_model_6_topics_str_len_20.pkl')

In [39]:
# def LoadModel(file_name):
#     with open(file_name, 'rb') as f:
#         model = pickle.load(f)
#     print(f"Model loaded from {file_name}")
#     return model


In [41]:
# newnist=LoadModel('Models/lda_model_30_topics_str_len_20.pkl')

Model loaded from Models/lda_model_30_topics_str_len_20.pkl


In [150]:
from transformers import pipeline

# Zero-shot classification pipeline backed by the facebook/bart-large-mnli
# checkpoint; swap the model name to use a different checkpoint.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

def generate_label(words,candidate_labels):
    """Pick one of ``candidate_labels`` to describe a list of topic words.

    The words are joined into a single "Words: a, b, c" prompt and passed to
    the module-level zero-shot ``classifier`` together with the candidate
    labels. The first entry of the classifier's ``'labels'`` result is
    returned (the top-ranked label).
    """
    joined_words = ', '.join(words)
    prompt = f"Words: {joined_words}"

    # `candidate_labels` guides the classification and can be adapted to the
    # topics that are common in the data.
    prediction = classifier(prompt, candidate_labels=candidate_labels)

    ranked_labels = prediction['labels']
    return ranked_labels[0]

  from .autonotebook import tqdm as notebook_tqdm


In [153]:
# Labels offered to the zero-shot classifier; one of them is chosen per topic.
# NOTE(review): "carrer" looks like a misspelling of "career", and the casing
# of the labels is inconsistent. These strings are passed verbatim to the
# classifier and become the topic names, so correcting them may change which
# label is picked. TODO confirm and fix deliberately.
candidate_labels = [
    "Government",
    "Online Business",
    "Social Harassment",
    "Relationships",
    "Space and Science",
    "Exam and university",
    "carrer",
    "Education",
]

In [154]:
# Map each topic index to the label the zero-shot classifier picks for its
# 30 highest-weighted words.
topic_to_label = {}
# Fetch the vocabulary once; calling get_feature_names_out() inside the
# comprehension rebuilds the whole array for every word of every topic.
feature_names = cv.get_feature_names_out()
for index, topic in enumerate(lda.components_):
    # argsort() is ascending, so the last 30 positions are the heaviest terms.
    top_words = [feature_names[i] for i in topic.argsort()[-30:]]
    print(f"Topic {index}",top_words)
    label = generate_label(top_words, candidate_labels)
    print(f"Topic {index} is about {label}")
    topic_to_label[index] = label

Topic 0 ['did', 'note', 'possible', 'gobi', 'temperatures', 'modi', 'use', 'decision', 'number', 'does', 'cold', '2000', 'government', 'india', 'ones', 'indian', 'compare', 'math', 'rupee', 'phone', 'average', 'new', 'card', 'rs', 'desert', 'money', 'black', '1000', '500', 'notes']
Topic 0 is about Government
Topic 1 ['way', 'idea', 'big', 'pay', 'new', 'time', 'years', 'website', 'using', 'app', 'working', 'know', 'data', 'good', 'online', 'does', 'buy', 'india', 'work', 'learn', 'make', 'people', 'company', 'best', 'need', 'use', 'business', 'start', 'want', 'like']
Topic 1 is about Online Business
Topic 2 ['sent', 'person', 'movie', 'message', 'whatsapp', 'don', 'hill', 'station', 'cambodia', 'instagram', 'mean', 'number', 'like', 'password', 'email', 'facebook', 'harassment', 'couples', 'unmarried', 'moral', 'staff', 'major', 'compare', 'safe', 'earthquake', 'account', 'effects', 'does', 'hotel', 'police']
Topic 2 is about Social Harassment
Topic 3 ['sex', 'mean', 'foreign', 'girlf

In [155]:
topic_to_label

{0: 'Government',
 1: 'Online Business',
 2: 'Social Harassment',
 3: 'Relationships',
 4: 'carrer',
 5: 'Exam and university'}

## Generate the topic results from our LDA model, i.e. assign each document a probability of belonging to each topic

In [156]:
# One row per document, one column per topic.
topic_results = lda.transform(dtm)


In [157]:
topic_results[0]

array([0.12795593, 0.27106877, 0.00344306, 0.59067972, 0.00342685,
       0.00342567])

In [158]:
topic_results[0].round(2)

array([0.13, 0.27, 0.  , 0.59, 0.  , 0.  ])

## Get the index of the topic with the highest probability for a document


In [159]:
topic_results[0].argmax()

3

In [160]:
# Show the first question so it can be compared with its assigned topic.
document = documents[0]
document

'like everyone else (here in u.s), i work with a group of people i see and interact with every day. as a human rights advocate and lesbian, my views are liberal and i by no means preach my beliefs in my work place. however, i want to know, when is it ok to state your opinions on an issue. for example, if someone mentions that their next door neighbor, who happens to be a 12 year old boy, was playing with dolls and that "this is going down the wrong road and he is strange/weird", is it ok for me to say that we shouldn\'t judge him for who he is? so confused... this just happened at lunch and i am very upset about it...'

In [161]:
topic_to_label[topic_results[0].argmax()]

'Relationships'

In [162]:
# Attach each question's dominant topic to the dataframe: the index of its
# highest-scoring topic, and the label recorded for that index.
topic_ids = topic_results.argmax(axis=1)
df['TopicId'] = topic_ids
df['Topic'] = [topic_to_label[topic_id] for topic_id in topic_ids]

In [163]:
df.head(20)

Unnamed: 0,Question,doclen,TopicId,Topic
0,"like everyone else (here in u.s), i work with ...",125,3,Relationships
1,hello dear's people. i have a fictief research...,80,3,Relationships
2,a lady buys goods worth 200 bucks from a shop ...,77,3,Relationships
3,i am turning 25 in about a month and am curren...,73,5,Exam and university
4,i'm a 34 years old married to a woman. i had a...,73,3,Relationships
5,my employer has told me that we can not accept...,72,1,Online Business
6,i have been using my girlfriend for a sexual r...,71,3,Relationships
7,"you have 100 coins laying flat on a table, eac...",68,5,Exam and university
8,i broke up with him. i love him so much but we...,68,3,Relationships
9,i broke up with him. i love him so much but we...,68,3,Relationships


In [164]:
df.iloc[0]['Question']

'like everyone else (here in u.s), i work with a group of people i see and interact with every day. as a human rights advocate and lesbian, my views are liberal and i by no means preach my beliefs in my work place. however, i want to know, when is it ok to state your opinions on an issue. for example, if someone mentions that their next door neighbor, who happens to be a 12 year old boy, was playing with dolls and that "this is going down the wrong road and he is strange/weird", is it ok for me to say that we shouldn\'t judge him for who he is? so confused... this just happened at lunch and i am very upset about it...'

In [146]:
# Persist the labelled questions. index=False keeps the RangeIndex out of the
# file: the frame already carries the stored index of the input CSV as the
# 'Unnamed: 0' column, and writing the index again would add one more unnamed
# column on every save/load round trip.
df.to_csv('./results/quora_questions_filtered_with_topics.csv', index=False)