# OBJECTIVE 5 - Topic Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv("dupes_removed.csv")

In [21]:
df.columns

Index(['text', 'text.1', 'favoriteCount', 'replyToSN', 'created', 'truncated',
       'replyToSID', 'id', 'replyToUID', 'statusSource', 'screenName',
       'retweetCount', 'isRetweet', 'retweeted', 'longitude', 'latitude',
       'text_without_hashtag', 'expanded_tweet', 'punctuation_removed',
       'tokenized_tweets', 'tokenized_stopword_removed_tweets',
       'almost_clean_tweets', 'lemmatized_tweets', 'Topic_0', 'Topic_1',
       'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_6', 'Topic_7',
       'Topic_8', 'Topic_9', 'Topic_10', 'Topic_11'],
      dtype='object')

In [5]:
vect = TfidfVectorizer(min_df=2, max_df=0.90, stop_words='english')
# vectorizing the tweets

In [6]:
X = vect.fit_transform(df['lemmatized_tweets'])

In [7]:
lda = LDA(n_components = 12, learning_method="batch", max_iter=50, random_state=0)

document_topics = lda.fit_transform(X)

#n_components is the number of topics
# learning method is batch which refers to batch gradient descent
# max_iter is the number of times the learning algorithm will run

In [8]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

# lda.components_ is a topic-term distribution where the rows are the topics and the columns are the features or words.
# the argsort function sorts the words in descending order (increasing order) of importance in the topic.
#get feature names is converting the vectors back into the words i.e the features are being retrieved



In [10]:
import mglearn
mglearn.tools.print_topics(topics=range(12), feature_names=feature_names, sorting=sorting, topics_per_chunk=5, n_words=30)

# mglearn library has a function which is used to print topics and their associated words. 
# topics is the number of topics
# feature names are the words
# sorting provided the sorted indices in which the words/features must be displayed
# topics_per_chunk is for printing purposes
# n_words is for the number of words to print per topic

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
trump         movement      woman         woman         year          
resign        woman         man           sexual        person        
franken       change        movement      harassment    time          
sexual        moment        christmas     assault       movement      
al            man           meryl         story         silence       
weinstein     article       streep        speak         2017          
harvey        backlash      sexual        man           breaker       
allegation    campaign      know          victim        magazine      
misconduct    video         like          abuse         cover         
accuser       read          make          come          woman         
woman         social        say           thank         mueller       
donald        like          harassment    stand         march         
senato

In [13]:
for topic in range(12):
    column_name = f"Topic_{topic}"  
    df[column_name] = document_topics[:, topic]
    
# this code makes a column for each topic where each row indicates the 
# probability of that tweet belonging to the topic that column is dedicated to.

In [19]:
df[['text', 'Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_6', 'Topic_7','Topic_8', 'Topic_9', 'Topic_10', 'Topic_11']]

Unnamed: 0,text,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11
0,! are in power. doing all they can to protect...,0.027961,0.027959,0.027960,0.692435,0.027959,0.027965,0.027959,0.027959,0.027959,0.027961,0.027959,0.027961
1,"! labels it ""powerful male pattern blindness"" ...",0.022495,0.022496,0.312065,0.022496,0.022498,0.022495,0.462979,0.022495,0.022495,0.022496,0.022495,0.022495
2,! somebodys been reading my tweets! those rega...,0.026046,0.026047,0.026048,0.026047,0.026047,0.026046,0.026047,0.026046,0.713486,0.026047,0.026047,0.026046
3,! the #resistance continues. welcome to 2018. ...,0.023190,0.023190,0.023190,0.023192,0.380803,0.023190,0.023189,0.191474,0.023189,0.023189,0.023190,0.219013
4,! this is the judge who gave brock turner 6 mo...,0.024431,0.024427,0.024427,0.024427,0.024427,0.024427,0.024427,0.024427,0.024427,0.731296,0.024429,0.024427
...,...,...,...,...,...,...,...,...,...,...,...,...,...
133563,~in an attempt to beat back his own personal #...,0.757006,0.022091,0.022090,0.022091,0.022090,0.022090,0.022091,0.022090,0.022090,0.022090,0.022091,0.022090
133564,~right now you are in a rush to push #goptaxsc...,0.026186,0.026185,0.026185,0.026187,0.026185,0.026186,0.711957,0.026184,0.026186,0.026187,0.026187,0.026185
133565,~whew... that wasnt easy to recall... working ...,0.021259,0.021257,0.021257,0.021258,0.021256,0.021257,0.021257,0.021256,0.021257,0.021256,0.322132,0.465299
133566,~with all this news....i will call that woman\...,0.024380,0.024379,0.024379,0.373289,0.382916,0.024379,0.024379,0.024379,0.024380,0.024379,0.024379,0.024379


# Finding and printing the top tweets for every topic with its probability of belonging to that topic

In [4]:
for i in range(12):  # Loop through Topic_0 to Topic_11
    column_name = f"Topic_{i}"
    max_value = df[column_name].max()
    max_value_index = df[column_name].idxmax()
    max_value_text = df.loc[max_value_index, "text"]

    print(f"Top Tweet for {column_name}: {max_value_text}")
    print(f"Probability of being in '{column_name}': {max_value}")
    print("\n")  # Add a line break between topics


Top Tweet for Topic_0: out of control #metoo women's right lawyer lisa bloom sought money from hillary clinton donors to pay women who alleged t
Probability of being in 'Topic_0': 0.8036041644167174


Top Tweet for Topic_1: never let you go meets speak #metoo story. 15 year old science geek a gets date raped by obsessed college student w
Probability of being in 'Topic_1': 0.8001339240167317


Top Tweet for Topic_2: summary tweet #metoo: i urge men pledge support to partners, daughters, sisters, friends and colleagues; sexual assaults criminal > report.
Probability of being in 'Topic_2': 0.8061891302890799


Top Tweet for Topic_3: #metoo began as great social change nearly always does, with individual acts of courage, later became hashtag, movement, reckoning. (time)
Probability of being in 'Topic_3': 0.7976866451602734


Top Tweet for Topic_4: time magazine "person of the year" nominations "short list": n korea kim jong un, robert mueller, colin kaepernick
Probability of being in 'Topi

# Top 10 Tweets for **Topic 0** with their probabilities of being in topic 0
- This topic conatins tweets about the political aspects of the movement such as congressmen, lawyers, senate members.

In [5]:
# Find the top 10 maximum values in 'Topic_0'
top_10_values = df.nlargest(10, 'Topic_0')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_0']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 0:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 0':\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 0:

1. Probability of being in Topic 0':	0.8036041644167174
Tweet:	out of control #metoo women's right lawyer lisa bloom sought money from hillary clinton donors to pay women who alleged t


2. Probability of being in Topic 0':	0.8020464643231907
Tweet:	so a republican congressman used 84k in taxpayer money to cover an sexual abuse case and media silence but al frank


3. Probability of being in Topic 0':	0.8017576817838863
Tweet:	gop congressman blake farenthold (tx) used 84k of taxpayer money to settle a sexual harassment suit, then blackball


4. Probability of being in Topic 0':	0.7994428817100809
Tweet:	new: top senate democrat chuck schumer says sen. al franken should resign as sexual misconduct allegations multiply. #metoo


5. Probability of being in Topic 0':	0.7987430727273437
Tweet:	democrats forced al franken to resign  now they must act on trump, moore." dems: use every single tool at your di


6. Probability of being i

# Top 10 Tweets for **Topic 1** with their probabilities of being in topic 1
- Topic 1 conatins tweets about empowering survivors, with ideas such as movies that highligh the plight of survivors, creating communities of strong people and discussing the role colleges and other institutions play in creating a "good" atmosphere

In [6]:
# Find the top 10 maximum values in 'Topic_1'
top_10_values = df.nlargest(10, 'Topic_1')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_1']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 1:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 1:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 1:

1. Probability of being in Topic 1:	0.8001339240167317
Tweet:	never let you go meets speak #metoo story. 15 year old science geek a gets date raped by obsessed college student w


2. Probability of being in Topic 1:	0.7990546150650133
Tweet:	hollywoods jus using #metoo 2 usher out the old players cuz i guarantee nothings changing n the culture to stop sexual battery and abuse.


3. Probability of being in Topic 1:	0.7947321199695845
Tweet:	[event] #metoo  now what? creating communities of strong people | dec 4th 7-9pm let your voice be heard at our panel disc


4. Probability of being in Topic 1:	0.7940585119392116
Tweet:	the thing is, a big % of the 90s kids watched charmed & saw paige, piper & phoebe as sisters & looked forward to jo


5. Probability of being in Topic 1:	0.7914724475634107
Tweet:	we need movie like star wars now more than ever to give hope that small groups of people fighting for good things c


6. Probability

# Top 10 Tweets for **Topic 2** with their probabilities of being in topic 2
- Topic 2 deals with sexual harrasment in workplaces such as offices as well as the mistreatment women face due to sexism and misogyny

In [7]:
# Find the top 10 maximum values in 'Topic_2'
top_10_values = df.nlargest(10, 'Topic_2')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_2']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 2:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 2:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 2:

1. Probability of being in Topic 2:	0.8061891302890799
Tweet:	summary tweet #metoo: i urge men pledge support to partners, daughters, sisters, friends and colleagues; sexual assaults criminal > report.


2. Probability of being in Topic 2:	0.8028853789054945
Tweet:	breaking news: multiple nbc employees file sex harassment claim against 30-rock christmas tree ("looks like a giant


3. Probability of being in Topic 2:	0.7967849675275585
Tweet:	#metoo shakes swedish athletics as 400m euro gold medallist moa hjelmer discloses rape & susanna kallur (world indoor recor


4. Probability of being in Topic 2:	0.7967073983609635
Tweet:	folks, maybe we should let his boss, tx ag ken paxton, know how we feel about this deplorable conduct from a public


5. Probability of being in Topic 2:	0.7965740582535719
Tweet:	yes men are seriously asking me this questions and treating them like threats in the wake of the glenn thrush news today #metoo


# Top 10 Tweets for **Topic 3** with their probabilities of being in topic 3
- Topic 3 talks about the impact of the metoo movement, calling the MeToo movement a great social change as well as the trajectory of the movement.

In [8]:
# Find the top 10 maximum values in 'Topic_3'
top_10_values = df.nlargest(10, 'Topic_3')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_3']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 3:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 3:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 3:

1. Probability of being in Topic 3:	0.7976866451602734
Tweet:	#metoo began as great social change nearly always does, with individual acts of courage, later became hashtag, movement, reckoning. (time)


2. Probability of being in Topic 3:	0.7973552085337224
Tweet:	a2: b/c we want to shine light on the experiences & reality of women's lives today. and help prevent anymore #metoo #teartalk


3. Probability of being in Topic 3:	0.7969008094049699
Tweet:	let's talk about how the lack of free long term mental health resources for sexual abuse survivors pushes them to h


4. Probability of being in Topic 3:	0.7956973225545325
Tweet:	sexual harassment, misconduct, rape ... in just 2 months, media reported 35 high profile men accused. a lot you won


5. Probability of being in Topic 3:	0.7923938259603083
Tweet:	1st saying woman is crazy let's tell the truth and shame the _ white woman don't care about #metoo they lie when th


6. Probab

# Top 10 Tweets for **Topic 4** with their probabilities of being in topic 4
- Topic 4 is about the time magzine's nominations for the 2017's person of the year where it had added and later named the "MeToo movement" as person of the year.

In [9]:
# Find the top 10 maximum values in 'Topic_4'
top_10_values = df.nlargest(10, 'Topic_4')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_4']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 4:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 4:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 4:

1. Probability of being in Topic 4:	0.8025956155688057
Tweet:	time magazine "person of the year" nominations "short list": n korea kim jong un, robert mueller, colin kaepernick


2. Probability of being in Topic 4:	0.8020768131016623
Tweet:	trump, xi, kim, mbs, the #metoo movement, patty jenkins, colin kaepernick, robert mueller, the dreamers and jeff bezos


3. Probability of being in Topic 4:	0.7977595617436035
Tweet:	zeta extends well wishes to soror anita hill, esq., as she leads the newly formed commission that will address sexu


4. Probability of being in Topic 4:	0.7972593134776249
Tweet:	anita hills accusations against clarence thomas launched the first #metoo moment 26 years ago, but the justice has faced li


5. Probability of being in Topic 4:	0.7960301872983279
Tweet:	yes to the dreamers, colin kaepernick, robert mueller, & the #metoo movement; hell fucking no to donald trump or kim jung un


6. Probability of being

# Top 10 Tweets for **Topic 5** with their probabilities of being in topic 5
- Topic 5 deals with prominent peoplein political parties and accusation on them with names like Roy Moore, Trump, lynne callahan bein talked about multiple time.

In [10]:
# Find the top 10 maximum values in 'Topic_5'
top_10_values = df.nlargest(10, 'Topic_5')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_5']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 5:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 5:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 5:

1. Probability of being in Topic 5:	0.8048973277872772
Tweet:	rt cnn "5 reasons you need to pay attention to the roy moore race:- trump's policy agenda- the 2018 election- t


2. Probability of being in Topic 5:	0.8040889995388389
Tweet:	#metoo alabama pedofile defeated in senate race! x cop lynne callahan falsified "2nd batch inventory sheet" frame


3. Probability of being in Topic 5:	0.8008622122455931
Tweet:	women are winning. robin wright will make final season of house of cards with no male lead & christian amanpour wil


4. Probability of being in Topic 5:	0.7994166253104229
Tweet:	if roy moore wins the alabama senate race, despite being a pedophile, i will lose faith in americans ability to ch


5. Probability of being in Topic 5:	0.7993784761599138
Tweet:	republican candidate christian judge roy moore didnt lose the alabama election last night from fake sex harassment


6. Probability of being in Topic 5:	0.799206895130

# Top 10 Tweets for **Topic 6** with their probabilities of being in topic 6
- Topic 6 deals with people criticing the movement as well as calling out people who are criticing the movement.

In [11]:
# Find the top 10 maximum values in 'Topic_6'
top_10_values = df.nlargest(10, 'Topic_6')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_6']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 6:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 6:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 6:

1. Probability of being in Topic 6:	0.8026404573378955
Tweet:	this bully thing turning into the newest #metoo. makes ya think the world really has gone dumbass mad for social media attention.


2. Probability of being in Topic 6:	0.7981093024103139
Tweet:	so this entire time on twitter for some reason i thought the #metoo hashtag very small icon was a vagina. nope. it's actually raised hands.


3. Probability of being in Topic 6:	0.7967826901771199
Tweet:	rt scottpresler: reminder: congress used $17 million from a taxpayer funded hush slush fund to pay off sexual ass


4. Probability of being in Topic 6:	0.7958155348843653
Tweet:	texas attorney generals top aide mocks pathetic #metoo movement and calls womens marchers c*nts | raw story https://


5. Probability of being in Topic 6:	0.7951977931429788
Tweet:	national enquirers ami defies #metoo, defends piggish behaviorthe #trump-aligned media company says its chief content offic

# Top 10 Tweets for **Topic 7** with their probabilities of being in topic 7
-  Topic 7 is about the demonstrations that were inspired by the MeToo movement such as the clothesline project and changes brought about due to the movement.

In [12]:
# Find the top 10 maximum values in 'Topic_7'
top_10_values = df.nlargest(10, 'Topic_7')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_7']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 7:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 7:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 7:

1. Probability of being in Topic 7:	0.7954416249155964
Tweet:	new story on npr: rows of hot pink paper, all saying #metoo  mexican artist monica mayer bri


2. Probability of being in Topic 7:	0.7946684265960687
Tweet:	lady tecniq news at the golden globes, hollywood men will joining actresses in wearing all black - a celebrity styl


3. Probability of being in Topic 7:	0.7903370647695275
Tweet:	rows of hot pink paper, all saying #metoomexican artist monica mayer brings her "clothesline project" to the nati


4. Probability of being in Topic 7:	0.7897404585189737
Tweet:	new article (is #metoo movement spoiling christmas? fox news host stirs up outrage online) has been published on


5. Probability of being in Topic 7:	0.7855444570818696
Tweet:	former days of our livesstar shares creepy story of sexual extortion #metoo - colombian soap opera star ximena duq


6. Probability of being in Topic 7:	0.7843512957036766
Tweet:	it's oliv

# Top 10 Tweets for **Topic 8** with their probabilities of being in topic 8
- Topic 8 highlights survivor stories and talks about dr chris rocks and his books teaching children about appropriate touches.

In [13]:
# Find the top 10 maximum values in 'Topic_8'
top_10_values = df.nlargest(10, 'Topic_8')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_8']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 8:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 8:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 8:

1. Probability of being in Topic 8:	0.7987571478570779
Tweet:	my dad sexually molested me at 9, 10, 11, 12 and 13 than i ran away. he has been dead for 15 years and just this ye


2. Probability of being in Topic 8:	0.7982640857823382
Tweet:	matt damon is a fucking idiot .. yes lets talk more about white privileged males that didnt do shit instead of ta


3. Probability of being in Topic 8:	0.7979958449738749
Tweet:	dr mark rocks  free colouring-in books  teaching kids about in appropriate touch google koala and bunny


4. Probability of being in Topic 8:	0.7979001612987376
Tweet:	dr chris rocks.  free colouring-in books  teaching kids about in appropriate touch google koala and bunny


5. Probability of being in Topic 8:	0.7973481496556633
Tweet:	twitter allows leftwing rosie to target, sexually harass & incite hate on conservative jew ben shapiro. #metoo


6. Probability of being in Topic 8:	0.7969093213006401
Tweet:	emminent 

# Top 10 Tweets for **Topic 9** with their probabilities of being in topic 9
- Topic 9 focuses on tarana burke the woman who created the movement in 2006 for empowering and uniting women of color to speak up about the abuse they faced

In [14]:
# Find the top 10 maximum values in 'Topic_9'
top_10_values = df.nlargest(10, 'Topic_9')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_9']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 9:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 9:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 9:

1. Probability of being in Topic 9:	0.8080671121239231
Tweet:	tarana burke  the woman who created #metoo 10 years ago  will kick off the countdown for time square's new year's eve ball dr


2. Probability of being in Topic 9:	0.8047832297140337
Tweet:	hello 2018!  #metoo creator tarana burke will push button to drop new years eve ball in times square:


3. Probability of being in Topic 9:	0.8032858829053441
Tweet:	new post (#metoo creator tarana burke set to drop new year's eve ball in times square) has been published on the da


4. Probability of being in Topic 9:	0.7991298147529519
Tweet:	#metoo creator will push button to drop new year's eve ball in times square - activist tarana burke founded the


5. Probability of being in Topic 9:	0.7985148113445433
Tweet:	#metoo creator tarana burke will  be pressing the button to start the minute countdown to 2018 and signal the ball drop in times


6. Probability of being in Topic 9:	0

# Top 10 Tweets for **Topic 10** with their probabilities of being in topic 10
- Topic 10 deals with people talking about understanding sexual harassment & assault experiences and supporting women and girls and keeping them safe

In [15]:
# Find the top 10 maximum values in 'Topic_10'
top_10_values = df.nlargest(10, 'Topic_10')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_10']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 10:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 10:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 10:

1. Probability of being in Topic 10:	0.793611711007424
Tweet:	1/ dear straight men trying to understand sexual harassment & assault: forget trying to fathom what your mother, wi


2. Probability of being in Topic 10:	0.7923935584202755
Tweet:	i really hope the people from alabama make the right choice and protect young innocent girls, like m


3. Probability of being in Topic 10:	0.7923742542110545
Tweet:	oh yeah plus she's really a bad actress in all honesty. honestly y'all go look at her stupid #metoo speech lol


4. Probability of being in Topic 10:	0.7918584572820225
Tweet:	liberals no longer believe in due process or innocent unless proved guilty. women are always right, can't lie. live


5. Probability of being in Topic 10:	0.7905751361817809
Tweet:	#metoodamn. watch: cnn host left speechless as ex-fox news analyst breaks her confidentiality agreement and exposes rup


6. Probability of being in Topic 10:	0.79016942082983

# Top 10 Tweets for **Topic 11** with their probabilities of being in topic 11
- Topic 11 deals with addressing insufficient sex education and the efforts taken to bring justice to the victims of Sexual assault and harassment.

In [16]:
# Find the top 10 maximum values in 'Topic_11'
top_10_values = df.nlargest(10, 'Topic_11')

# Create a list to store the associated text for the top values
top_10_texts = []

# Loop through the top 10 maximum values and extract their associated text
for index, row in top_10_values.iterrows():
    max_value = row['Topic_11']
    max_value_text = row['text']
    top_10_texts.append((max_value, max_value_text))
print(f"Top 10 Tweets with most probability of being in Topic 11:\n")

# Print the top 10 maximum values and their associated text
for i, (value, text) in enumerate(top_10_texts, 1):
    print(f"{i}. Probability of being in Topic 11:\t{value}")
    print(f"Tweet:\t{text}")
    print("\n")


Top 10 Tweets with most probability of being in Topic 11:

1. Probability of being in Topic 11:	0.8138730255854668
Tweet:	#metoo akron ohio star chamber orders hits: peggy pittenger=author 4x oh thoroughbred racing ass pres poisoned 16 d


2. Probability of being in Topic 11:	0.8049947057748271
Tweet:	#metoo bombshell! akron officials police conspiracy poison author 4x thoroughbred racing assn pres peggy pittenger


3. Probability of being in Topic 11:	0.7999570134574827
Tweet:	#education news: if we want to end rape culture, we need to address insufficient sex education, new book says - vic


4. Probability of being in Topic 11:	0.7979358681227745
Tweet:	#metoo justice from grave reweet akron officials conspiracy poison peggy pittenger=author 4x oh thoroughbred racing ass


5. Probability of being in Topic 11:	0.7978647110535646
Tweet:	#metoo justice from grave akron officials poison peggy pittenger=author 4x oh thoroughbred racing ass pres 4 helpin


6. Probability of being in Topic 

# Replacing the "created" column with the original "created" column after mapping with "text" column

In [17]:
df1=pd.read_csv("/Users/samriddhikumari/Desktop/PYthon/Projects/NLP_srs/checking.csv")
df1.shape

(142451, 26)

In [18]:
df2=df
df2.shape

(133567, 36)

In [19]:
# Create a mapping dictionary from df1
text_to_created_mapping = df1.set_index('text')['created'].to_dict()

# Use the mapping to replace values in df2's "created" column
df2['created'] = df2['text'].map(text_to_created_mapping)

# Print the updated df2
df2


Unnamed: 0.1,Unnamed: 0,text,text.1,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,...,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11
0,0,! are in power. doing all they can to protect...,False,1,TheLastWord,12/22/17 3:28,True,,944047000000000000,1.825035e+08,...,0.027960,0.692435,0.027959,0.027965,0.027959,0.027959,0.027959,0.027961,0.027959,0.027961
1,1,"! labels it ""powerful male pattern blindness"" ...",False,0,,11/29/17 15:10,True,,936000000000000000,,...,0.312065,0.022496,0.022498,0.022495,0.462979,0.022495,0.022495,0.022496,0.022495,0.022495
2,2,! somebodys been reading my tweets! those rega...,False,1,amjoyshow,12/16/17 16:54,True,,942076000000000000,7.311320e+17,...,0.026048,0.026047,0.026047,0.026046,0.026047,0.026046,0.713486,0.026047,0.026047,0.026046
3,3,! the #resistance continues. welcome to 2018. ...,False,5,,12/24/17 16:41,True,,944971000000000000,,...,0.023190,0.023192,0.380803,0.023190,0.023189,0.191474,0.023189,0.023189,0.023190,0.219013
4,4,! this is the judge who gave brock turner 6 mo...,False,1,RecallPersky,12/9/17 20:57,True,,939600000000000000,7.393050e+17,...,0.024427,0.024427,0.024427,0.024427,0.024427,0.024427,0.024427,0.731296,0.024429,0.024427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133562,133563,~in an attempt to beat back his own personal #...,False,0,,12/12/17 22:49,False,,940715000000000000,,...,0.022090,0.022091,0.022090,0.022090,0.022091,0.022090,0.022090,0.022090,0.022091,0.022090
133563,133564,~right now you are in a rush to push #goptaxsc...,False,1,AnalogMom,12/20/17 13:52,False,9.434710e+17,943479000000000000,4.625303e+07,...,0.026185,0.026187,0.026185,0.026186,0.711957,0.026184,0.026186,0.026187,0.026187,0.026185
133564,133565,~whew... that wasnt easy to recall... working ...,False,0,3G617,12/10/17 23:28,True,9.399990e+17,940000000000000000,1.171100e+08,...,0.021257,0.021258,0.021256,0.021257,0.021257,0.021256,0.021257,0.021256,0.322132,0.465299
133565,133566,~with all this news....i will call that woman\...,False,1,,12/8/17 21:28,True,,939245000000000000,,...,0.024379,0.373289,0.382916,0.024379,0.024379,0.024379,0.024380,0.024379,0.024379,0.024379


In [20]:
df=df2

# Visualising the Top tweets for every Topic wrt when it was tweeted

In [21]:
# Creating empty lists to store the maximum values, dates, and topic labels
max_values = []
dates = []
topic_labels = []

# Looping through Topic_0 to Topic_11
for i in range(12):
    column_name = f"Topic_{i}"
    max_value = df[column_name].max()
    max_value_index = df[column_name].idxmax()
    max_value_date = df.loc[max_value_index, "created"]

    max_values.append(max_value)
    dates.append(max_value_date)
    topic_labels.append(f"Topic_{i}")

# Creating a DataFrame for the scatter plot
scatter_df = pd.DataFrame({'Date': dates, 'Max Value': max_values, 'Topic': topic_labels})

# Sorting the DataFrame by 'Date' and reset the index
scatter_df['Date'] = pd.to_datetime(scatter_df['Date'])  # Ensure 'Date' is in datetime format
scatter_df = scatter_df.sort_values(by='Date')
scatter_df.reset_index(drop=True, inplace=True)

# Create the scatter plot
fig = px.scatter(
    scatter_df,
    x='Date',
    y='Max Value',
    color='Topic', 
    title='Top Tweets of each Topic Over Time',
    labels={'Date': 'Date and Time', 'Max Value': 'Maximum Value'},
    category_orders={'Topic': topic_labels}  # Setting the order of topics for labels
)

fig.update_xaxes(title_text='Date and Time')
fig.update_yaxes(title_text='Maximum Value')
fig.show()


  scatter_df['Date'] = pd.to_datetime(scatter_df['Date'])  # Ensure 'Date' is in datetime format


# Visualising the Top 10 tweets for every topic wrt when it was tweeted

In [22]:
# Create empty lists to store the maximum values, dates, and topic labels
max_values = []
dates = []
topic_labels = []

# Loop through Topic_0 to Topic_11
top_n = 10 # Set the number of top values to display (e.g., top 5)
for i in range(12):
    column_name = f"Topic_{i}"
    top_values = df.nlargest(top_n, column_name)  # Get the top n values for the topic
    for _, row in top_values.iterrows():
        max_values.append(row[column_name])
        dates.append(row['created'])
        topic_labels.append(f"Topic_{i}")

# Create a DataFrame for the scatter plot
scatter_df = pd.DataFrame({'Date': dates, 'Max Value': max_values, 'Topic': topic_labels})

# Sort the DataFrame by 'Date' and reset the index
scatter_df['Date'] = pd.to_datetime(scatter_df['Date'])  # Ensure 'Date' is in datetime format
scatter_df = scatter_df.sort_values(by='Date')
scatter_df.reset_index(drop=True, inplace=True)

# Create the scatter plot
fig = px.scatter(
    scatter_df,
    x='Date',
    y='Max Value',
    color='Topic',
    title='Top 10 Tweets for each Topic Over Time',
    labels={'Date': 'Date and Time', 'Max Value': 'Maximum Value'},
    category_orders={'Topic': topic_labels}  # Set the order of topics for labels
)

fig.update_xaxes(title_text='Date and Time')
fig.update_yaxes(title_text='Maximum Value')
fig.show()



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [23]:
# Creating a DataFrame for the scatter plot with date parsing
scatter_df = pd.DataFrame({'Date': pd.to_datetime(dates), 'Max Value': max_values, 'Topic': topic_labels})

# Sorting the DataFrame by 'Date' and reset the index
scatter_df = scatter_df.sort_values(by='Date')
scatter_df.reset_index(drop=True, inplace=True)



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [24]:
# Creating empty lists to store the maximum values, dates, and topic labels
max_values = []
dates = []
topic_labels = []

# Looping through Topic_0 to Topic_11
for i in range(12):
    column_name = f"Topic_{i}"
    max_value = df[column_name].max()
    max_value_index = df[column_name].idxmax()
    max_value_date = df.loc[max_value_index, "created"]

    max_values.append(max_value)
    dates.append(max_value_date)
    topic_labels.append(f"Topic_{i}")

# Creating a DataFrame for the scatter plot with date parsing
scatter_df = pd.DataFrame({'Date': pd.to_datetime(dates), 'Max Value': max_values, 'Topic': topic_labels})

# Sorting the DataFrame by 'Date' and reset the index
scatter_df = scatter_df.sort_values(by='Date')
scatter_df.reset_index(drop=True, inplace=True)


# Create the scatter plot
fig = px.scatter(
    scatter_df,
    x='Date',
    y='Max Value',
    color='Topic', 
    title='Top Tweets of each Topic Over Time',
    labels={'Date': 'Date and Time', 'Max Value': 'Maximum Value'},
    category_orders={'Topic': topic_labels}  # Setting the order of topics for labels
)

fig.update_xaxes(title_text='Date and Time')
fig.update_yaxes(title_text='Maximum Value')
fig.show()



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.

