# Imports and packages

In [7]:
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/DS_I"

Mounted at /content/drive
/content/drive/MyDrive/DS_I


In [8]:
!pip install --upgrade vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 7.6 MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [66]:
## helpful packages

import datetime
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
import re
import string

## nltk imports
import nltk
### uncomment and run these lines if you haven't downloaded relevant nltk add-ons yet
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')
from nltk import pos_tag
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

## spacy imports
import spacy
### uncomment and run the below line if you haven't loaded the en_core_web_sm library yet
#! python3 -m spacy download en_core_web_sm
import en_core_web_sm
nlp = en_core_web_sm.load()

## vectorizer
from sklearn.feature_extraction.text import CountVectorizer

## sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## lda
from gensim import corpora
import gensim

## repeated printouts and wide-format text
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_colwidth', None)

from collections import Counter

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Text analysis of Department of Justice (DOJ) press releases (50 points total)

- For background:

    - DOJ is the federal law enforcement agency responsible for federal prosecutions; this contrasts with the local prosecutions in the Cook County dataset we analyzed earlier. Here's a short explainer on which crimes get prosecuted federally versus locally: https://www.criminaldefenselawyer.com/resources/criminal-defense/federal-crime/state-vs-federal-crimes.htm#:~:text=Federal%20criminal%20prosecutions%20are%20handled,of%20state%20and%20local%20law. 
    - Here's the Kaggle that contains the data: https://www.kaggle.com/jbencina/department-of-justice-20092018-press-releases 
    - Here's the code the dataset creator used to scrape those press releases here if you're interested: https://github.com/jbencina/dojreleases
    
- See here for a codebook: https://docs.google.com/spreadsheets/d/1UopmSvFGrwJvz_c3Plh32Yxkqwff64oS_CcpfATOV8k/edit?usp=sharing

In [10]:
## first, unzip the combined.json.zip file
## then, run this code to load the unzipped json file and convert to a dataframe
## and convert some of the attributes from lists to values
## make sure to change the pathname if you need to
doj = pd.read_json("pset5_data/combined.json", lines = True)

## due to json, topics are in a list so remove them and concatenate with ;
doj['topics_clean'] = ["; ".join(topic) 
                      if len(topic) > 0 else "No topic" 
                      for topic in doj.topics]

## similarly with components
doj['components_clean'] = ["; ".join(comp) 
                           if len(comp) > 0 else "No component" 
                           for comp in doj.components]

## drop older columns from data
doj = doj[['id', 'title', 'contents', 'date', 'topics_clean', 
           'components_clean']].copy()

## 1. Tagging and sentiment scoring (16 points)

Focus on the following press release: `id` == "17-1204" about this pharmaceutical kickback prosecution: https://www.forbes.com/sites/michelatindera/2017/11/16/fentanyl-billionaire-john-kapoor-to-plead-not-guilty-in-opioid-kickback-case/?sh=21b8574d6c6c 

The `contents` column is the one we're treating as a document. You may need to to convert it from a pandas series to a single string.

We'll call the raw string of this press release `pharma`

In [11]:
doj.columns

Index(['id', 'title', 'contents', 'date', 'topics_clean', 'components_clean'], dtype='object')

In [12]:
## your code to subset to one press release and take the string
pharma = doj[doj.id == '17-1204']['contents'][4909]
pharma

'The founder and majority owner of Insys Therapeutics Inc., was arrested today and charged with leading a nationwide conspiracy to profit by using bribes and fraud to cause the illegal distribution of a Fentanyl spray intended for cancer patients experiencing breakthrough pain.\xa0"More than 20,000 Americans died of synthetic opioid overdoses last year, and millions are addicted to opioids. And yet some medical professionals would rather take advantage of the addicts than try to help them," said Attorney General Jeff Sessions. "This Justice Department will not tolerate this.\xa0 We will hold accountable anyone – from street dealers to corporate executives -- who illegally contributes to this nationwide epidemic.\xa0 And under the leadership of President Trump, we are fully committed to defeating this threat to the American people.”John N. Kapoor, 74, of Phoenix, Ariz., a current member of the Board of Directors of Insys, was arrested this morning in Arizona and charged with RICO conspi

### 1.1 part of speech tagging (3 points)

A. Preprocess the `pharma` press release to remove all punctuation / digits (so can use `.isalpha()` to subset)

B. With the preprocessed press release from part A, use the part of speech tagger within nltk to tag all the words in that one press release with their part of speech. 

C. Using the output from B, extract the adjectives and sort those adjectives from most occurrences to fewest occurrences. Print a dataframe with the 5 most frequent adjectives and their counts in the `pharma` release. See here for a list of the names of adjectives within nltk: https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/

**Resources**:

- Documentation for .isalpha(): https://www.w3schools.com/python/ref_string_isalpha.asp
- `process_step1` function here has an example of tokenizing and filtering to words where .isalpha() is true: 
https://github.com/rebeccajohnson88/PPOL564_slides_activities/blob/main/activities/fall_22/solutions/09_textasdata_partII_topicmodeling_solution.ipynb 
- Part of speech tagging section of this code: 
https://github.com/rebeccajohnson88/PPOL564_slides_activities/blob/main/activities/fall_22/solutions/08_textasdata_partI_textmining_solutions.ipynb



In [13]:
## your code here to restrict to alpha
# Preprocess the pharma press release to remove all punctuation / digits (so can use .isalpha() to subset)
pharma_words = [word for word in pharma.split(" ") if word.isalpha()]

In [14]:
## your code here for part of speech tagging
pharma_tokens = pos_tag(pharma_words)
adj_tags = ['JJ', 'JJR', 'JJS']
pharma_adjs = [token[0] for token in pharma_tokens if token[1] in adj_tags]
freq_adjs=pd.DataFrame(nltk.FreqDist(pharma_adjs).items(), columns=['word', 'frequency'])
freq_adjs.sort_values(['frequency'], ascending=False).head(5)

Unnamed: 0,word,frequency
9,former,8
0,nationwide,4
28,addictive,3
25,opioid,3
8,other,3


## 1.2 named entity recognition (3 points)



A. Using the original `pharma` press release (so the one before stripping punctuation/digits), use spaCy to extract all named entities from the press release.

B. Print the unique named entities with the tag: `LAW`. Here's some background on what RICO means: https://www.justia.com/criminal/docs/rico/ 

**Resources**:
- For parts A and B: named entity recognition part of this code: 
https://github.com/rebeccajohnson88/PPOL564_slides_activities/blob/main/activities/fall_22/solutions/08_textasdata_partI_textmining_solutions.ipynb

In [15]:
len(pharma)

9252

In [16]:
## your code here 
#A
pharma_spacy = nlp(pharma).ents

In [17]:
#B
law_ents = [ent for ent in pharma_spacy if ent.label_ == "LAW"]
np.unique(list(law_ents))

array([RICO, the Controlled Substances Act, RICO], dtype=object)

C. You want to extract the possible sentence lengths the CEO is facing; pull out the named entities with (1) the label `DATE` and (2) that contain the word year or years. Print these named entities.

**Hint:**  
You may want to use the `re` module for the second part.

In [18]:
len(pharma_spacy)

144

In [19]:
## your code here
len_ents = [ent for ent in pharma_spacy if (ent.label_ == "DATE") & ( str.__contains__(ent.text, 'year'))]
len_ents

[last year, three years, three years]

D. Parse the pharma string at the sentence level. Note that this involves more than just splitting on each `.`; for full credit, add at least one additional delimiter that marks the end of the sentence.

Then, using those sentences, pull and print the original sentences from the press releases where those year lengths are mentioned. Describe in your own words (1 sentence) what length of sentence (prison) and probation (supervised release) the CEO may be facing if convicted after this indictment (if there are multiple lengths mentioned describe the maximum). 

**Hint:**  
You may want to use re.search or re.findall 

**Resources**:

- re.search and re.findall examples here for filtering to ones containing year (multiple approaches; some need not involve `re`): 
https://github.com/rebeccajohnson88/PPOL564_slides_activities/blob/main/activities/fall_22/solutions/07_regex_solutions.ipynb


In [20]:
## your code here
pharma_splitted = re.split(r'[,.]', pharma)
pattern = "|".join(str(v) for v in len_ents)
sentence_w_year = [s for s in pharma_splitted if re.search(pattern, s) != None]
sentence_w_year
# re.search(pattern, pharma_splitted)

['000 Americans died of synthetic opioid overdoses last year',
 ' three years of supervised release and a fine of $250',
 ' three years of supervised release and a $25']

## 1.3 sentiment analysis  (10 points)

- Sentiment analysis section of this script: 
https://github.com/rebeccajohnson88/PPOL564_slides_activities/blob/main/activities/fall_22/solutions/08_textasdata_partI_textmining_solutions.ipynb


A. Subset the press releases to those labeled with one of three topics via `topics_clean`: Civil Rights, Hate Crimes, and Project Safe Childhood. We'll call this `doj_subset` going forward and it should have 717 rows.



In [21]:
## your code here for subsetting
topics = ['Civil Rights', 'Hate Crimes', 'Project Safe Childhood']
doj_subset = doj[doj.topics_clean.isin(topics)].reset_index(drop=True)
len(doj_subset)

717

B. Write a function that takes one press release string as an input and:

- Removes named entities from each press release string (**Hint:** you may want to use `re.sub` with an or condition)
- Scores the sentiment of the entire press release using the `SentimentIntensityAnalyzer` and `polarity_scores`
- Returns the length-four (negative, positive, neutral, compound) sentiment dictionary (any order is fine)

Apply that function to each of the press releases in `doj_subset`. 

**Hints**: 

- I used a function + list comprehension to execute and it takes about 30 seconds on my local machine; if it's taking a very long time, you may want to check your code for inefficiencies. If you can't fix those, for partial credit on this part/full credit on remainder, you can take a small random sample of the 717


In [22]:
# your code here defining the function
def press_sen_scores(row):

    content = str(row.contents)    
    sim_spacy = nlp(content)
    ents_pattern = "|".join(str(v) for v in sim_spacy.ents)
    contents_sub = re.sub(ents_pattern, "", content)

    sent_obj = SentimentIntensityAnalyzer()

    return sent_obj.polarity_scores(contents_sub)

In [23]:
# your code here executing the function
sam = doj_subset.sample(n=1)
type(press_sen_scores(sam))

dict

C. Add the four sentiment scores to the `doj_subset` dataframe to create a dataframe: `doj_subset_wscore`. Sort from highest neg to lowest neg score (so most negative to least negative) and print the `id`, `contents`, and `neg` columns of the two most negative press releases. 

Notes:

- Don't worry if your sentiment score differs slightly from our output on GitHub; differences in preprocessing can lead to diff scores

In [24]:
doj_subset_wscores = doj_subset.copy(deep=True)
doj_subset_wscores['contents'] = doj_subset_wscores['contents'].str.replace("[\)\(]",'', regex=True)
doj_subset_wscores['scores'] = doj_subset_wscores.apply(press_sen_scores, axis = 1)

In [25]:
doj_subset_wscores = pd.concat([doj_subset_wscores, pd.json_normalize(doj_subset_wscores['scores'])], axis=1)
doj_subset_wscores = doj_subset_wscores.drop(['scores'], axis=1)
doj_subset_wscores = doj_subset_wscores.sort_values(by=['neg'], ascending=False).reset_index(drop=True)
doj_subset_wscores[['id', 'contents', 'neg']].head(n=2)

Unnamed: 0,id,contents,neg
0,14-248,"The Department of Justice announced that this morning John W. Ng, 58, of Albuquerque, N.M., made his initial appearance in federal court on a criminal complaint charging him with a hate crime offense. This charge is related to anti-Semitic threats Ng made against a Jewish woman who owns and operates the Nosh Jewish Delicatessen and Bakery in Albuquerque. Ng was arrested by the FBI on March 7, 2014, based on a criminal complaint alleging that he interfered with the victim’s federally protected rights by threatening her and interfering with her business because of her religion. According to the criminal complaint, between Jan. 22, 2014, and Feb. 8, 2014, Ng allegedly posted threatening anti-Semitic notes on and in the vicinity of the victim’s business. A criminal complaint merely establishes probable cause, and Ng is presumed innocent unless proven guilty. If convicted on the offense charged in the criminal complaint, Ng faces a maximum statutory penalty of one year in prison. This matter was investigated by the Albuquerque Division of the FBI and is being prosecuted by Assistant U.S. Attorney Mark T. Baker of the U.S. Attorney’s Office for the District of New Mexico and Trial Attorney AeJean Cha of the U.S. Department of Justice’s Civil Rights Division.",0.316
1,13-312,"John Hall, 27, an Aryan Brotherhood member and inmate at the Federal Correctional Institution FCI in Seagoville, Texas, was sentenced today by U.S. District Judge Reed O’Connor after pleading guilty to violating the Matthew Shepard and James Byrd Jr. Hate Crimes Prevention Act stemming from his assault of a fellow inmate, whom he believed to be gay, the Department of Justice announced. Hall assaulted his fellow inmate with a dangerous weapon, causing bodily injury to the victim on Dec. 20, 2011. Hall was sentenced to serve 71 months in prison to be served consecutively with the sentence he is currently serving. The assault occurred on Dec. 20, 2011, inside the FCI Seagoville when Hall targeted and attacked the victim, a fellow inmate, because he believed the victim was gay or involved in a sexual relationship with another male inmate. Hall repeatedly punched, kicked and stomped on the victim’s face with his shod feet, a dangerous weapon, while yelling a homophobic slur. The victim lost consciousness during the assault and suffered multiple lacerations to his face. The victim also sustained a fractured eye socket, lost a tooth, fractured other teeth and was treated at a hospital for the injuries he sustained during Hall’s unprovoked attack. Hall pleaded guilty to violating the Matthew Shepard and James Byrd Jr. Hate Crimes Prevention Act on Nov. 8, 2012. “Brutality and violence based on sexual orientation has no place in a civilized society,” said Thomas E. Perez, Assistant Attorney General for the Civil Rights Division. “The Justice Department is committed to using all the tools in our law enforcement arsenal, including the Matthew Shepard and James Byrd Jr. Hate Crimes Prevention Act, to prosecute acts motivated by hate.” “This prosecution sends a clear message that this office, in partnership with attorneys in the department’s Civil Rights Division, will prioritize and aggressively prosecute hate crimes and others civil rights violations in North Texas,” said U.S. Attorney Sarah R. Saldaña of the Northern District of Texas. This case was investigated by the FBI Dallas Division. The case was prosecuted by Assistant U.S. Attorney Errin Martin and Trial Attorney Adriana Vieco of the Civil Rights Division.",0.306


In [26]:
doj_subset_wscores.columns

Index(['id', 'title', 'contents', 'date', 'topics_clean', 'components_clean',
       'neg', 'neu', 'pos', 'compound'],
      dtype='object')

D. With the dataframe from part C, find the mean compound sentiment score for each of the three topics in `topics_clean` using group_by and agg.

E. Add a 1 sentence interpretation of why we might see the variation in scores (remember that compound is a standardized summary where -1 is most negative; +1 is most positive)


In [27]:
## agg and find the mean compound score by topic
doj_subset_wscores.groupby(by=['topics_clean']).agg(np.mean)['compound']


topics_clean
Civil Rights             -0.098619
Hate Crimes              -0.934388
Project Safe Childhood   -0.666987
Name: compound, dtype: float64

In [28]:
# YOUR INTERPRETATION HERE


In the result we can found there's a huge difference between these 3 types. 
Sepecially the super negetive value in the **Hate Crimes**. Out of my suprise, the **Civil Right** category have a nearly neutural outcome. In my hypothesis, the **Project Safe Childhood** should be the least negative one. Afterall, this type this related to children, I thought the wording in contents may try to avoid the negative phrasing.


# 2. Topic modeling (25 points)

For this question, use the `doj_subset_wscores` data that is restricted to civil rights, hate crimes, and project safe childhood and with the sentiment scores added


## 2.1 Preprocess the data by removing stopwords, punctuation, and non-alpha words (5 points)

A. Write a function that:

- Takes in a single raw string in the `contents` column from that dataframe
- Does the following preprocessing steps:

    - Converts the words to lowercase
    - Removes stopwords, adding the custom stopwords in the code cell below to the default stopwords list
    - Only retains alpha words (so removes digits and punctuation)
    - Only retains words 4 characters or longer
    - Uses the snowball stemmer from nltk to stem

- Returns a joined preprocessed string (so if press release is something like "The CEO was indicted" it might return "ceo indict" 
    
B. Use `apply` or list comprehension to execute that function and create a new column in the data called `processed_text`. Note: there will be a deduction if your code uses a non-list comprehension for loop that uses append.
    
C. Print the `id`, `contents`, and `processed_text` columns for the following press releases:

id = 16-718 (this case: https://www.seattletimes.com/nation-world/doj-miami-police-reach-settlement-in-civil-rights-case/)

id = 16-217 (this case: https://www.wlbt.com/story/32275512/three-mississippi-correctional-officers-indicted-for-inmate-assault-and-cover-up/)
    
**Resources**:

- Here's code examples for the snowball stemmer: https://www.geeksforgeeks.org/snowball-stemmer-nlp/
- Here's code with topic modeling steps: 
https://github.com/rebeccajohnson88/PPOL564_slides_activities/blob/main/activities/fall_22/solutions/09_textasdata_partII_topicmodeling_solution.ipynb

In [29]:
custom_doj_stopwords = ["civil", "rights", "division", "department", "justice",
                        "office", "attorney", "district", "case", "investigation", "assistant",
                       "trial", "assistance", "assist"]

In [30]:
## your code defining a text processing function
def contents_preprocess(content):
    # Converts the words to lowercase
    sam_str = str(content).lower()

    # Removes stopwords, adding the custom stopwords in the code cell below to the default stopwords list
    list_stopwords = stopwords.words("english") + custom_doj_stopwords
    str_list = [word for word in word_tokenize(sam_str) if word not in list_stopwords]

    stop_pattern = " | ".join(list_stopwords)
    sam_str = re.sub(stop_pattern, " ", sam_str)

    # Only retains alpha words (so removes digits and punctuation)
    str_list = [word for word in str_list if word.isalpha()]

    # Only retains words 4 characters or longer
    str_list = [word for word in str_list if len(word)>3]

    # Uses the snowball stemmer from nltk to stem
    stemmer = SnowballStemmer('english')
    sam_str = " ".join([stemmer.stem(v) for v in str_list])

    return sam_str

In [31]:
## your code executing the function
doj_subset_wscores['processed_text'] = doj_subset_wscores.contents.apply(contents_preprocess)

In [32]:
doj_subset_wscores.columns

Index(['id', 'title', 'contents', 'date', 'topics_clean', 'components_clean',
       'neg', 'neu', 'pos', 'compound', 'processed_text'],
      dtype='object')

In [33]:
## your code showing the examples
doj_subset_wscores[doj_subset_wscores.id.isin(['16-718', '16-217'])][[ 'id', 'contents', 'processed_text' ]]


Unnamed: 0,id,contents,processed_text
2,16-718,"In a nine-count indictment unsealed today, two Mississippi correctional officers were charged with beating an inmate and a third was charged with helping to cover it up. The indictment charged Lawardrick Marsher, 28, and Robert Sturdivant, 47, officers at Mississippi State Penitentiary, in Parchman, Mississippi, with a beating that included kicking, punching and throwing the victim to the ground. Marsher and Sturdivant were charged with violating the right of K.H., a convicted prisoner, to be free from cruel and unusual punishment. Sturdivant was also charged with failing to intervene while Marsher was punching and beating K.H. The indictment alleges that their actions involved the use of a dangerous weapon and resulted in bodily injury to the victim. A third officer, Deonte Pate, 23, was charged along with Marsher and Sturdivant for conspiring to cover up the beating. The indictment alleges that all three officers submitted false reports and that all three lied to the FBI. If convicted, Marsher and Sturdivant face a maximum sentence of 10 years in prison on the excessive force charges. Each of the three officers faces up to five years in prison on the conspiracy and false statement charges, and up to 20 years in prison on the false report charges. An indictment is merely an accusation, and the defendants are presumed innocent unless and until proven guilty. This case is being investigated by the FBI’s Jackson Division, with the cooperation of the Mississippi Department of Corrections. It is being prosecuted by Assistant U.S. Attorney Robert Coleman of the Northern District of Mississippi and Trial Attorney Dana Mulhauser of the Civil Rights Division’s Criminal Section. Marsher Indictment",indict unseal today mississippi correct offic charg beat inmat third charg help cover indict charg lawardrick marsher robert sturdiv offic mississippi state penitentiari parchman mississippi beat includ kick punch throw victim ground marsher sturdiv charg violat right convict prison free cruel unusu punish sturdiv also charg fail interven marsher punch beat indict alleg action involv danger weapon result bodili injuri victim third offic deont pate charg along marsher sturdiv conspir cover beat indict alleg three offic submit fals report three lie convict marsher sturdiv face maximum sentenc year prison excess forc charg three offic face five year prison conspiraci fals statement charg year prison fals report charg indict mere accus defend presum innoc unless proven guilti investig jackson cooper mississippi correct prosecut robert coleman northern mississippi dana mulhaus crimin section marsher indict
688,16-217,"The Justice Department has reached a comprehensive settlement agreement with the city of Miami and the Miami Police Department MPD resolving the Justice Department’s investigation of officer-involved shootings by MPD officers, announced Principal Deputy Assistant Attorney General Vanita Gupta, head of the Justice Department’s Civil Rights Division and U.S. Attorney Wifredo A. Ferrer of the Southern District of Florida. The settlement, which was approved by Miami’s city commission today and will go into effect when the agreement is signed by all parties, resolves claims stemming from the Justice Department’s investigation into officer-involved shootings by MPD officers, which was conducted under the Violent Crime Control and Law Enforcement Act of 1994. The investigation’s findings, issued in July 2013, identified a pattern or practice of excessive use of force through officer-involved shootings in violation of the Fourth Amendment of the Constitution. The city’s compliance with the settlement will be monitored by an independent reviewer, former Tampa, Florida, Police Chief Jane Castor. Under the settlement agreement, the city will implement comprehensive reforms to ensure constitutional policing and support public trust. The settlement agreement is designed to minimize officer-involved shootings and to more effectively and quickly investigate officer-involved shootings that do occur, through measures that include: “This settlement represents a renewed commitment by the city of Miami and Chief Rodolfo Llanes to provide constitutional policing for Miami residents and to protect public safety through sustainable reform,” said Principal Deputy Assistant Attorney General Gupta. “The agreement will help to strengthen the relationship between the MPD and the communities they serve by improving accountability for officers who fire their weapons unlawfully, and provides for community participation in the enforcement of this agreement.” “Today's agreement is the result of a joint effort between the Department of Justice and the City of Miami to ensure that the Miami Police Department continues its efforts to make our community safe while protecting the sacred Constitutional rights of all of our citizens,” said U.S. Attorney Ferrer. “Through oversight and communication, the agreement seeks to make permanent the positive changes that former Chief Orosa and Chief Llanes have made, and we applaud the City Commission’s vote.” The settlement agreement builds upon important reforms implemented by the city since the Justice Department issued its findings, including: The investigation was conducted by attorneys and staff from the Civil Rights Division’s Special Litigation Section and the Civil Division of the U. S. Attorney’s Office of the Southern District of Florida.",reach comprehens settlement agreement citi miami miami polic resolv shoot offic announc princip deputi general vanita gupta head wifredo ferrer southern florida settlement approv miami citi commiss today effect agreement sign parti resolv claim stem shoot offic conduct violent crime control enforc find issu juli identifi pattern practic excess forc shoot violat fourth amend constitut citi complianc settlement monitor independ review former tampa florida polic chief jane castor settlement agreement citi implement comprehens reform ensur constitut polic support public trust settlement agreement design minim shoot effect quick investig shoot occur measur includ settlement repres renew commit citi miami chief rodolfo llane provid constitut polic miami resid protect public safeti sustain reform said princip deputi general gupta agreement help strengthen relationship communiti serv improv account offic fire weapon unlaw provid communiti particip enforc today agreement result joint effort citi miami ensur miami polic continu effort make communiti safe protect sacr constitut citizen said ferrer oversight communic agreement seek make perman posit chang former chief orosa chief llane made applaud citi commiss settlement agreement build upon import reform implement citi sinc issu find includ conduct attorney staff special litig section southern florida


## 2.2 Create a document-term matrix from the preprocessed press releases and to explore top words (5 points)

A. Use the `create_dtm` function I provide (alternately, feel free to write your own!) and create a document-term matrix using the preprocessed press releases; make sure metadata contains the following columns: `id`, `compound` sentiment column you added, and the `topics_clean` column

B. Print the top 10 words for press releases with compound sentiment in the top 5% (so the most positive sentiment)

C. Print the top 10 words for press releases with compound sentiment in the bottom 5% (so the most negative sentiment)

**Hint**: for these, remember the pandas quantile function from pset two.  

D. Print the top 10 words for press releases in each of the three `topics_clean`

For steps B - D, to receive full credit, write a function `get_topwords` that helps you avoid duplicated code when you find top words for the different subsets of the data. There are different ways to structure it but one way is to feed it subsetted data (so data subsetted to one topic etc.) and for it to get the top words for that subset.

**Resources**:

- Here contains an example of applying the create_dtm function: 
https://github.com/rebeccajohnson88/PPOL564_slides_activities/blob/main/activities/fall_22/solutions/09_textasdata_partII_topicmodeling_solution.ipynb


In [34]:
def create_dtm(list_of_strings, metadata):
    vectorizer = CountVectorizer(lowercase = True)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), 
        columns=vectorizer.get_feature_names())
    dtm_dense_named_withid = pd.concat([metadata.reset_index(drop=True).add_prefix('metadata_'), dtm_dense_named], axis = 1)
    return dtm_dense_named_withid

In [35]:
## your code 
dtm_doj = create_dtm(list_of_strings = doj_subset_wscores.processed_text, 
                     metadata=doj_subset_wscores[[ 'id', 'compound', 'topics_clean']])
dtm_doj.metadata_compound

0     -0.9950
1     -0.9983
2     -0.9964
3     -0.9986
4     -0.9968
        ...  
712    0.9854
713    0.7003
714    0.8519
715    0.7717
716    0.8481
Name: metadata_compound, Length: 717, dtype: float64

In [36]:
def get_topwords(subset, top_n):
    top_terms = subset[[col for col in subset.columns
                      if "metadata" not in col]].sum(axis = 0)

    return top_terms.sort_values(ascending = False).head(top_n)


In [37]:
#BPrint the top 10 words for press releases with compound sentiment in the top 5% (so the most positive sentiment)

top5per = dtm_doj[ dtm_doj.metadata_compound > dtm_doj.metadata_compound.quantile(0.95)]

get_topwords(top5per, 10)

agreement    171
state        119
enforc       116
ensur        105
disabl       102
student       86
servic        86
court         85
general       82
communiti     81
dtype: int64

In [38]:
# C. Print the top 10 words for press releases with compound sentiment in the bottom 5% (so the most negative sentiment)

bottom5per = dtm_doj[ dtm_doj.metadata_compound < dtm_doj.metadata_compound.quantile(0.05)]

get_topwords(bottom5per, 10)

assault     190
victim      166
crime       166
hate        128
defend      124
offic       111
sentenc     104
anderson     93
guilti       92
charg        92
dtype: int64

In [39]:
# D. Print the top 10 words for press releases in each of the three topics_clean
topics = list(dtm_doj.metadata_topics_clean.unique())
for topic in topics:
    subset = dtm_doj[ dtm_doj.metadata_topics_clean == topic]
    print(topic + ':')
    print(get_topwords(subset, 10))
    print('\n')


Hate Crimes:
victim      590
crime       533
prosecut    476
hate        472
defend      459
sentenc     455
charg       452
guilti      430
feder       426
said        424
dtype: int64


Civil Rights:
offic        627
hous         620
discrimin    541
enforc       531
disabl       509
said         497
feder        475
violat       470
state        443
general      408
dtype: int64


Project Safe Childhood:
child          1018
exploit         698
sexual          570
safe            476
childhood       472
project         472
pornographi     447
children        416
crimin          404
prosecut        374
dtype: int64




## 2.3 Estimate a topic model using those preprocessed words (5 points)

A. Going back to the preprocessed words from part 2.1, estimate a topic model with 3 topics, since you want to see if the unsupervised topic models recover different themes for each of the three manually-labeled topics (civil rights; hate crimes; project safe childhood). You have free rein over the other topic model parameters beyond the number of topics.

B. After estimating the topic model, print the top 15 words in each topic.

**Hints and Resources**:

- Same topic modeling resources linked to above
- Make sure to use the `random_state` argument within the model so that the numbering of topics does not move around between runs of your code

In [40]:
adoj_lda = doj_subset_wscores[doj_subset_wscores.processed_text != ""].copy(deep=True)

tokenized_text = [wordpunct_tokenize(one_text) 
                for one_text in 
                adoj_lda.processed_text]

In [41]:
## preprocess and estimate topicmod
### create dictionary
text_proc_dict = corpora.Dictionary(tokenized_text)
### filter dictionary- using 2% as bounds
text_proc_dict.filter_extremes(no_below = round(adoj_lda.shape[0]*0.02),
                             no_above = round(adoj_lda.shape[0]*0.98))

### create corpus from dictionary
corpus_fromdict_proc = [text_proc_dict.doc2bow(one_text) 
                       for one_text in tokenized_text]
### estimate model
n_topics = 3
ldamod_proc = gensim.models.ldamodel.LdaModel(corpus_fromdict_proc, 
                                         num_topics = n_topics, id2word=text_proc_dict, 
                                         passes=6, alpha = 'auto',
                                        per_word_topics = True, random_state = 1988)

### print topics and words
topics = ldamod_proc.print_topics(num_words = 15)
for topic in topics:
    print(topic)



(0, '0.016*"child" + 0.011*"exploit" + 0.009*"state" + 0.008*"disabl" + 0.008*"enforc" + 0.008*"children" + 0.008*"feder" + 0.007*"offic" + 0.007*"sexual" + 0.007*"general" + 0.007*"safe" + 0.007*"project" + 0.007*"agreement" + 0.007*"individu" + 0.007*"childhood"')
(1, '0.018*"sexual" + 0.017*"hous" + 0.013*"discrimin" + 0.012*"alleg" + 0.012*"charg" + 0.011*"child" + 0.009*"victim" + 0.009*"feder" + 0.009*"indict" + 0.008*"defend" + 0.008*"complaint" + 0.007*"court" + 0.007*"fair" + 0.007*"crimin" + 0.007*"inform"')
(2, '0.013*"victim" + 0.012*"sentenc" + 0.011*"prosecut" + 0.011*"charg" + 0.011*"said" + 0.011*"feder" + 0.010*"guilti" + 0.010*"defend" + 0.010*"crime" + 0.009*"hate" + 0.008*"prison" + 0.008*"investig" + 0.008*"today" + 0.008*"indict" + 0.008*"year"')


## 2.4 Add topics back to main data and explore correlation between manual labels and our estimated topics (10 points)

A. Extract the document-level topic probabilities. Within `get_document_topics`, use the argument `minimum_probability` = 0 to make sure all 3 topic probabilities are returned. Write an assert statement to make sure the length of the list is equal to the number of rows in the `doj_subset_wscores` dataframe

B. Add the topic probabilities to the `doj_subset_wscores` dataframe as columns and create a column, `top_topic`, that reflects each document to its highest-probability topic (eg topic 1, 2, or 3)

C. For each of the manual labels in `topics_clean` (Hate Crime, Civil Rights, Project Safe Childhood), print the breakdown of the % of documents with each top topic (so, for instance, Hate Crime has 246 documents-- if 123 of those documents are coded to topic_1, that would be 50%; and so on). 
**Hint**:    
pd.crosstab and normalize may be helpful: https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.crosstab.html

D. Using a couple press releases as examples, write a 1-2 sentence interpretation of why some of the manual topics map on more cleanly to an estimated topic than other manual topic(s)

**Resources**:

- End of this code (`Additional summaries of topics and documents`) contains example of how to use `get_document_topics` and other steps to add topic probabilities back to data: 
https://github.com/rebeccajohnson88/PPOL564_slides_activities/blob/main/activities/fall_22/solutions/09_textasdata_partII_topicmodeling_solution.ipynb
- If you're getting errors, use shape, len, and other commands to check the dimensionality of things at different steps since documents may be dropped if they contain no words post-processing 

In [42]:
## your code here to get doc-level topic probabilities 
topic_probs_bydoc =[ldamod_proc.get_document_topics(item, minimum_probability = 0 ) for item in corpus_fromdict_proc]
 
len(topic_probs_bydoc) == len(doj_subset_wscores)

True

In [43]:
## your code here to add those topic probabilities to the dataframe
one_list_tup = topic_probs_bydoc[0]

## create a long for dataframe by flattening the list
topic_probs_bydoc_long = pd.DataFrame([t for lst in topic_probs_bydoc for t in lst],
                                     columns = ['topic', 'probability'])

## add id var- we're repeating each id in the original data k times
## for the number of topics
topic_probs_bydoc_long['doc_id'] = list(np.concatenate([[one_id] * 
                                    n_topics for one_id in doj_subset_wscores.id]).flat)

## pivot to wide format
topic_probs_bydoc_wide = pd.pivot_table(topic_probs_bydoc_long, index = ['doc_id'],
                        columns = ['topic']).reset_index().reset_index(drop = True)
topic_probs_bydoc_wide.columns = ['doc_id'] + ["topic_" + str(i) for i in np.arange(0, n_topics)]

## merge with original data using doc id
topic_wmeta = pd.merge(topic_probs_bydoc_wide,
                      doj_subset_wscores,
                      left_on = 'doc_id',
                      right_on = 'id')

## create indicator for listing's top topic
topic_wmeta['toptopic'] = topic_wmeta[[col for col in topic_wmeta.columns if 
                                    "topic_" in col]].idxmax(axis=1)

topic_wmeta.groupby('toptopic').agg({'compound': np.mean})


Unnamed: 0_level_0,compound
toptopic,Unnamed: 1_level_1
topic_0,0.004039
topic_1,-0.603468
topic_2,-0.929828


In [44]:
topic_wmeta.columns

Index(['doc_id', 'topic_0', 'topic_1', 'topic_2', 'id', 'title', 'contents',
       'date', 'topics_clean', 'components_clean', 'neg', 'neu', 'pos',
       'compound', 'processed_text', 'toptopic'],
      dtype='object')

In [45]:
## your code here to summarize the topic proportions for each of the topics_clean 
# print the breakdown of the % of documents with each top topic (so, for instance, Hate Crime has 246 documents-- if 123 of those documents are coded to topic_1, that would be 50%; and so on)

pd.crosstab(topic_wmeta.topics_clean, topic_wmeta.toptopic, normalize='index')

toptopic,topic_0,topic_1,topic_2
topics_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Civil Rights,0.538206,0.13289,0.328904
Hate Crimes,0.0,0.00813,0.99187
Project Safe Childhood,0.8125,0.1875,0.0


# 3. Extend the analysis from unigrams to bigrams (9 points)

In the previous question, you found top words via a unigram representation of the text. Now, we want to see how those top words change with bigrams (pairs of words)

A. Using the `doj_subset_wscores` data and the `processed_text` column (so the words after stemming/other preprocessing), create a column in the data called `processed_text_bigrams` that combines each consecutive pairs of word into a bigram separated by an underscore. Eg:

"depart reach settlem" would become "depart_reach reach_settlem"

Do this by writing a function `create_bigram_onedoc` that takes in a single `processed_text` string and returns a string with its bigrams structured similarly to above example
 
**Hint**: there are many ways to solve but `zip` may be helpful: https://stackoverflow.com/questions/21303224/iterate-over-all-pairs-of-consecutive-items-in-a-list

B. Print the `id`, `processed_text`, and `processed_text_bigram` columns for press release with id = 16-217

In [46]:
doj_subset_wscores.columns

Index(['id', 'title', 'contents', 'date', 'topics_clean', 'components_clean',
       'neg', 'neu', 'pos', 'compound', 'processed_text'],
      dtype='object')

In [47]:
## your code here 
def create_bigram_onedoc(text): 
    t_list = text.split(" ")
    return " ".join([a+'_'+b for a, b in zip(t_list, t_list[1:])])

# create_bigram_onedoc(doj_subset_wscores.processed_text[1])
doj_subset_wscores["processed_text_bigrams"] = doj_subset_wscores.processed_text.apply(create_bigram_onedoc)
#B
doj_subset_wscores[doj_subset_wscores.id == "16-217"][["id", "processed_text", "processed_text_bigrams"]]

Unnamed: 0,id,processed_text,processed_text_bigrams
688,16-217,reach comprehens settlement agreement citi miami miami polic resolv shoot offic announc princip deputi general vanita gupta head wifredo ferrer southern florida settlement approv miami citi commiss today effect agreement sign parti resolv claim stem shoot offic conduct violent crime control enforc find issu juli identifi pattern practic excess forc shoot violat fourth amend constitut citi complianc settlement monitor independ review former tampa florida polic chief jane castor settlement agreement citi implement comprehens reform ensur constitut polic support public trust settlement agreement design minim shoot effect quick investig shoot occur measur includ settlement repres renew commit citi miami chief rodolfo llane provid constitut polic miami resid protect public safeti sustain reform said princip deputi general gupta agreement help strengthen relationship communiti serv improv account offic fire weapon unlaw provid communiti particip enforc today agreement result joint effort citi miami ensur miami polic continu effort make communiti safe protect sacr constitut citizen said ferrer oversight communic agreement seek make perman posit chang former chief orosa chief llane made applaud citi commiss settlement agreement build upon import reform implement citi sinc issu find includ conduct attorney staff special litig section southern florida,reach_comprehens comprehens_settlement settlement_agreement agreement_citi citi_miami miami_miami miami_polic polic_resolv resolv_shoot shoot_offic offic_announc announc_princip princip_deputi deputi_general general_vanita vanita_gupta gupta_head head_wifredo wifredo_ferrer ferrer_southern southern_florida florida_settlement settlement_approv approv_miami miami_citi citi_commiss commiss_today today_effect effect_agreement agreement_sign sign_parti parti_resolv resolv_claim claim_stem stem_shoot shoot_offic offic_conduct conduct_violent violent_crime crime_control control_enforc enforc_find find_issu issu_juli juli_identifi identifi_pattern pattern_practic practic_excess excess_forc forc_shoot shoot_violat violat_fourth fourth_amend amend_constitut constitut_citi citi_complianc complianc_settlement settlement_monitor monitor_independ independ_review review_former former_tampa tampa_florida florida_polic polic_chief chief_jane jane_castor castor_settlement settlement_agreement agreement_citi citi_implement implement_comprehens comprehens_reform reform_ensur ensur_constitut constitut_polic polic_support support_public public_trust trust_settlement settlement_agreement agreement_design design_minim minim_shoot shoot_effect effect_quick quick_investig investig_shoot shoot_occur occur_measur measur_includ includ_settlement settlement_repres repres_renew renew_commit commit_citi citi_miami miami_chief chief_rodolfo rodolfo_llane llane_provid provid_constitut constitut_polic polic_miami miami_resid resid_protect protect_public public_safeti safeti_sustain sustain_reform reform_said said_princip princip_deputi deputi_general general_gupta gupta_agreement agreement_help help_strengthen strengthen_relationship relationship_communiti communiti_serv serv_improv improv_account account_offic offic_fire fire_weapon weapon_unlaw unlaw_provid provid_communiti communiti_particip particip_enforc enforc_today today_agreement agreement_result result_joint joint_effort effort_citi citi_miami miami_ensur ensur_miami miami_polic polic_continu continu_effort effort_make make_communiti communiti_safe safe_protect protect_sacr sacr_constitut constitut_citizen citizen_said said_ferrer ferrer_oversight oversight_communic communic_agreement agreement_seek seek_make make_perman perman_posit posit_chang chang_former former_chief chief_orosa orosa_chief chief_llane llane_made made_applaud applaud_citi citi_commiss commiss_settlement settlement_agreement agreement_build build_upon upon_import import_reform reform_implement implement_citi citi_sinc sinc_issu issu_find find_includ includ_conduct conduct_attorney attorney_staff staff_special special_litig litig_section section_southern southern_florida


C. Use the create_dtm function and the `processed_text_bigrams` column to create a document-term matrix (`dtm_bigram`) with these bigrams. Keep the following three columns in the data: `id`, `topics_clean`, and `compound` 

D. Print the 
 (1) dimensions of the `dtm` matrix from question 2.2  and 
 (2) the dimensions of the `dtm_bigram` matrix. Comment on why the bigram matrix has more dimensions than the unigram matrix 

E. Find and print the 10 most prevelant bigrams for each of the three topics_clean using the `get_topwords` function from 2.2


In [48]:
## your code here
dtm_bigram = create_dtm(list_of_strings = doj_subset_wscores.processed_text_bigrams, 
                     metadata=doj_subset_wscores[[ 'id','topics_clean', 'compound' ]])
print("dimension of dtm_doj:", dtm_doj.shape)
print("\ndimension of dtm_bigram:", dtm_bigram.shape)

dimension of dtm_doj: (717, 6758)

dimension of dtm_bigram: (717, 71331)


In [49]:
get_topwords(dtm_bigram, 10)

plead_guilti         551
safe_childhood       472
project_safe         472
child_pornographi    446
hate_crime           378
year_prison          315
unit_state           289
child_exploit        280
fair_hous            253
special_agent        241
dtype: int64

# 4. Optional extra credit 1 (1 point)

You notice that the pharmaceutical kickbacks press release we analyzed in question 1 was for an indictment, and that in the original data, there's not a clear label for whether a press release outlines an indictment (charging someone with a crime), a conviction (convicting them after that charge either via a settlement or trial), or a sentencing (how many years of prison or supervised release a defendant is sentenced to after their conviction).

You want to see if you can identify pairs of press releases where one press release is from one stage (e.g., indictment) and another is from a different stage (e.g., a sentencing).

You decide that one way to approach is to find the pairwise string similarity between each of the processed press releases in `doj_subset`. There are many ways to do this, so Google for some approaches, focusing on ones that work well for entire documents rather than small strings.

Find the top two pairs (so four press releases total)-- do they seem like different stages of the same crime or just press releases covering similar crimes?

In [50]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
doj_similarity = doj_subset_wscores[[ 'id', 'processed_text', 'topics_clean']]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(doj_similarity.processed_text)
arr = X.toarray()
sim = cosine_similarity(arr)
sim = np.round(sim, 5)
sim[sim >= 1] = -1

sim_matrix = pd.DataFrame(sim, columns =doj_subset_wscores.id, index = doj_subset_wscores.id) 
sim_matrix.idxmax()

id
14-248      14-429
13-312     12-1334
16-718     16-1346
11-626      11-648
11-1531    11-1683
            ...   
16-539      16-379
15-667     15-1386
16-294      16-471
16-471      16-294
15-1559     17-057
Length: 717, dtype: object

In [52]:
sim_matrix.max().sort_values(ascending=False)[:5]

id
17-546     0.98046
17-231     0.98046
18-705     0.97681
17-1089    0.97681
17-260     0.97298
dtype: float64

In [53]:
doj_subset_wscores[doj_subset_wscores.id == '18-705'].contents

179    A man who solicited sexually explicit photos and videos from a minor and distributed them on the Internet was sentenced to 200 months in prison followed by 10 years of supervised release after pleading guilty to coercion and enticement of a minor to engage in sexual activity.     Acting Assistant Attorney General John P. Cronan of the Justice Department’s Criminal Division and U.S. Attorney David C. Weiss of the District of Delaware made the announcement after sentencing by U.S. District Judge Leonard P. Stark of the District of Delaware.   Justin R. Gulisano aka Emma Alexander Gulisano, 26, of Newark, New York, was charged on March 10, 2016, and pleaded guilty on Sept. 28, 2017.    According to admissions made in connection with the plea agreement, Gulisano met the victim online when the victim was 15 years old. Gulisano began requesting and received sexually explicit images and videos from the victim. Gulisano posted the sexually explicit videos of the victim on a pornography 

In [54]:
doj_subset_wscores[doj_subset_wscores.id.isin(['17-546', '17-231'])].contents

533    A Church Hill, Maryland, resident was sentenced today to 20 years in prison to be followed by a lifetime term of supervised release for enticement of a minor to engage in sexual activity and attempting to transfer obscene materials to a minor, announced Acting Assistant Attorney General Kenneth A. Blanco of the Justice Department’s Criminal Division and Acting U.S. Attorney Benjamin G. Greenberg of the Southern District of Florida.    Lee Robert Moore, 38, pleaded guilty March 1, 2017, before U.S. District Judge Daniel T. K. Hurley of the Southern District of Florida. Moore was employed by the U.S. Secret Service-Uniformed Division and was assigned to the White House at the time of his arrest on Nov. 9, 2015, and has remained in custody since that time. Moore has since been terminated from his Secret Service position.   According to admissions made in connection with his plea, Moore maintained a profile on the social media application “Meet24,” which provides a mobile-based plat

In [55]:
print(doj_subset_wscores[doj_subset_wscores.id == '18-705'].contents)
print(doj_subset_wscores[doj_subset_wscores.id == '17-1089'].contents)


179    A man who solicited sexually explicit photos and videos from a minor and distributed them on the Internet was sentenced to 200 months in prison followed by 10 years of supervised release after pleading guilty to coercion and enticement of a minor to engage in sexual activity.     Acting Assistant Attorney General John P. Cronan of the Justice Department’s Criminal Division and U.S. Attorney David C. Weiss of the District of Delaware made the announcement after sentencing by U.S. District Judge Leonard P. Stark of the District of Delaware.   Justin R. Gulisano aka Emma Alexander Gulisano, 26, of Newark, New York, was charged on March 10, 2016, and pleaded guilty on Sept. 28, 2017.    According to admissions made in connection with the plea agreement, Gulisano met the victim online when the victim was 15 years old. Gulisano began requesting and received sexually explicit images and videos from the victim. Gulisano posted the sexually explicit videos of the victim on a pornography 

I think both of the top matched pair are both belong to different phases of the same crime

# 5. Optional extra credit 2 (3 points)

Review the scraping code here: https://github.com/jbencina/dojreleases/blob/master/scraper.py
    
Write code to scrape press releases from more recent years from the DOJ website than the years available in the combined.json and produce a visualization of how top words or themes in the press releases changed across the Trump administration (2016-December 2020) versus Biden administration (January 2021 onwards)- you can do this in a .py file that you submit separately and just read in the data produced by that scraping

In [140]:
## your code here
#visual
# !pip install pyLDAvis
# import pyLDAvis.gensim as gensimvis
import pyLDAvis.gensim_models as gensimvis 
import pyLDAvis
import json


In [None]:
with open("/content/drive/MyDrive/DS_I/final.json") as fp:
    json_data = json.loads(''.join(line.strip() for line in fp))

# recent_doj = pd.read_json('final.json')
recent_doj = pd.read_json(json_data)

In [133]:
recent_doj["clean_date"] = recent_doj.DATE.str.split('T').str.get(0)
recent_doj["clean_date"] = pd.to_datetime(recent_doj.clean_date, format='%Y-%m-%d').dt.date
recent_doj["clean_date"].head(3)

0    2022-11-07
1    2022-11-07
2    2022-11-07
Name: clean_date, dtype: object

In [137]:
# recent_doj.DATE = recent_doj.DATE.dt.date
# trump_doj = recent_doj.copy()
trump_doj = recent_doj[(recent_doj.clean_date >= datetime.date(2016,1,1)) & (recent_doj.clean_date <=datetime.date(2020,12,31))]  
biden_doj = recent_doj[recent_doj.clean_date >= datetime.date(2021,1,1)]  

In [138]:
print(len(trump_doj), len(biden_doj))

6959 2532


In [139]:
trump_doj['processed_text'] = trump_doj.CONTENT.apply(contents_preprocess)
biden_doj['processed_text'] = biden_doj.CONTENT.apply(contents_preprocess)


In [150]:
def visualize_topic_model(tokenized_text,lda, n_topic=5):
    ## preprocess and estimate topicmod
    ### create dictionary
    text_proc_dict = corpora.Dictionary(tokenized_text)
    ### filter dictionary- using 2% as bounds
    text_proc_dict.filter_extremes(no_below = round(lda.shape[0]*0.02),
                                no_above = round(lda.shape[0]*0.98))

    ### create corpus from dictionary
    corpus_fromdict_proc = [text_proc_dict.doc2bow(one_text) 
                        for one_text in tokenized_text]
    ### estimate model
    n_topics = 5
    ldamod_proc = gensim.models.ldamodel.LdaModel(corpus_fromdict_proc, 
                                            num_topics = n_topics, id2word=text_proc_dict, 
                                            passes=6, alpha = 'auto',
                                            per_word_topics = True, random_state = 199082)

    ### print topics and words
    topics = ldamod_proc.print_topics(num_words = 15)
    for topic in topics:
        print(topic)
    return ldamod_proc, corpus_fromdict_proc, text_proc_dict



## topic mode of **Trump**

In [153]:
trump_lda = trump_doj[trump_doj.processed_text != ""].copy()

trump_tokenized_text = [wordpunct_tokenize(one_text) 
                for one_text in 
               trump_lda.processed_text]

In [None]:
trump_ldamod, trump_corpus_fromdic, trump_text_dict = visualize_topic_model(trump_tokenized_text,trump_lda, n_topic=5)

In [156]:
pyLDAvis.enable_notebook()
lda_display_proc = gensimvis.prepare(trump_ldamod, trump_corpus_fromdic, trump_text_dict)
pyLDAvis.display(lda_display_proc)

## topic mode of **Biden**

In [148]:
biden_lda = biden_doj[biden_doj.processed_text != ""].copy(deep=True)

biden_tokenized_text = [wordpunct_tokenize(one_text) 
                for one_text in 
               biden_lda.processed_text]


In [None]:
biden_ldamod, biden_corpus_fromdic, biden_text_dict = visualize_topic_model(biden_tokenized_text,biden_lda, n_topic=5)

In [152]:
pyLDAvis.enable_notebook()
lda_display_proc = gensimvis.prepare(biden_ldamod, biden_corpus_fromdic, biden_text_dict)
pyLDAvis.display(lda_display_proc)