# Dropbox dataframe preprocessing + preliminary overview

### Pre prep

In [14]:
# import/load packages

## install
%pip install pandas
%pip install chardet
%pip install cchardet


## import
import os
import pandas as pd
import chardet
import cchardet


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Show the current working directory of the kernel
os.getcwd()

'/work/Bachelor'

### Check that dataframe structures are **proper**

#### dropbox_text

In [6]:
# Read the article texts into a DataFrame and print it
dropbox_text = pd.read_csv(
    "/work/Bachelor/sample_data/dropbox/dropbox_text.csv",
)
print(dropbox_text)

         textID                                               text
0      18207159   It appears that right now is a pretty good ti...
1      18207168   A letter to parents of expat children <p> Par...
2      18207169   Chafee Steps Up Criticism of Raimondo 's Econ...
3      18207182   Personal hero and why : My dad , because he i...
4      18207185   Students named in the complaints that disqual...
...         ...                                                ...
13552  18745485   Following the onslaught of nearly universal d...
13553  18745486   1954 : American film star Marilyn Monroe ( 19...
13554  18745489   Artists file lawsuit against city of Atlanta ...
13555  18745490   MONTESANO , Wash . -- James Walker , 31 , of ...
13556  18745491   Out to the track for our KSFY/Taco John 's At...

[13557 rows x 2 columns]


#### dropbox_sources

In [None]:
# Check the encoding of the sources CSV, since it might not be UTF-8

# Location of the CSV file to inspect
file_path = "/work/Bachelor/sample_data/dropbox/dropbox_sources.csv"

# Read the raw bytes and let cchardet guess the encoding
with open(file_path, mode='rb') as f:
    result = cchardet.detect(f.read())

# Pull the guess and its confidence out of the detection result
encoding, confidence = result['encoding'], result['confidence']

print(f"Detected encoding: {encoding} (Confidence: {confidence})")

In [16]:
# Read the source metadata; 'ISO-8859-1' is the encoding that was detected for this file
dropbox_sources = pd.read_csv(
    "/work/Bachelor/sample_data/dropbox/dropbox_sources.csv",
    encoding='ISO-8859-1',
)
print(dropbox_sources)

          textID  X.words      date country               source  \
0       18207159     2321  17-05-01      US       Glide Magazine   
1       18207168     2362  17-05-01      US      Huffington Post   
2       18207169      343  17-05-01      US          GoLocalProv   
3       18207170      257  17-05-01      US             Engadget   
4       18207172      180  17-05-01      US     Northwest Herald   
...          ...      ...       ...     ...                  ...   
166834  18740662      304  17-05-31      JM  Jamaica Star Online   
166835  18742209      315  17-05-31      JM      Jamaica Gleaner   
166836  18742210      127  17-05-31      JM  Jamaica Star Online   
166837  18742211      288  17-05-31      JM  Jamaica Star Online   
166838  18741456      175  17-05-31      JM      Jamaica Gleaner   

                                                      url  \
0       https://glidemagazine.com/183826/griffin-golds...   
1       http://www.huffingtonpost.com/entry/a-letter-t...   


### Merge dataframes

In [17]:
# Join the texts with their source metadata on the shared 'textID' key.
# how='inner' (the default) keeps only the textIDs that occur in both frames.
dropbox_text_sources_df = dropbox_text.merge(dropbox_sources, how='inner', on='textID')

# Show the merged result
print(dropbox_text_sources_df)

        textID                                               text  X.words  \
0     18207159   It appears that right now is a pretty good ti...     2321   
1     18207168   A letter to parents of expat children <p> Par...     2362   
2     18207169   Chafee Steps Up Criticism of Raimondo 's Econ...      343   
3     18207182   Personal hero and why : My dad , because he i...      509   
4     18207185   Students named in the complaints that disqual...     1103   
...        ...                                                ...      ...   
7169  18739345   Tonight <p> Family members , of a man killed ...     1135   
7170  18739347   NYPD Sergeant Pleads Not Guilty to Murder of ...     1330   
7171  18739348   Georgia Peach Prices Are Rising After the Sta...      206   
7172  18739349   A moving video of a group of Texas state troo...      792   
7173  18739351   The arrival of the first trailer for Star Tre...      506   

          date country                         source  \
0     

In [30]:
# Restrict the merged data to articles whose source is CNN or Fox News
CNN_articles = dropbox_text_sources_df.loc[dropbox_text_sources_df['source'].eq('CNN')]
Fox_articles = dropbox_text_sources_df.loc[dropbox_text_sources_df['source'].eq('Fox News')]

# Stack the two subsets into one frame (CNN rows first, then Fox News rows)
CNN_Fox_articles = pd.concat([CNN_articles, Fox_articles], axis=0)

# Show the combined frame, then its row count as the cell result
print(CNN_Fox_articles)
len(CNN_Fox_articles) # 47 articles

        textID                                               text  X.words  \
525   18247418   JUST WATCHED <h> Clinton : Misogyny played a ...     1124   
822   18269674   Trump said he is confident the bill will pass...     1214   
1129  18282852   Africa wealth report : Millionaire island Mau...      694   
1176  18284510   JUST WATCHED <h> House passes bill to replace...       86   
1203  18285388   JUST WATCHED <h> UK PM calls for general elec...      950   
1238  18286902   The Cook Political Report , a non-partisan ca...      378   
2015  18325788   The rare , important world of disease reversa...     1656   
2036  18326650   JUST WATCHED <h> Le Pen 's young voters : ' G...      791   
2082  18348909   JUST WATCHED <h> Was Trump 's Yates tweet wit...      750   
2132  18350694   Story highlights <p> Seoul ( CNN ) South Kore...      942   
2287  18356434   Story highlights <p> The emails were forwarde...      650   
2527  18377073   Even by the often bewildering standards of th..

47

In [35]:
# Count the articles per source in the CNN/Fox News subset
source_counts = CNN_Fox_articles.loc[:, 'source'].value_counts()
print(source_counts)

source
CNN         25
Fox News    22
Name: count, dtype: int64


### How to only include articles related to abortion

In [31]:
# Keep the rows whose 'text' contains 'abortion'
# (case-insensitive; rows with missing text are treated as non-matches)
abortion_articles = CNN_Fox_articles.loc[
    CNN_Fox_articles['text'].str.contains('abortion', case=False, na=False)
]
print(abortion_articles)
len(abortion_articles) # there's 1 here

        textID                                               text  X.words  \
2934  18398688   Honoring the soundtrack of my mother 's life ...     2491   

          date country source  \
2934  17-05-12      US    CNN   

                                                    url  \
2934  http://www.cnn.com/2017/05/12/health/soundtrac...   

                                            title  
2934  Honoring the soundtrack of my mother's life  


1

In [32]:
# Expand the search with more search words.
# The article text is tokenized with spaces around punctuation (e.g. "mother 's",
# "Wash ."), so "v." and "'s" are matched with optional whitespace (\s*) in front of
# the punctuation; a strictly literal "Roe v\. Wade" would miss "Roe v . Wade".
search_terms = (
    r"abortion"
    r"|reproductive rights"
    r"|reproductive health care"
    r"|pro-choice"
    r"|pro-life"
    r"|Roe v\s*\. Wade"
    r"|Dobbs v\s*\. Jackson Women\s*'s Health Organization"
)

# Filter articles that contain any of the search terms
# (case-insensitive; rows with missing text are treated as non-matches)
abortion_articles = CNN_Fox_articles[
    CNN_Fox_articles['text'].str.contains(search_terms, case=False, na=False, regex=True)
]

# Display the filtered articles
print(abortion_articles)
len(abortion_articles) # 2 in the recorded run (1 when searching for 'abortion' alone); re-run to refresh with the whitespace-tolerant patterns

        textID                                               text  X.words  \
822   18269674   Trump said he is confident the bill will pass...     1214   
2934  18398688   Honoring the soundtrack of my mother 's life ...     2491   

          date country source  \
822   17-05-04      US    CNN   
2934  17-05-12      US    CNN   

                                                    url  \
822   http://www.cnn.com/2017/05/04/politics/health-...   
2934  http://www.cnn.com/2017/05/12/health/soundtrac...   

                                                  title  
822   House Republicans pass bill to replace and rep...  
2934        Honoring the soundtrack of my mother's life  


2

In [36]:
# Count the abortion-related articles per source
source_counts = abortion_articles.loc[:, 'source'].value_counts()
print(source_counts)

source
CNN    2
Name: count, dtype: int64


So 2 abortion articles from CNN, and no abortion articles from Fox News.

### How to see personal stories... tbd

### Not enough data in CNN_Fox_articles, so I'll use dropbox_text_sources_df for NLP

In [42]:
# Search for abortion-related articles across all sources.
# The article text is tokenized with spaces around punctuation (e.g. "mother 's",
# "Wash ."), so "v." and "'s" are matched with optional whitespace (\s*) in front of
# the punctuation; a strictly literal "Roe v\. Wade" would miss "Roe v . Wade".
search_terms = (
    r"abortion"
    r"|reproductive rights"
    r"|reproductive health care"
    r"|pro-choice"
    r"|pro-life"
    r"|Roe v\s*\. Wade"
    r"|Dobbs v\s*\. Jackson Women\s*'s Health Organization"
)

# Filter articles that contain any of the search terms
# (case-insensitive; rows with missing text are treated as non-matches).
# TODO: switch back to the CNN_Fox_articles df. The full merged df is used here
# because there are too few abortion articles among the CNN_Fox_articles.
abortion_articles_fake = dropbox_text_sources_df[
    dropbox_text_sources_df['text'].str.contains(search_terms, case=False, na=False, regex=True)
]

# Display the filtered articles
print(abortion_articles_fake)
len(abortion_articles_fake) # 95 in the recorded run (87 when searching for 'abortion' alone); re-run to refresh with the whitespace-tolerant patterns

        textID                                               text  X.words  \
56    18209165   The Future of Europe Hinges on a Face-Off in ...     6314   
63    18209808   INDIANAPOLIS -- Depending on who you ask , th...      659   
175   18224263   Sister Jerome leaves legacy of love at Centra...      680   
199   18225088   Share Article <p> We want to take the time du...      585   
231   18226773   LOS ANGELES -- Baby Groot was born at the end...      439   
...        ...                                                ...      ...   
6949  18713264   Christian Villagran Morales strolled a path i...     1264   
7055  18724477   On Monday , the tea party-aligned sophomore l...     1103   
7077  18725264   After A Day of Outrage , Kathy Griffin Apolog...      662   
7133  18737702   Two weeks into May , on a record-breaking 91-...     4253   
7154  18738651   When President Trump posted a misspelled  unp...      647   

          date country                  source  \
56    17-05-0

95

In [43]:
# Persist abortion_articles_fake as a CSV file for the NLP analysis

# Destination of the exported file
output_file_path = "/work/Bachelor/sample_data/dropbox/abortion_articles_fake.csv"

# Write the filtered DataFrame without its index column, UTF-8 encoded
abortion_articles_fake.to_csv(path_or_buf=output_file_path, encoding='utf-8', index=False)

# Optional sanity check: read the file back and preview its first rows
loaded_abortion_articles = pd.read_csv(output_file_path, encoding='utf-8')
print(loaded_abortion_articles.head())


     textID                                               text  X.words  \
0  18209165   The Future of Europe Hinges on a Face-Off in ...     6314   
1  18209808   INDIANAPOLIS -- Depending on who you ask , th...      659   
2  18224263   Sister Jerome leaves legacy of love at Centra...      680   
3  18225088   Share Article <p> We want to take the time du...      585   
4  18226773   LOS ANGELES -- Baby Groot was born at the end...      439   

       date country                  source  \
0  17-05-01      US          The New Yorker   
1  17-05-01      US            nwitimes.com   
2  17-05-02      US        Portland Tribune   
3  17-05-02      US  PR Web (press release)   
4  17-05-02      US    Chicago Daily Herald   

                                                 url  \
0  http://www.newyorker.com/magazine/2017/05/08/t...   
1  http://www.nwitimes.com/news/local/govt-and-po...   
2  http://portlandtribune.com/pt/12-sports/357010...   
3  http://www.prweb.com/releases/2017/05/p