In [6]:
!pip install pyspark


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
import pyspark
import pandas as pd
import tensorflow as tf
from google.colab import files


from pyspark.sql import SparkSession

from pyspark import SparkContext

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
spark_context_sc = SparkContext(master='local[3]')
spark_session = SparkSession.builder.appName("Categorizer").getOrCreate()

In [11]:
df = spark_session.read.json("CategorizedNews.json")
df.head()

Row(authors='Melissa Jeltsen', category='CRIME', date='2018-05-26', headline='There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV', link='https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89', short_description='She left her husband. He killed their children. Just another day in America.')

In [12]:
df.count()
df = df.limit(10000)


In [13]:
df.groupby('category').count().sort('count', ascending=False).show()


+-------------+-----+
|     category|count|
+-------------+-----+
|     POLITICS| 3604|
|ENTERTAINMENT| 1906|
|   WORLD NEWS|  683|
| QUEER VOICES|  512|
|       COMEDY|  495|
| BLACK VOICES|  443|
|       SPORTS|  382|
|        MEDIA|  329|
|        WOMEN|  283|
|   WEIRD NEWS|  242|
|        CRIME|  201|
|     BUSINESS|  112|
|LATINO VOICES|  105|
|       IMPACT|   86|
|       TRAVEL|   76|
|     RELIGION|   76|
|        STYLE|   70|
|      PARENTS|   66|
|        GREEN|   66|
|         TECH|   65|
+-------------+-----+
only showing top 20 rows



In [14]:

from pyspark.sql import functions as fun
df = df.withColumn('description', fun.concat(fun.col('headline'),fun.lit(' '), fun.col('short_description')))

In [15]:

df = df.drop("headline", "link", "authors", "short_description", "date")
df.show()

+-------------+--------------------+
|     category|         description|
+-------------+--------------------+
|        CRIME|There Were 2 Mass...|
|ENTERTAINMENT|Will Smith Joins ...|
|ENTERTAINMENT|Hugh Grant Marrie...|
|ENTERTAINMENT|Jim Carrey Blasts...|
|ENTERTAINMENT|Julianna Margulie...|
|ENTERTAINMENT|Morgan Freeman 'D...|
|ENTERTAINMENT|Donald Trump Is L...|
|ENTERTAINMENT|What To Watch On ...|
|ENTERTAINMENT|Mike Myers Reveal...|
|ENTERTAINMENT|What To Watch On ...|
|ENTERTAINMENT|Justin Timberlake...|
|   WORLD NEWS|South Korean Pres...|
|       IMPACT|With Its Way Of L...|
|     POLITICS|Trump's Crackdown...|
|     POLITICS|'Trump's Son Shou...|
|     POLITICS|Edward Snowden: T...|
|     POLITICS|Booyah: Obama Pho...|
|     POLITICS|Ireland Votes To ...|
|     POLITICS|Ryan Zinke Looks ...|
|     POLITICS|Trump's Scottish ...|
+-------------+--------------------+
only showing top 20 rows



In [16]:

from pyspark.ml.feature import IDF
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover



In [17]:
import pyspark.sql.functions as check

df = df.where(check.col('description').isNotNull())

In [18]:
token_create = Tokenizer(inputCol='description',outputCol='tokens')
stopwordRemove = StopWordsRemover(inputCol='tokens',outputCol='removed_stopwords')
vector_create = CountVectorizer(inputCol='removed_stopwords',outputCol='extracted_features')
idf = IDF(inputCol='extracted_features',outputCol='vectorizedFeatures')

In [19]:
label_encoding = StringIndexer(inputCol='category',outputCol='category_number').fit(df)
label_encoding.transform(df).show(10)

+-------------+--------------------+---------------+
|     category|         description|category_number|
+-------------+--------------------+---------------+
|        CRIME|There Were 2 Mass...|           10.0|
|ENTERTAINMENT|Will Smith Joins ...|            1.0|
|ENTERTAINMENT|Hugh Grant Marrie...|            1.0|
|ENTERTAINMENT|Jim Carrey Blasts...|            1.0|
|ENTERTAINMENT|Julianna Margulie...|            1.0|
|ENTERTAINMENT|Morgan Freeman 'D...|            1.0|
|ENTERTAINMENT|Donald Trump Is L...|            1.0|
|ENTERTAINMENT|What To Watch On ...|            1.0|
|ENTERTAINMENT|Mike Myers Reveal...|            1.0|
|ENTERTAINMENT|What To Watch On ...|            1.0|
+-------------+--------------------+---------------+
only showing top 10 rows



In [20]:

# Assume label_encoding is your StringIndexer object
transformed_df = label_encoding.transform(df)

# Get unique categories
unique_categories = transformed_df.select('category_number', 'category').distinct().collect()
#name = transformed_df.select('category').distinct().collect()

In [21]:
# Print the unique categories
for row in unique_categories:
    print("Category Number", row.category_number, " Category Name", row.category)


Category Number 10.0  Category Name CRIME
Category Number 1.0  Category Name ENTERTAINMENT
Category Number 2.0  Category Name WORLD NEWS
Category Number 13.0  Category Name IMPACT
Category Number 0.0  Category Name POLITICS
Category Number 9.0  Category Name WEIRD NEWS
Category Number 5.0  Category Name BLACK VOICES
Category Number 8.0  Category Name WOMEN
Category Number 4.0  Category Name COMEDY
Category Number 3.0  Category Name QUEER VOICES
Category Number 6.0  Category Name SPORTS
Category Number 11.0  Category Name BUSINESS
Category Number 15.0  Category Name TRAVEL
Category Number 7.0  Category Name MEDIA
Category Number 19.0  Category Name TECH
Category Number 14.0  Category Name RELIGION
Category Number 21.0  Category Name SCIENCE
Category Number 12.0  Category Name LATINO VOICES
Category Number 22.0  Category Name EDUCATION
Category Number 25.0  Category Name COLLEGE
Category Number 18.0  Category Name PARENTS
Category Number 24.0  Category Name ARTS & CULTURE
Category Number

In [22]:
label_encoding.labels


['POLITICS',
 'ENTERTAINMENT',
 'WORLD NEWS',
 'QUEER VOICES',
 'COMEDY',
 'BLACK VOICES',
 'SPORTS',
 'MEDIA',
 'WOMEN',
 'WEIRD NEWS',
 'CRIME',
 'BUSINESS',
 'LATINO VOICES',
 'IMPACT',
 'RELIGION',
 'TRAVEL',
 'STYLE',
 'GREEN',
 'PARENTS',
 'TECH',
 'HEALTHY LIVING',
 'SCIENCE',
 'EDUCATION',
 'TASTE',
 'ARTS & CULTURE',
 'COLLEGE']

In [23]:
label_dict = {
    0.0: 'POLITICS',
    1.0: 'ENTERTAINMENT',
    2.0: 'WORLD NEWS',
    3.0: 'QUEER VOICES',
    4.0: 'COMEDY',
    5.0: 'BLACK VOICES',
    6.0: 'SPORTS',
    13.0: 'MEDIA',
    8.0: 'WOMEN',
    10.0: 'CRIME',
    11.0: 'BUSINESS',
    12.0: 'LATINO VOICES',
    13.0: 'IMPACT',
    14.0: 'RELIGION',
    15.0: 'TRAVEL',
    16.0: 'STYLE',
    18.0: 'PARENTS',
    19.0: 'TECH',
    20.0: 'HEALTHY LIVING',
    22.0: 'EDUCATION',
    23.0: 'TASTE',
    25.0: 'COLLEGE',
    17.0: 'GREEN',
    9.0: 'WEIRD NEWS',
    24.0: 'ARTS $ CULTURE',
    21.0: 'SCIENCE',
    7.0: 'MEDIA'
}


In [24]:

df = label_encoding.transform(df)
df.show(100)

+-------------+--------------------+---------------+
|     category|         description|category_number|
+-------------+--------------------+---------------+
|        CRIME|There Were 2 Mass...|           10.0|
|ENTERTAINMENT|Will Smith Joins ...|            1.0|
|ENTERTAINMENT|Hugh Grant Marrie...|            1.0|
|ENTERTAINMENT|Jim Carrey Blasts...|            1.0|
|ENTERTAINMENT|Julianna Margulie...|            1.0|
|ENTERTAINMENT|Morgan Freeman 'D...|            1.0|
|ENTERTAINMENT|Donald Trump Is L...|            1.0|
|ENTERTAINMENT|What To Watch On ...|            1.0|
|ENTERTAINMENT|Mike Myers Reveal...|            1.0|
|ENTERTAINMENT|What To Watch On ...|            1.0|
|ENTERTAINMENT|Justin Timberlake...|            1.0|
|   WORLD NEWS|South Korean Pres...|            2.0|
|       IMPACT|With Its Way Of L...|           13.0|
|     POLITICS|Trump's Crackdown...|            0.0|
|     POLITICS|'Trump's Son Shou...|            0.0|
|     POLITICS|Edward Snowden: T...|          

In [25]:
from pyspark.sql.functions import rand

df = df.orderBy(rand(seed=21))
train_df = df.limit(int(df.count() * 0.70))
test_df = df.exceptAll(train_df)

In [26]:
from pyspark.ml.classification import LogisticRegression
logistic_reg = LogisticRegression(featuresCol='vectorizedFeatures',labelCol='category_number')


In [27]:
from pyspark.ml import Pipeline
cat_pipeline = Pipeline(stages = [token_create, stopwordRemove, vector_create,idf,logistic_reg])


In [28]:
cat_pipeline.stages


Param(parent='Pipeline_0ba8f7b8e63b', name='stages', doc='a list of pipeline stages')

In [29]:
logistic_model = cat_pipeline.fit(train_df)

In [30]:
logistic_model
preds = logistic_model.transform(test_df)

In [31]:
preds.columns


['category',
 'description',
 'category_number',
 'tokens',
 'removed_stopwords',
 'extracted_features',
 'vectorizedFeatures',
 'rawPrediction',
 'probability',
 'prediction']

In [32]:
preds.select('description','rawPrediction', 'probability','category','category_number','prediction').show(10)


+--------------------+--------------------+--------------------+-------------+---------------+----------+
|         description|       rawPrediction|         probability|     category|category_number|prediction|
+--------------------+--------------------+--------------------+-------------+---------------+----------+
|Trump Claims With...|[8.35310736422872...|[0.36156315470111...|     POLITICS|            0.0|       1.0|
|94 Percent Of Hol...|[-0.5695281958545...|[0.00163890651875...|ENTERTAINMENT|            1.0|       1.0|
|Trump Threatens A...|[11.1176599704702...|[0.11389348926235...|   WORLD NEWS|            2.0|       2.0|
|This Ex-NFL Playe...|[1.93307018838857...|[0.01901316292647...|       SPORTS|            6.0|       1.0|
|Black Parkland St...|[5.80949539195597...|[0.13252920793733...| BLACK VOICES|            5.0|       5.0|
|Laura Ingraham An...|[-5.3165958182414...|[2.12902949132581...|        MEDIA|            7.0|       7.0|
|A Generation Of H...|[8.87330457160014...|[0.

In [33]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='category_number')
evaluator.evaluate(preds)*100


59.93828170582442

In [34]:
from pyspark.mllib.evaluation import MulticlassMetrics
logistic_reg_metric = MulticlassMetrics(preds['category_number','prediction'].rdd)

In [35]:
print("Accuracy ", logistic_reg_metric.accuracy)
print("precision ", logistic_reg_metric.precision(1.0))
print("f1Score ", logistic_reg_metric.fMeasure(1.0))
print("recall ", logistic_reg_metric.recall(1.0))


Accuracy  0.628
precision  0.7455516014234875
f1Score  0.6736334405144694
recall  0.6143695014662757


In [3]:
file_path = "/content/drive/MyDrive/dataset.csv"

In [4]:
import pandas as pd
df10 = pd.read_csv(file_path)

  df10 = pd.read_csv(file_path)


In [37]:
df2 = spark_session.read.csv(file_path)
df2.show(5)

+----+-------------------+--------------------+--------------------+--------------------+--------+-----------------+-----------------+--------------------+-------+--------+-----------+--------+--------+-------+--------------------+--------------------+----------+----------+--------------------+-----------+
| _c0|                _c1|                 _c2|                 _c3|                 _c4|     _c5|              _c6|              _c7|                 _c8|    _c9|    _c10|       _c11|    _c12|    _c13|   _c14|                _c15|                _c16|      _c17|      _c18|                _c19|       _c20|
+----+-------------------+--------------------+--------------------+--------------------+--------+-----------------+-----------------+--------------------+-------+--------+-----------+--------+--------+-------+--------------------+--------------------+----------+----------+--------------------+-----------+
|null|     published_time|                 url|               title|        

In [38]:
df2 = spark_session.read.option("header", "true").csv(file_path)

In [39]:
df2 = df2.withColumnRenamed("_c0", "number")

In [40]:
df2.show(5)

+------+-------------------+--------------------+--------------------+--------------------+--------+-----------------+-----------------+--------------------+-------+--------+-----------+--------+--------+-------+--------------------+--------------------+----------+----------+--------------------+-----------+
|number|     published_time|                 url|               title|           image_url|LangCode|         doc_tone|DomainCountryCode|            location|    Lat|     Lon|CountryCode|Adm1Code|Adm2Code|GeoType|     contextual_text|            GeoCoord|row_number|state_code|              source|   category|
+------+-------------------+--------------------+--------------------+--------------------+--------+-----------------+-----------------+--------------------+-------+--------+-----------+--------+--------+-------+--------------------+--------------------+----------+----------+--------------------+-----------+
|     0|2022-11-30 03:30:00|https://www.npr.o...|Twitter COVID mis...|

In [41]:
df2.count()

2877831

In [42]:
df2 = df2.fillna('null')


In [43]:
from pyspark.sql import functions as sf2
df2 = df2.withColumn('description', sf2.concat(sf2.col('title'),sf2.lit(' '), sf2.col('contextual_text')))

In [44]:
df2 = df2.select("description","category")
df2.show(5)

+--------------------+-----------+
|         description|   category|
+--------------------+-----------+
|Twitter COVID mis...|placeholder|
|City - led initia...|placeholder|
|Applications avai...|placeholder|
|UW - Stout resear...|placeholder|
|Voice of the Peop...|placeholder|
+--------------------+-----------+
only showing top 5 rows



In [45]:
prediction_data = logistic_model.transform(df2)

In [46]:
prediction_data.columns


['description',
 'category',
 'tokens',
 'removed_stopwords',
 'extracted_features',
 'vectorizedFeatures',
 'rawPrediction',
 'probability',
 'prediction']

In [47]:
prediction_data.select('description','rawPrediction', 'probability','category','prediction').show(10)


+--------------------+--------------------+--------------------+-----------+----------+
|         description|       rawPrediction|         probability|   category|prediction|
+--------------------+--------------------+--------------------+-----------+----------+
|Twitter COVID mis...|[-2.9183719094745...|[2.04954554534184...|placeholder|       8.0|
|City - led initia...|[7.16448183226753...|[0.62146821826662...|placeholder|       0.0|
|Applications avai...|[25.9974741266751...|[0.99999999820699...|placeholder|       0.0|
|UW - Stout resear...|[0.12212815487219...|[1.52607565710686...|placeholder|      13.0|
|Voice of the Peop...|[7.04688808222974...|[0.00135102871993...|placeholder|       8.0|
|Grand Forks Count...|[25.9120253863951...|[0.99999952242962...|placeholder|       0.0|
|Neighborhoods Eva...|[-12.499834485524...|[8.64206489871482...|placeholder|      13.0|
|Time for NIAA to ...|[18.1117054593245...|[0.99999686759861...|placeholder|       0.0|
|River Valley orga...|[21.858721

In [48]:
prediction_column = prediction_data.select('prediction')

In [49]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# define a user-defined function to map category_number to category name
map_label_udf = udf(lambda x: label_dict.get(x), StringType())

# add a new column 'predicted_category' by mapping the category_number column to category name
prediction_column = prediction_column.withColumn('predicted_category', map_label_udf(prediction_column.prediction))

# select only the predicted_category column
final_predictions = prediction_column.select('predicted_category')


In [50]:
final_predictions.show(2)

+------------------+
|predicted_category|
+------------------+
|             WOMEN|
|          POLITICS|
+------------------+
only showing top 2 rows



In [150]:
final_predictions.show(5)

+------------------+
|predicted_category|
+------------------+
|             WOMEN|
|          POLITICS|
|          POLITICS|
|            IMPACT|
|             WOMEN|
+------------------+
only showing top 5 rows



In [151]:
final_predictions.count()

2877831

In [52]:
df10.head()

Unnamed: 0.1,Unnamed: 0,published_time,url,title,image_url,LangCode,doc_tone,DomainCountryCode,location,Lat,...,CountryCode,Adm1Code,Adm2Code,GeoType,contextual_text,GeoCoord,row_number,state_code,source,category
0,0,2022-11-30 03:30:00,https://www.npr.org/2022/11/29/1139822833/twit...,Twitter COVID misinformation policy will no lo...,https://media.npr.org/assets/img/2022/11/29/ap...,eng,-5.711086,US,"Yeshiva University, New York, United States",40.852,...,US,USNY,NY061,3,virus stay folks do not cede the town square t...,POINT(-73.9285 40.852),1,NY,gdelt-bq.covid19.onlinenewsgeo,placeholder
1,1,2022-11-29 15:15:00,https://bismarcktribune.com/news/local/city-le...,City - led initiative takes aim at homelessnes...,https://bloximages.chicago2.vip.townnews.com/b...,eng,-1.504425,US,"Bismarck, North Dakota, United States",46.8083,...,US,USND,ND015,3,to services benefiting people in need the idea...,POINT(-100.784 46.8083),1,ND,gdelt-bq.covid19.onlinenewsgeo,placeholder
2,2,2022-11-08 06:30:00,https://manchesterinklink.com/applications-ava...,Applications available for Restaurant Infrastr...,https://manchesterinklink.com/wp-content/uploa...,eng,2.016129,US,"New Hampshire, United States",43.4108,...,US,USNH,,2,picking up or having food delivered safe and e...,POINT(-71.5653 43.4108),1,NH,gdelt-bq.covid19.onlinenewsgeo,placeholder
3,3,2022-11-02 12:15:00,https://chippewa.com/community/dunnconnect/uw-...,UW - Stout research at Devil Punchbowl is focu...,https://bloximages.chicago2.vip.townnews.com/c...,eng,0.631579,US,"Eau Claire, Wisconsin, United States",44.8113,...,US,USWI,WI035,3,hadnt previously been identified in dunn count...,POINT(-91.4985 44.8113),1,WI,gdelt-bq.covid19.onlinenewsgeo,placeholder
4,4,2022-11-23 15:00:00,https://www.daily-journal.com/opinion/voice-of...,Voice of the People : Say No to violence again...,https://bloximages.newyork1.vip.townnews.com/d...,eng,-2.835052,US,"Kankakee, Illinois, United States",41.12,...,US,USIL,IL091,3,violence mostly by an intimate partner violenc...,POINT(-87.8612 41.12),1,IL,gdelt-bq.covid19.onlinenewsgeo,placeholder


In [53]:
df10 = df10.rename(columns={'Unnamed: 0': 'number'})


In [54]:
df10 = df10[['url', 'title','state_code']]

In [56]:
df10.head()

Unnamed: 0,url,title,state_code
0,https://www.npr.org/2022/11/29/1139822833/twit...,Twitter COVID misinformation policy will no lo...,NY
1,https://bismarcktribune.com/news/local/city-le...,City - led initiative takes aim at homelessnes...,ND
2,https://manchesterinklink.com/applications-ava...,Applications available for Restaurant Infrastr...,NH
3,https://chippewa.com/community/dunnconnect/uw-...,UW - Stout research at Devil Punchbowl is focu...,WI
4,https://www.daily-journal.com/opinion/voice-of...,Voice of the People : Say No to violence again...,IL


In [55]:
df10.reset_index(drop=True, inplace=True)

In [68]:
df10.head()

Unnamed: 0,url,title,state_code
0,https://www.npr.org/2022/11/29/1139822833/twit...,Twitter COVID misinformation policy will no lo...,NY
1,https://bismarcktribune.com/news/local/city-le...,City - led initiative takes aim at homelessnes...,ND
2,https://manchesterinklink.com/applications-ava...,Applications available for Restaurant Infrastr...,NH
3,https://chippewa.com/community/dunnconnect/uw-...,UW - Stout research at Devil Punchbowl is focu...,WI
4,https://www.daily-journal.com/opinion/voice-of...,Voice of the People : Say No to violence again...,IL


In [69]:
df10.to_csv('title.csv', index=False)

In [None]:
df10 = df10.fillna('null')

In [70]:
files.download('title.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [152]:
import pandas as pd

predictions_data_pd = final_predictions.select("predicted_category").toPandas()

In [156]:
predictions_data_pd.count()

predicted_category    2877831
dtype: int64

In [154]:
predictions_data_pd.to_csv("preds.csv", index=False)

In [155]:
from google.colab import files
files.download('preds.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [106]:
final_predictions.count()

2877831

In [66]:
df10.count()

url           2877831
title         2877831
state_code    2877831
dtype: int64

In [67]:
df10['title'].isnull().sum()


0

In [65]:
df10['title'] = df10['title'].fillna('No Title')


In [71]:
import pandas as pd

# Read the two CSV files
df11 = pd.read_csv('preds.csv')
df13 = pd.read_csv('title.csv')

# Concatenate the two dataframes horizontally
mer = pd.concat([df11, df13], axis=1)

# Save the merged dataframe to a CSV file
mer.to_csv('merged.csv', index=False)


In [72]:
files.download('merged.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [73]:
df4 = pd.read_csv('merged.csv')

In [74]:
df4.head()

Unnamed: 0,predicted_category,url,title,state_code
0,WOMEN,https://www.npr.org/2022/11/29/1139822833/twit...,Twitter COVID misinformation policy will no lo...,NY
1,POLITICS,https://bismarcktribune.com/news/local/city-le...,City - led initiative takes aim at homelessnes...,ND
2,POLITICS,https://manchesterinklink.com/applications-ava...,Applications available for Restaurant Infrastr...,NH
3,IMPACT,https://chippewa.com/community/dunnconnect/uw-...,UW - Stout research at Devil Punchbowl is focu...,WI
4,WOMEN,https://www.daily-journal.com/opinion/voice-of...,Voice of the People : Say No to violence again...,IL


In [75]:
df4.count()

predicted_category    2877831
url                   2877831
title                 2877831
state_code            2877831
dtype: int64