In [1]:
import os

# Switch the working directory to the project checkout before anything else
# runs (presumably so the `src.*` imports and relative data paths used later
# in the notebook resolve — confirm against the project layout).
#
# The location can be overridden per machine by setting NEWS_RELIABILITY_ROOT
# before starting the kernel. An unset or empty variable falls back to the
# original author's checkout path, so existing setups behave exactly as before.
project_root = (
    os.environ.get("NEWS_RELIABILITY_ROOT")
    or "/Users/nimisha/Desktop/learning/news-reliability-detector/"
)
os.chdir(project_root)  # raises FileNotFoundError if the directory is missing

In [2]:
from pyspark.sql import SparkSession

# local modules
from src.load.load_data import load
from src.preprocessing.preprocessing import (
    print_null_rows,
    drop_missing_text_rows,
    shuffle_rows
)

In [3]:
# Obtain the SparkSession used throughout the notebook; getOrCreate() reuses
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("NewsReliability")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/24 13:34:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Build the working DataFrame via the project's loader (src.load.load_data),
# handing it the active SparkSession. The cells that follow preview the rows
# and print the resulting schema.
data = load(spark)

                                                                                

In [5]:
# Preview the first 10 rows; long cell values are abbreviated with "..." in
# the printed table.
data.show(10)

+--------------------+--------------------+-------+-----------------+------+
|               title|                text|subject|             date|target|
+--------------------+--------------------+-------+-----------------+------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|     1|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|     1|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|     1|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|     1|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|     1|
| Racist Alabama C...|The number of cas...|   News|December 25, 2017|     1|
| Fresh Off The Go...|Donald Trump spen...|   News|December 23, 2017|     1|
| Trump Said Some ...|In the wake of ye...|   News|December 23, 2017|     1|
| Former CIA Direc...|Many people have ...|   News|December 22, 2017|     1|
| WATCH: Brand-New...|Just when you mig...|   News|December 21, 2017|     1|

In [6]:
# Print each column's name, type, and nullability.
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- target: integer (nullable = false)



# Handling Missing Values

In [7]:
# Print a one-row summary table with one count per column (presumably the
# number of nulls in each column; the helper is defined in
# src.preprocessing.preprocessing — confirm there).
print_null_rows(data)



+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
|    0|   8|      8|   8|     0|
+-----+----+-------+----+------+



                                                                                

In [8]:
# Display the rows whose `text` column is null so they can be inspected
# before deciding how to handle them.
data.where(data["text"].isNull()).show()

+--------------------+----+-------+----+------+
|               title|text|subject|date|target|
+--------------------+----+-------+----+------+
|                   "|NULL|   NULL|NULL|     1|
|Ex-GOP Congressma...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|WATCH: Democratic...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|Trump Gets STOMPE...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|Donald Trump Gets...|NULL|   NULL|NULL|     1|
+--------------------+----+-------+----+------+



In [9]:
# Rebind `data` to the result of the project's drop_missing_text_rows helper.
# NOTE(review): this overwrites the loaded frame, so re-running earlier
# inspection cells afterwards will show the post-drop state.
data = drop_missing_text_rows(data)

In [10]:
# Sanity check after the drop: no row with a null `text` should remain, so
# an empty table is the expected output.
data.where(data["text"].isNull()).show()

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
+-----+----+-------+----+------+



In [11]:
# Re-run the per-column summary on the cleaned frame; every count is expected
# to be 0 at this point.
print_null_rows(data)

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
|    0|   0|      0|   0|     0|
+-----+----+-------+----+------+



# Randomize the order of data rows

In [12]:
# Rebind `data` to the output of the project's shuffle_rows helper, then
# preview 10 rows of the reordered frame.
# NOTE(review): no seed is passed here — confirm whether shuffle_rows is
# deterministic; if it is not, re-running this cell changes the row order.
data = shuffle_rows(data)
data.show(10)

+--------------------+--------------------+------------+-------------------+------+
|               title|                text|     subject|               date|target|
+--------------------+--------------------+------------+-------------------+------+
|Top U.S. Marine's...|WASHINGTON (Reute...|politicsNews|  December 5, 2016 |     0|
|British woman win...|COLOMBO (Reuters)...|   worldnews| November 15, 2017 |     0|
|Malawi vigilante ...|LILONGWE (Reuters...|   worldnews|  October 24, 2017 |     0|
|Russia says Syria...|MOSCOW (Reuters) ...|politicsNews|     April 7, 2017 |     0|
| Amazon Sells Out...|Conservatives are...|        News|   January 16, 2017|     1|
|FACEBOOK HIRES PO...|Facebook has anno...|   left-news|       Dec 21, 2016|     1|
|London demands an...|HONG KONG (Reuter...|   worldnews|  October 11, 2017 |     0|
|Berlusconi's cent...|PALERMO, Italy (R...|   worldnews|  November 5, 2017 |     0|
|Syria producing m...|DAMASCUS (Reuters...|   worldnews|September 26, 2017 |