In [15]:
import os

# Root directory of the news-reliability-detector checkout. The working
# directory is switched there, presumably so the project-local `src.*` imports
# in the following cell resolve relative to it.
#
# The location can be overridden per machine by setting the
# NEWS_RELIABILITY_ROOT environment variable; when it is unset (or empty) the
# original author's local path is used, so existing behaviour is unchanged.
project_root = (
    os.environ.get("NEWS_RELIABILITY_ROOT")
    or "/Users/nimisha/Desktop/learning/news-reliability-detector/"
)
os.chdir(project_root)

In [16]:
from pyspark.sql import SparkSession

# local modules
from src.load.load_data import load
from src.preprocessing.preprocessing import (
    print_null_rows,
    drop_missing_text_rows,
    shuffle_rows
)

In [17]:
# Obtain the Spark session for this notebook under the application name
# "NewsReliability"; getOrCreate() reuses an already-active session if there is one.
spark = (
    SparkSession.builder
    .appName("NewsReliability")
    .getOrCreate()
)

In [18]:
# Build the working DataFrame with the project's load() helper
# (src.load.load_data), handing it the active Spark session.
data = load(spark)

In [19]:
# Preview the first 10 rows as a sanity check on the freshly loaded data.
data.show(n=10)

+--------------------+--------------------+-------+-----------------+------+
|               title|                text|subject|             date|target|
+--------------------+--------------------+-------+-----------------+------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|     1|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|     1|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|     1|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|     1|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|     1|
| Racist Alabama C...|The number of cas...|   News|December 25, 2017|     1|
| Fresh Off The Go...|Donald Trump spen...|   News|December 23, 2017|     1|
| Trump Said Some ...|In the wake of ye...|   News|December 23, 2017|     1|
| Former CIA Direc...|Many people have ...|   News|December 22, 2017|     1|
| WATCH: Brand-New...|Just when you mig...|   News|December 21, 2017|     1|

In [20]:
# Print column names, types and nullability of the loaded DataFrame.
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- target: integer (nullable = false)



# Handling Missing Values

In [21]:
# Report, per column, how many values are missing before any cleaning
# (project helper from src.preprocessing.preprocessing).
print_null_rows(data)

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
|    0|   8|      8|   8|     0|
+-----+----+-------+----+------+



In [22]:
# Inspect the rows whose `text` is missing.
# NOTE(review): several of these rows have a title that is just a lone `"`,
# which looks like multi-line CSV records being split by the reader -- confirm
# the parsing options used inside load().
data.where(data["text"].isNull()).show()

+--------------------+----+-------+----+------+
|               title|text|subject|date|target|
+--------------------+----+-------+----+------+
|                   "|NULL|   NULL|NULL|     1|
|Ex-GOP Congressma...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|WATCH: Democratic...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|Trump Gets STOMPE...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|Donald Trump Gets...|NULL|   NULL|NULL|     1|
+--------------------+----+-------+----+------+



In [23]:
# Rebind `data` to the output of the project's drop_missing_text_rows() helper;
# the checks in the next cells show no rows with a null `text` remain afterwards.
data = drop_missing_text_rows(data)

In [24]:
# Verify the cleanup: this should now display an empty table.
data.where(data["text"].isNull()).show()

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
+-----+----+-------+----+------+



In [25]:
# Re-run the per-column missing-value report to confirm every column is now at 0.
print_null_rows(data)

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
|    0|   0|      0|   0|     0|
+-----+----+-------+----+------+



# Randomize the order of data rows

In [26]:
# Rebind `data` to the output of the project's shuffle_rows() helper, then
# preview 10 rows to confirm that labels and subjects are now interleaved.
# NOTE(review): no seed is passed here; if shuffle_rows() is unseeded, the row
# order (and anything derived from it) will differ between runs -- confirm.
data = shuffle_rows(data)
data.show(n=10)

+--------------------+--------------------+------------+------------------+------+
|               title|                text|     subject|              date|target|
+--------------------+--------------------+------------+------------------+------+
|HOW HILLARY’S ANT...|Maybe the Left be...|   left-news|      Jul 14, 2016|     1|
|Irish PM's party ...|DUBLIN (Reuters) ...|   worldnews|November 24, 2017 |     0|
|Sessions' talks w...|ABOARD AIR FORCE ...|politicsNews|    March 2, 2017 |     0|
| Snowflake Alert:...|White House couns...|        News|      May 11, 2017|     1|
|Iran aircraft dea...|PARIS (Reuters) -...|   worldnews| October 17, 2017 |     0|
|Iran still trying...|BERLIN (Reuters) ...|   worldnews| October 11, 2017 |     0|
|Highlights: The T...|(Reuters) - Highl...|politicsNews|    March 7, 2017 |     0|
|BREAKING: REPORTE...|Health profession...|   left-news|      Oct 10, 2016|     1|
|MICHELLE AND BARA...|President Barack ...|    politics|      Feb 20, 2016|     1|
|Tru

# Drop columns

In [27]:
# Columns removed from the working DataFrame before the text-transformation step.
columns_to_drop = ["date", "title"]

# DataFrame.drop takes column names as separate arguments, hence the unpacking.
data = data.drop(*columns_to_drop)

# Show two rows to confirm only text / subject / target remain.
data.show(n=2)

+--------------------+---------+------+
|                text|  subject|target|
+--------------------+---------+------+
|Maybe the Left be...|left-news|     1|
|DUBLIN (Reuters) ...|worldnews|     0|
+--------------------+---------+------+
only showing top 2 rows



# Transform Text