In [123]:
import os

project_root = "/Users/nimisha/Desktop/learning/news-reliability-detector/"
os.chdir(project_root)

In [124]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, udf, concat
from pyspark.sql.types import StringType

# local modules
from src.load.load_data import load
from src.preprocessing.preprocessing import (
    print_null_rows,
    drop_missing_text_rows,
    shuffle_rows,
    remove_stop_words,
    stem_words,
)

In [125]:
spark = SparkSession.builder.appName("NewsReliability").getOrCreate()

In [126]:
data = load(spark)

                                                                                

In [127]:
data.show(10)

+--------------------+--------------------+-------+-----------------+------+
|               title|                text|subject|             date|target|
+--------------------+--------------------+-------+-----------------+------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|     1|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|     1|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|     1|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|     1|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|     1|
| Racist Alabama C...|The number of cas...|   News|December 25, 2017|     1|
| Fresh Off The Go...|Donald Trump spen...|   News|December 23, 2017|     1|
| Trump Said Some ...|In the wake of ye...|   News|December 23, 2017|     1|
| Former CIA Direc...|Many people have ...|   News|December 22, 2017|     1|
| WATCH: Brand-New...|Just when you mig...|   News|December 21, 2017|     1|

In [128]:
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- target: integer (nullable = false)



# Handling Missing Values

In [129]:
print_null_rows(data)

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
|    0|   8|      8|   8|     0|
+-----+----+-------+----+------+



                                                                                

In [130]:
data.filter(data.text.isNull()).show()

+--------------------+----+-------+----+------+
|               title|text|subject|date|target|
+--------------------+----+-------+----+------+
|                   "|NULL|   NULL|NULL|     1|
|Ex-GOP Congressma...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|WATCH: Democratic...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|Trump Gets STOMPE...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|Donald Trump Gets...|NULL|   NULL|NULL|     1|
+--------------------+----+-------+----+------+



In [131]:
data = drop_missing_text_rows(data)

In [132]:
data.filter(data.text.isNull()).show()

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
+-----+----+-------+----+------+



In [133]:
print_null_rows(data)

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
|    0|   0|      0|   0|     0|
+-----+----+-------+----+------+



# Randomize the order of data rows

In [134]:
data = shuffle_rows(data)
data.show(10)

+--------------------+--------------------+------------+------------------+------+
|               title|                text|     subject|              date|target|
+--------------------+--------------------+------------+------------------+------+
|EMPLOYEES IMPLANT...|We reported last ...|   left-news|       Aug 1, 2017|     1|
|Brazil Supreme Co...|BRASILIA (Reuters...|   worldnews| October 13, 2017 |     0|
|Hillary Supporter...|TMZ caught up wit...|    politics|       Feb 9, 2017|     1|
|Canada's Trudeau ...|OTTAWA (Reuters) ...|politicsNews|   April 27, 2017 |     0|
|Mugabe given unti...|HARARE (Reuters) ...|   worldnews|November 19, 2017 |     0|
|WATCH: Democrat H...| The newly electe...|   left-news|      Dec 17, 2017|     1|
|FAKE NEWS! MAXINE...|                    |   left-news|      Mar 20, 2017|     1|
|Merkel meets Trum...|BERLIN (Reuters) ...|politicsNews|   March 12, 2017 |     0|
|THE VIDEO HILLARY...|Spread this EVERY...|    politics|       Mar 9, 2016|     1|
|Ven

## Feature Creating

In [135]:
# combine title and text column to make news content
data = data.withColumn('content', concat(data['title'], data['text']))
data.show(3)

+--------------------+--------------------+---------+-----------------+------+--------------------+
|               title|                text|  subject|             date|target|             content|
+--------------------+--------------------+---------+-----------------+------+--------------------+
|EMPLOYEES IMPLANT...|We reported last ...|left-news|      Aug 1, 2017|     1|EMPLOYEES IMPLANT...|
|Brazil Supreme Co...|BRASILIA (Reuters...|worldnews|October 13, 2017 |     0|Brazil Supreme Co...|
|Hillary Supporter...|TMZ caught up wit...| politics|      Feb 9, 2017|     1|Hillary Supporter...|
+--------------------+--------------------+---------+-----------------+------+--------------------+
only showing top 3 rows



In [136]:
data.select("content").show(n = 5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Drop columns

In [137]:
columns_to_drop = ["date", "title", "subject", "text"]
data = data.drop(*columns_to_drop)
data.show(2)

+------+--------------------+
|target|             content|
+------+--------------------+
|     1|EMPLOYEES IMPLANT...|
|     0|Brazil Supreme Co...|
+------+--------------------+
only showing top 2 rows



# Duplicate Rows

In [138]:
duplicate_rows = data.groupBy(data.columns).count().filter("count > 1")

print("Duplicates", duplicate_rows.count())
print("Total", data.count())
print("Distinct", data.distinct().count())

                                                                                

Duplicates 5305
Total 44899




Distinct 39106


                                                                                

In [139]:
duplicate_rows.show(10)



+------+--------------------+-----+
|target|             content|count|
+------+--------------------+-----+
|     1|JUST IN: SUSPECTE...|    2|
|     1|YOU’LL NEVER GUES...|    2|
|     1|HOUSE INTEL Slaps...|    2|
|     1|FLASHBACK: CNN HO...|    2|
|     1|PRICELESS! WATCH ...|    2|
|     1|CNN’S FAKE NEWS B...|    2|
|     1|WATCH: MEGHAN MCC...|    2|
|     1|“MAXINE WATERS IN...|    2|
|     1|HEY CNN…REMEMBER ...|    2|
|     1|HOW PRESIDENT TRU...|    2|
+------+--------------------+-----+
only showing top 10 rows



                                                                                

In [140]:
data = data.dropDuplicates()
data.count()

                                                                                

39106

# Text Transform 

### 1. Lower Case

In [141]:
# convert to all lower case
data = data.withColumn("content", lower(col("content")))
data.show(5)



+------+--------------------+
|target|             content|
+------+--------------------+
|     1| trump bucks gop ...|
|     1| australia prime ...|
|     1| mccain f*cked ov...|
|     1| american psychoa...|
|     1| senate gives tru...|
+------+--------------------+
only showing top 5 rows



                                                                                

### 2. Remove Special Characters 
- like ! @

In [142]:
data = data.withColumn("content", regexp_replace("content", "[^a-zA-Z0-9\\s]", ""))
data.show(5)



+------+--------------------+
|target|             content|
+------+--------------------+
|     1| trump bucks gop ...|
|     1| australia prime ...|
|     1| mccain fcked ove...|
|     1| american psychoa...|
|     1| senate gives tru...|
+------+--------------------+
only showing top 5 rows



                                                                                

### 3. Remove Stopwords

In [143]:
remove_stop_words_udf = udf(remove_stop_words, StringType())
data = data.withColumn("content", remove_stop_words_udf("content"))
data.show(10)

[Stage 507:>                                                        (0 + 1) / 1]

+------+--------------------+
|target|             content|
+------+--------------------+
|     1|mainstream media ...|
|     0|air strikes kill ...|
|     1|new normal massiv...|
|     0|syrias eastern gh...|
|     1|trumps staff thin...|
|     1|cher rosie desper...|
|     1|trump faking poll...|
|     1|illegal immigrant...|
|     0|des moines regist...|
|     0|trump denies aski...|
+------+--------------------+
only showing top 10 rows



                                                                                

### 4. Stemming words

In [144]:
stem_words_udf = udf(stem_words, StringType())
data = data.withColumn("content", stem_words_udf("content"))
data.show(5)

[Stage 514:>                                                        (0 + 1) / 1]

+------+--------------------+
|target|             content|
+------+--------------------+
|     1|mainstream media ...|
|     0|air strike kill 6...|
|     1|new normal massiv...|
|     0|syria eastern gho...|
|     1|trump staff think...|
+------+--------------------+
only showing top 5 rows



                                                                                

In [145]:
path = "data/processed"
if not os.path.exists(path):
    os.makedirs(path)

data.coalesce(1).toPandas().to_csv(
    "data/processed/processed.csv", index=False
)

                                                                                