In [1]:
import os

# Make the repository root the working directory — presumably so that the
# project-local `src.*` imports and any relative data paths resolve; confirm
# against the loader.
# The location can be overridden per machine through the NEWS_RELIABILITY_ROOT
# environment variable; when it is unset, the original checkout path is used.
project_root = os.environ.get(
    "NEWS_RELIABILITY_ROOT",
    "/Users/nimisha/Desktop/learning/news-reliability-detector/",
)
os.chdir(project_root)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, udf
from pyspark.sql.types import StringType

# local modules
from src.load.load_data import load
from src.preprocessing.preprocessing import (
    print_null_rows,
    drop_missing_text_rows,
    shuffle_rows,
    remove_stop_words,
    stem_words
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nimisha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Start a Spark session named "NewsReliability", or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("NewsReliability")
    .getOrCreate()
)

23/12/24 17:14:11 WARN Utils: Your hostname, centella.local resolves to a loopback address: 127.0.0.1; using 192.168.70.82 instead (on interface en0)
23/12/24 17:14:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/24 17:14:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the dataset through the project's loader (src.load.load_data) using the
# active Spark session; the result is the DataFrame every later cell works on.
data = load(spark)

In [5]:
# Preview the first 10 rows; show() truncates long string values for display.
data.show(10)

+--------------------+--------------------+-------+-----------------+------+
|               title|                text|subject|             date|target|
+--------------------+--------------------+-------+-----------------+------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|     1|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|     1|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|     1|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|     1|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|     1|
| Racist Alabama C...|The number of cas...|   News|December 25, 2017|     1|
| Fresh Off The Go...|Donald Trump spen...|   News|December 23, 2017|     1|
| Trump Said Some ...|In the wake of ye...|   News|December 23, 2017|     1|
| Former CIA Direc...|Many people have ...|   News|December 22, 2017|     1|
| WATCH: Brand-New...|Just when you mig...|   News|December 21, 2017|     1|

In [6]:
# Print column names, types and nullability of the loaded DataFrame.
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- target: integer (nullable = false)



# Handling Missing Values

In [7]:
# Report missing values per column with the project helper before any cleaning.
print_null_rows(data)

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
|    0|   8|      8|   8|     0|
+-----+----+-------+----+------+



                                                                                

In [8]:
# Inspect the rows whose text column is null.
data.where(col("text").isNull()).show()

+--------------------+----+-------+----+------+
|               title|text|subject|date|target|
+--------------------+----+-------+----+------+
|                   "|NULL|   NULL|NULL|     1|
|Ex-GOP Congressma...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|WATCH: Democratic...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|Trump Gets STOMPE...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|Donald Trump Gets...|NULL|   NULL|NULL|     1|
+--------------------+----+-------+----+------+



In [9]:
# Discard rows with missing text using the project helper; the following cells
# verify that no null text values remain.
data = drop_missing_text_rows(data)

In [10]:
# Sanity check: this should print an empty table if no null text rows remain.
data.where(col("text").isNull()).show()

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
+-----+----+-------+----+------+



In [11]:
# Re-run the per-column missing-value report to confirm the cleanup.
print_null_rows(data)

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
|    0|   0|      0|   0|     0|
+-----+----+-------+----+------+



# Randomize the order of data rows

In [12]:
# Randomize the row order with the project helper, then preview the result.
# NOTE(review): no seed is passed here — confirm whether shuffle_rows gives a
# reproducible order across Restart & Run All.
data = shuffle_rows(data)
data.show(10)

+--------------------+--------------------+---------------+------------------+------+
|               title|                text|        subject|              date|target|
+--------------------+--------------------+---------------+------------------+------+
|Yahoo under scrut...|NEW YORK (Reuters...|   politicsNews|December 15, 2016 |     0|
|TRUMP SETS THE RE...|                    |       politics|      Nov 16, 2016|     1|
|UK PM May says sh...|LONDON (Reuters) ...|      worldnews|December 22, 2017 |     0|
|DEMOCRAT MOCKS AM...|Not that it matte...|      left-news|       Jun 5, 2015|     1|
|YIKES! WHAT THE H...|Hillary was seen ...|       politics|       Sep 6, 2016|     1|
|Saudi minister sa...|BEIRUT (Reuters) ...|      worldnews| November 4, 2017 |     0|
|Italian woman jai...|COPENHAGEN (Reute...|      worldnews|December 15, 2017 |     0|
| Bette Midler Goe...|Once again, Bette...|           News| February 25, 2016|     1|
|THE ‘IN YOUR FACE...|Yes, that s Justi...|Government 

# Drop columns

In [13]:
# Remove the date and title columns, leaving text, subject and target.
columns_to_drop = ["date", "title"]
data = data.drop(*columns_to_drop)
data.show(2)

+--------------------+------------+------+
|                text|     subject|target|
+--------------------+------------+------+
|NEW YORK (Reuters...|politicsNews|     0|
|                    |    politics|     1|
+--------------------+------------+------+
only showing top 2 rows



# Duplicate Rows

In [14]:
# One row per distinct combination of all columns that occurs more than once,
# together with how many times it occurs.
duplicate_rows = (
    data.groupBy(data.columns)
    .count()
    .filter("count > 1")
)

# "Duplicates" is the number of duplicated groups, not the number of surplus
# rows; the surplus is Total minus Distinct.
for label, frame in (
    ("Duplicates", duplicate_rows),
    ("Total", data),
    ("Distinct", data.distinct()),
):
    print(label, frame.count())

                                                                                

Duplicates 465
Total 44899




Distinct 43782


                                                                                

In [15]:
# Preview some duplicated groups and their occurrence counts.
# NOTE(review): in the output the subject column holds fragments of article
# text for these rows — possibly a field-splitting problem at load time; verify
# against the loader.
duplicate_rows.show(10)

+--------------------+--------------------+------+-----+
|                text|             subject|target|count|
+--------------------+--------------------+------+-----+
|"#JokeNewsAfter r...| 2017The former M...|     1|    2|
|"Liberal actor an...| 2017Harvey Weins...|     1|    2|
|"President Trump ...| we're not lookin...|     1|    2|
|"Judge Jeanine le...| and?"" he says. ...|     1|    2|
|"Local artist Ric...| 2017Later on Twi...|     1|    2|
|"Brigitte Macron ...|""  THIS: pic.twi...|     1|    2|
|" I m a political...|               2017"|     1|    2|
|"This unbelievabl...|               2016"|     1|    2|
|"Remember the tim...|     fjs);}(document|     1|    2|
|"Barack Obama has...|        undocumented|     1|    2|
+--------------------+--------------------+------+-----+
only showing top 10 rows



In [16]:
# Keep a single copy of each fully identical row, then report the row count.
data = data.dropDuplicates()
data.count()

                                                                                

43782

# Text Transform 

### 1. Lower Case

In [17]:
# Normalize case: lower-case both the text and the subject columns.
data = (
    data
    .withColumn("text", lower(col("text")))
    .withColumn("subject", lower(col("subject")))
)
data.show(5)

+--------------------+-------+------+
|                text|subject|target|
+--------------------+-------+------+
|president donald ...|   news|     1|
|the republican fa...|   news|     1|
|there s a lot of ...|   news|     1|
|everyone knows wh...|   news|     1|
|donald trump took...|   news|     1|
+--------------------+-------+------+
only showing top 5 rows



### 2. Remove Special Characters
- Strip every character that is not an ASCII letter, digit, or whitespace (e.g. `!`, `@`).

In [18]:
# Delete every character that is not an ASCII letter, a digit, or whitespace.
# Matches are replaced with the empty string, so words joined only by
# punctuation end up concatenated.
data = data.withColumn(
    "text",
    regexp_replace(col("text"), "[^a-zA-Z0-9\\s]", ""),
)
data.show(5)

+--------------------+-------+------+
|                text|subject|target|
+--------------------+-------+------+
|president donald ...|   news|     1|
|the republican fa...|   news|     1|
|there s a lot of ...|   news|     1|
|everyone knows wh...|   news|     1|
|donald trump took...|   news|     1|
+--------------------+-------+------+
only showing top 5 rows



### 3. Remove Stopwords

In [19]:
# Wrap the project's stop-word remover as a string-returning Spark UDF and
# apply it to the text column in place.
remove_stop_words_udf = udf(f=remove_stop_words, returnType=StringType())
data = data.withColumn(
    "text",
    remove_stop_words_udf(col("text")),
)
data.show(10)

[Stage 54:>                                                         (0 + 1) / 1]

+--------------------+------------+------+
|                text|     subject|target|
+--------------------+------------+------+
|disrespectful cry...|   left-news|     1|
|reuters us suprem...|politicsnews|     0|
|trend tragic gun ...|        news|     1|
|message peace uni...|    politics|     1|
|jay dyer 21st cen...| middle-east|     1|
|man arrested miam...|   left-news|     1|
|reince priebus do...|        news|     1|
|reuters ailing pu...|politicsnews|     0|
|reuters us attorn...|politicsnews|     0|
|washington reuter...|politicsnews|     0|
+--------------------+------------+------+
only showing top 10 rows



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nimisha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
                                                                                

### 4. Stem Words

In [22]:
# Wrap the project's stemmer as a string-returning Spark UDF and apply it to
# the text column in place.
stem_words_udf = udf(f=stem_words, returnType=StringType())
data = data.withColumn(
    "text",
    stem_words_udf(col("text")),
)
data.show(5)

[Stage 61:>                                                         (0 + 1) / 1]

+--------------------+------------+------+
|                text|     subject|target|
+--------------------+------------+------+
|disrespect crybab...|   left-news|     1|
|reuter us suprem ...|politicsnews|     0|
|trend tragic gun ...|        news|     1|
|messag peac uniti...|    politics|     1|
|jay dyer 21st cen...| middle-east|     1|
+--------------------+------------+------+
only showing top 5 rows



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nimisha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
                                                                                