In [35]:
import os

project_root = "/Users/nimisha/Desktop/learning/news-reliability-detector/"
os.chdir(project_root)

In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, udf, concat
from pyspark.sql.types import StringType
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# local modules
from src.load.load_data import load
from src.preprocessing.preprocessing import (
    print_null_rows,
    drop_missing_text_rows,
    shuffle_rows,
    remove_stop_words,
    stem_words
)

In [37]:
spark = SparkSession.builder.appName("NewsReliability").getOrCreate()

In [38]:
data = load(spark)

In [39]:
data.show(10)

+--------------------+--------------------+-------+-----------------+------+
|               title|                text|subject|             date|target|
+--------------------+--------------------+-------+-----------------+------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|     1|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|     1|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|     1|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|     1|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|     1|
| Racist Alabama C...|The number of cas...|   News|December 25, 2017|     1|
| Fresh Off The Go...|Donald Trump spen...|   News|December 23, 2017|     1|
| Trump Said Some ...|In the wake of ye...|   News|December 23, 2017|     1|
| Former CIA Direc...|Many people have ...|   News|December 22, 2017|     1|
| WATCH: Brand-New...|Just when you mig...|   News|December 21, 2017|     1|

In [40]:
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- target: integer (nullable = false)



# Handling Missing Values

In [41]:
print_null_rows(data)

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
|    0|   8|      8|   8|     0|
+-----+----+-------+----+------+



In [42]:
data.filter(data.text.isNull()).show()

+--------------------+----+-------+----+------+
|               title|text|subject|date|target|
+--------------------+----+-------+----+------+
|                   "|NULL|   NULL|NULL|     1|
|Ex-GOP Congressma...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|WATCH: Democratic...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|Trump Gets STOMPE...|NULL|   NULL|NULL|     1|
|                   "|NULL|   NULL|NULL|     1|
|Donald Trump Gets...|NULL|   NULL|NULL|     1|
+--------------------+----+-------+----+------+



In [43]:
data = drop_missing_text_rows(data)

In [44]:
data.filter(data.text.isNull()).show()

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
+-----+----+-------+----+------+



In [45]:
print_null_rows(data)

+-----+----+-------+----+------+
|title|text|subject|date|target|
+-----+----+-------+----+------+
|    0|   0|      0|   0|     0|
+-----+----+-------+----+------+



# Randomize the order of data rows

In [46]:
data = shuffle_rows(data)
data.show(10)

+--------------------+--------------------+------------+-------------------+------+
|               title|                text|     subject|               date|target|
+--------------------+--------------------+------------+-------------------+------+
|Senate bill would...|CHICAGO (Reuters)...|politicsNews|September 27, 2016 |     0|
|German minister u...|BERLIN (Reuters) ...|   worldnews|  October 14, 2017 |     0|
| Trump Laughably ...|Donald Trump is n...|        News|    January 1, 2017|     1|
|Pope Francis plan...|ROME (Reuters) - ...|   worldnews| September 6, 2017 |     0|
|Trump says popula...|WASHINGTON (Reute...|politicsNews|  October 23, 2017 |     0|
|U.S. House Republ...|WASHINGTON (Reute...|politicsNews|   January 3, 2017 |     0|
|There’s Something...|21st Century Wire...| Middle-east|     March 21, 2016|     1|
| John McCain’s Op...|This is why John ...|        News|      June 18, 2016|     1|
|Swiss ready to me...|BERNE (Reuters) -...|   worldnews| September 4, 2017 |

## Feature Creating

In [47]:
# combine title and text column to make news content
data = data.withColumn('content', concat(data['title'], data['text']))
data.show(3)

+--------------------+--------------------+------------+-------------------+------+--------------------+
|               title|                text|     subject|               date|target|             content|
+--------------------+--------------------+------------+-------------------+------+--------------------+
|Senate bill would...|CHICAGO (Reuters)...|politicsNews|September 27, 2016 |     0|Senate bill would...|
|German minister u...|BERLIN (Reuters) ...|   worldnews|  October 14, 2017 |     0|German minister u...|
| Trump Laughably ...|Donald Trump is n...|        News|    January 1, 2017|     1| Trump Laughably ...|
+--------------------+--------------------+------------+-------------------+------+--------------------+
only showing top 3 rows



In [48]:
data.select("content").show(n = 5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Drop columns

In [49]:
columns_to_drop = ["date", "title", "subject", "text"]
data = data.drop(*columns_to_drop)
data.show(2)

+------+--------------------+
|target|             content|
+------+--------------------+
|     0|Senate bill would...|
|     0|German minister u...|
+------+--------------------+
only showing top 2 rows



# Duplicate Rows

In [50]:
duplicate_rows = data.groupBy(data.columns).count().filter("count > 1")

print("Duplicates", duplicate_rows.count())
print("Total", data.count())
print("Distinct", data.distinct().count())

                                                                                

Duplicates 5305
Total 44899




Distinct 39106


                                                                                

In [51]:
duplicate_rows.show(10)

+------+--------------------+-----+
|target|             content|count|
+------+--------------------+-----+
|     1|JUST IN: SUSPECTE...|    2|
|     1|YOU’LL NEVER GUES...|    2|
|     1|HOUSE INTEL Slaps...|    2|
|     1|FLASHBACK: CNN HO...|    2|
|     1|PRICELESS! WATCH ...|    2|
|     1|CNN’S FAKE NEWS B...|    2|
|     1|WATCH: MEGHAN MCC...|    2|
|     1|“MAXINE WATERS IN...|    2|
|     1|HEY CNN…REMEMBER ...|    2|
|     1|HOW PRESIDENT TRU...|    2|
+------+--------------------+-----+
only showing top 10 rows



                                                                                

In [52]:
data = data.dropDuplicates()
data.count()

                                                                                

39106

# Text Transform 

### 1. Lower Case

In [53]:
# convert to all lower case
data = data.withColumn("content", lower(col("content")))
data.show(5)

+------+--------------------+
|target|             content|
+------+--------------------+
|     1| trump bucks gop ...|
|     1| australia prime ...|
|     1| mccain f*cked ov...|
|     1| american psychoa...|
|     1| senate gives tru...|
+------+--------------------+
only showing top 5 rows



                                                                                

### 2. Remove Special Characters 
- like ! @

In [54]:
data = data.withColumn("content", regexp_replace("content", "[^a-zA-Z0-9\\s]", ""))
data.show(5)

+------+--------------------+
|target|             content|
+------+--------------------+
|     1| trump bucks gop ...|
|     1| australia prime ...|
|     1| mccain fcked ove...|
|     1| american psychoa...|
|     1| senate gives tru...|
+------+--------------------+
only showing top 5 rows



### 3. Remove Stopwords

In [55]:
remove_stop_words_udf = udf(remove_stop_words, StringType())
data = data.withColumn("content", remove_stop_words_udf("content"))
data.show(10)

[Stage 184:>                                                        (0 + 1) / 1]

+------+--------------------+
|target|             content|
+------+--------------------+
|     1|comes iran obama ...|
|     1|obama something s...|
|     1|obama uses speech...|
|     0|obama nominate ac...|
|     1|tim kaine reveals...|
|     0|myanmar says secu...|
|     0|debate protesters...|
|     1|mainstream media ...|
|     0|house speaker rya...|
|     0|mccain calls trum...|
+------+--------------------+
only showing top 10 rows



                                                                                

### 4. Stemming words

In [56]:
stem_words_udf = udf(stem_words, StringType())
data = data.withColumn("content", stem_words_udf("content"))
data.show(5)



+------+--------------------+
|target|             content|
+------+--------------------+
|     1|come iran obama c...|
|     1|obama someth say ...|
|     1|obama use speech ...|
|     0|obama nomin act e...|
|     1|tim kain reveal w...|
+------+--------------------+
only showing top 5 rows



                                                                                

## Train Test Split

In [57]:
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=100)

In [58]:
# print(train_data.count())
# print(test_data.count())

In [59]:
feature_columns = ["content"]
target_column = "target"

X_train = train_data.select(*feature_columns)
X_test = test_data.select(*feature_columns)

y_train = train_data.select(target_column)
y_test = test_data.select(target_column)

In [60]:
# print(X_train.count())
# print(y_train.count())

# print(X_test.count())
# print(y_test.count())

### Count vectorization
- count how many times each word show up in the text - frequency count
-  transforms the count into DTM(Document Term Matrix)
- Matices are store as sparse matrix to save space, as many values are 0
- Bag of Words

### TF-IDF 
- Term Frequency - Inverse Document Frequency
- address issue with document frequency
- instead of filling DMT with word frequency count, it calculates TD-IDF
- TF: raw count of a term in the document
- but words that are common accross documents? IDF !
- IDF: diminishes the weight of term that occur frequently in the document set and increases the weight of term that occess rarely. 

### TfidfVectorizer
- does both: create bag of word and convert to tfidf

### Tokenization
- split the content into words array

In [61]:
tokenizer = Tokenizer(inputCol="content", outputCol="tokenized")
X_train = tokenizer.transform(X_train)
X_train.show(3)

[Stage 198:>                                                        (0 + 1) / 1]

+--------------------+--------------------+
|             content|           tokenized|
+--------------------+--------------------+
|17yearold danish ...|[17yearold, danis...|
|2 hous republican...|[2, hous, republi...|
|460 peopl injur c...|[460, peopl, inju...|
+--------------------+--------------------+
only showing top 3 rows



                                                                                

In [62]:
X_test = tokenizer.transform(X_test)
X_test.show(3)

[Stage 205:>                                                        (0 + 1) / 1]

+--------------------+--------------------+
|             content|           tokenized|
+--------------------+--------------------+
|900 state depart ...|[900, state, depa...|
|act sec chair sig...|[act, sec, chair,...|
|ahead debat trump...|[ahead, debat, tr...|
+--------------------+--------------------+
only showing top 3 rows



                                                                                

### HashingTf 
- for term frequencies

In [63]:
hashingTf = HashingTF(inputCol="tokenized", outputCol="tfhashed", numFeatures=100)
X_train = hashingTf.transform(X_train)
X_train.show(10)

[Stage 212:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------------+
|             content|           tokenized|            tfhashed|
+--------------------+--------------------+--------------------+
|17yearold danish ...|[17yearold, danis...|(100,[0,1,8,14,15...|
|2 hous republican...|[2, hous, republi...|(100,[2,4,5,11,13...|
|460 peopl injur c...|[460, peopl, inju...|(100,[5,6,8,9,12,...|
|92 million sign o...|[92, million, sig...|(100,[1,2,4,5,6,7...|
|abadi defend role...|[abadi, defend, r...|(100,[0,1,2,3,4,5...|
|abandon philippin...|[abandon, philipp...|(100,[0,1,2,3,4,5...|
|abe optimist posi...|[abe, optimist, p...|(100,[1,3,8,11,12...|
|abe trump agre ra...|[abe, trump, agre...|(100,[3,5,7,11,12...|
|ackman valeant pl...|[ackman, valeant,...|(100,[0,1,2,3,4,5...|
|activist ire rise...|[activist, ire, r...|(100,[0,1,2,3,4,5...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



                                                                                

In [64]:
X_test = hashingTf.transform(X_test)
X_test.show(3)

[Stage 219:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------------+
|             content|           tokenized|            tfhashed|
+--------------------+--------------------+--------------------+
|900 state depart ...|[900, state, depa...|(100,[8,10,12,16,...|
|act sec chair sig...|[act, sec, chair,...|(100,[0,1,2,3,4,5...|
|ahead debat trump...|[ahead, debat, tr...|(100,[1,2,3,4,6,8...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



                                                                                



### IDF
- for inverse document frequency



In [65]:
idf = IDF(inputCol="tfhashed", outputCol="features")
idfModel = idf.fit(X_train)
X_train = idfModel.transform(X_train)
X_train.show(3)

[Stage 234:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+
|             content|           tokenized|            tfhashed|            features|
+--------------------+--------------------+--------------------+--------------------+
|17yearold danish ...|[17yearold, danis...|(100,[0,1,8,14,15...|(100,[0,1,8,14,15...|
|2 hous republican...|[2, hous, republi...|(100,[2,4,5,11,13...|(100,[2,4,5,11,13...|
|460 peopl injur c...|[460, peopl, inju...|(100,[5,6,8,9,12,...|(100,[5,6,8,9,12,...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



                                                                                

In [66]:
X_test = idfModel.transform(X_test)
X_test.show(3)

[Stage 241:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+
|             content|           tokenized|            tfhashed|            features|
+--------------------+--------------------+--------------------+--------------------+
|900 state depart ...|[900, state, depa...|(100,[8,10,12,16,...|(100,[8,10,12,16,...|
|act sec chair sig...|[act, sec, chair,...|(100,[0,1,2,3,4,5...|(100,[0,1,2,3,4,5...|
|ahead debat trump...|[ahead, debat, tr...|(100,[1,2,3,4,6,8...|(100,[1,2,3,4,6,8...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



                                                                                

In [67]:
X_test = X_test.select("features")
X_test.show(1)

[Stage 248:>                                                        (0 + 1) / 1]

+--------------------+
|            features|
+--------------------+
|(100,[8,10,12,16,...|
+--------------------+
only showing top 1 row



                                                                                

In [68]:
X_train = X_train.select("features")
X_train.show(1)

[Stage 255:>                                                        (0 + 1) / 1]

+--------------------+
|            features|
+--------------------+
|(100,[0,1,8,14,15...|
+--------------------+
only showing top 1 row



                                                                                