# N-Grams Feature Generation

In [None]:
import sys
sys.path.append("..")
import glob
import os
from helpers.data_prep_and_print import print_df
from pyspark.sql.functions import lit, col, startswith
from pyspark.sql.types import StructType, StringType
from pyspark.ml.feature import NGram, Tokenizer
from pyspark.sql import SparkSession

In [None]:
# Obtain the SparkSession for this notebook (an already-active session is
# reused, otherwise a new one is created) under the app name "N-Gram Creation".
session_builder = SparkSession.builder.appName("N-Gram Creation")
spark = session_builder.getOrCreate()

# Only surface ERROR-level Spark log messages to keep the cell output readable.
spark.sparkContext.setLogLevel("ERROR")

## Read in the Data Files

In [None]:
# Every input line is parsed into two nullable string columns: a label and the
# sentence that label belongs to.
schema = (
    StructType()
    .add("label", StringType(), True)
    .add("sentence", StringType(), True)
)

input_folder = "../data/labeled_articles/"
# Build the glob pattern from input_folder so the path is defined in one place.
# Sorting makes the union order (and therefore the preview below) reproducible,
# since glob returns matches in arbitrary order.
file_list = sorted(glob.glob(os.path.join(input_folder, "*.txt")))
if not file_list:
    # Fail with an explicit message instead of an IndexError on dfs[0] below.
    raise FileNotFoundError(f"No .txt files found in {input_folder!r}")

dfs = []
for filename in file_list:
    # Read ONLY the current file. Passing the whole file_list here would load
    # every file on every iteration, duplicating all rows once per file and
    # tagging them with the wrong filename.
    temp_df = (
        spark.read.option("header", "false")
        .option("delimiter", "  ")
        .schema(schema)
        .csv(filename)
    )
    # Record which file each row came from (base name only, no directory).
    temp_df = temp_df.withColumn("filename", lit(os.path.basename(filename)))
    dfs.append(temp_df)

# Stack the per-file DataFrames into a single DataFrame.
union_df = dfs[0]
for df in dfs[1:]:
    union_df = union_df.union(df)

# Drop rows whose label starts with "###"
# (presumably header/marker lines in the article files -- confirm against the data).
union_df = union_df.where(~(col("label").startswith("###")))

# Sanity checks: row count, schema, and a short preview via the project helper.
print(union_df.count())
union_df.printSchema()
print_df(union_df, 5)

## Build N-Grams

In [None]:
# Stage 1: turn each "sentence" string into a list of tokens in a "words" column.
tokenizer = Tokenizer().setInputCol("sentence").setOutputCol("words")
tokenized_df = tokenizer.transform(union_df)

# Stage 2: build 2-grams (n=2) from the "words" column into an "ngrams" column.
ngram = NGram().setN(2).setInputCol("words").setOutputCol("ngrams")
ngram_df = ngram.transform(tokenized_df)

# Preview the result with the project helper.
print_df(ngram_df, 5)

In [None]:
# Shut down the SparkSession now that the notebook's work is done.
spark.stop()