In [1]:
import os
import requests
import codecs
import re
import nltk
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from tqdm import tqdm

In [2]:
from pyspark.sql.types import StructField, StructType

In [3]:
# Make sure the output directory exists. exist_ok=True keeps the cell
# idempotent on re-runs, and makedirs also creates a missing parent ./data
# (plain os.mkdir would raise FileNotFoundError in that case).
os.makedirs("./data/clean", exist_ok=True)

In [4]:
# Spark configuration: application name 'DataClean', local master using all
# available cores ('local[*]').
sparkConf = SparkConf().setAppName('DataClean').setMaster('local[*]')
# Reuse the running SparkContext when one exists. Calling SparkContext(...)
# a second time in the same kernel raises, which breaks re-running this cell.
sc = SparkContext.getOrCreate(conf=sparkConf)

In [5]:
# Obtain the session through the builder so that re-running this cell reuses
# the existing session (and the SparkContext `sc`) instead of constructing a
# new SparkSession object each time.
spark = SparkSession.builder.config(conf=sc.getConf()).getOrCreate()

In [6]:
# Load the stop-word list: one word per line, surrounding whitespace removed.
# - `with` guarantees the file is closed even if reading fails.
# - The encoding is explicit instead of depending on the platform default;
#   assumes the file is UTF-8 (a plain ASCII word list qualifies) - TODO confirm.
# - A frozenset gives constant-time membership tests; the collection is probed
#   once per token in cut_and_filter.
with open("./data/stopwords.txt", encoding="utf-8") as f:
    stopwords = frozenset(word.strip() for word in f)

In [7]:
def cut_and_filter(sentence):
    """Normalize a sentence, tokenize it, and drop stop words.

    Every character that is neither a regex word character nor a space is
    removed, the remainder is lower-cased and tokenized with
    ``nltk.word_tokenize``, and tokens present in the module-level
    ``stopwords`` collection are discarded.

    Args:
        sentence: Raw sentence text (a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Raw string literal: '\w' inside a plain string is an invalid escape
    # sequence and produces a warning on recent Python versions.
    normalized = re.sub(r'[^\w ]', '', sentence).lower()
    tokens = nltk.word_tokenize(normalized)
    return " ".join(token for token in tokens if token not in stopwords)

In [8]:
def cut(sentence):
    """Normalize and tokenize a sentence, keeping stop words.

    Every character that is neither a regex word character nor a space is
    removed, the remainder is lower-cased and tokenized with
    ``nltk.word_tokenize``.

    Args:
        sentence: Raw sentence text (a ``str``).

    Returns:
        The tokens joined by single spaces.
    """
    # Raw string literal: '\w' inside a plain string is an invalid escape
    # sequence and produces a warning on recent Python versions.
    normalized = re.sub(r'[^\w ]', '', sentence).lower()
    return " ".join(nltk.word_tokenize(normalized))

In [9]:
def process_columns(row):
    """Build a cleaned Row from an input row.

    ``genre`` is converted with ``str``; ``sentence1`` and ``sentence2`` are
    each passed through ``cut_and_filter``.

    Args:
        row: Object exposing ``genre``, ``sentence1`` and ``sentence2``
            attributes.

    Returns:
        A ``Row`` with fields ``genre``, ``sentence1`` and ``sentence2``.
    """
    genre = str(row.genre)
    cleaned = {
        column: cut_and_filter(getattr(row, column))
        for column in ("sentence1", "sentence2")
    }
    return Row(genre=genre, **cleaned)

In [10]:
def process_columns_without_filter(row):
    """Build a tokenized Row from an input row, keeping stop words.

    ``genre`` is converted with ``str``; ``sentence1`` and ``sentence2`` are
    each passed through ``cut``.

    Args:
        row: Object exposing ``genre``, ``sentence1`` and ``sentence2``
            attributes.

    Returns:
        A ``Row`` with fields ``genre``, ``sentence1`` and ``sentence2``.
    """
    genre = str(row.genre)
    tokenized = {
        column: cut(getattr(row, column))
        for column in ("sentence1", "sentence2")
    }
    return Row(genre=genre, **tokenized)

In [11]:
# Clean every .tsv file under ./data/MNLI/ and write the result, as a
# tab-separated CSV with header, to ./data/clean/<name>.
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing order (and the progress bar) deterministic.
filenames = sorted(name for name in os.listdir("./data/MNLI/") if name.endswith(".tsv"))

# The output schema is identical for every file, so build it once instead of
# on each loop iteration.
schema = StructType([
    StructField('genre', StringType()),
    StructField('sentence1', StringType()),
    StructField('sentence2', StringType())])

with tqdm(filenames, total=len(filenames)) as t:
    for filename in t:
        t.set_postfix_str("processing with {}".format(filename))
        # splitext strips only the final ".tsv"; split(".")[0] would also cut
        # any earlier dotted part, so "a.b.tsv" and "a.c.tsv" would both map
        # to the same output directory.
        file = os.path.splitext(filename)[0]
        df = spark.read.option("header", "true").csv("./data/MNLI/{}".format(filename), sep="\t")
        # Note: dropna/drop_duplicates run over all input columns, before the
        # projection down to the three columns that are kept.
        df_filter = df.dropna().drop_duplicates().select(["genre", "sentence1", "sentence2"])
        # Tokenize and remove stop words row by row, then flatten each Row to
        # a plain tuple in schema order.
        temp_rdd = df_filter.rdd.map(process_columns).map(
            lambda x: (x.genre, x.sentence1, x.sentence2)
        )
        df_new = spark.createDataFrame(temp_rdd, schema=schema)
        df_new.write.csv("./data/clean/{}".format(file), mode="overwrite", sep="\t", header=True)

100%|██████████| 5/5 [00:33<00:00,  6.67s/it, processing with train.tsv]          


In [12]:
# Export a tokenized copy of the training split that keeps stop words,
# written to ./data/clean/nofilter_<name>.
filename = "train.tsv"
file = filename.split(".")[0]

# Read the tab-separated input with a header row.
df = spark.read.option("header", "true").csv("./data/MNLI/{}".format(filename), sep="\t")

# Drop rows with nulls and exact duplicates, then keep the three columns used.
df_filter = (
    df.dropna()
    .drop_duplicates()
    .select(["genre", "sentence1", "sentence2"])
)

# Tokenize each row (no stop-word removal) and flatten it to a tuple.
temp_rdd = (
    df_filter.rdd
    .map(lambda x: process_columns_without_filter(x))
    .map(lambda x: (x.genre, x.sentence1, x.sentence2))
)

schema = StructType(
    [StructField(column, StringType()) for column in ('genre', 'sentence1', 'sentence2')]
)

df_new = spark.createDataFrame(temp_rdd, schema=schema)
df_new.write.csv(
    "./data/clean/nofilter_{}".format(file),
    mode="overwrite",
    sep="\t",
    header=True,
)