In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, DoubleType
import langid
import preprocessor as pp

In [23]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
import string
import re

In [14]:
#check if string is blank
def check_blanks(data_str):
    is_blank = str(data_str.isspace())
    return is_blank

In [32]:
#check if language of text is english or not
def check_lang(data_str):
    predict_lang = langid.classify(data_str)
    if predict_lang[1] >= .8:
        language = predict_lang[0]
    else:
        language = 'NA'
    return language

In [16]:
#remove unwanted chars
def remove_features(data_str):
    # compile regex
    url_re = re.compile('https?://(www.)?\w+\.\w+(/\w+)*/?')
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))
    num_re = re.compile('(\\d+)')
    mention_re = re.compile('@(\w+)')
    alpha_num_re = re.compile("^[a-z0-9_.]+$")
    # convert to lowercase
    data_str = data_str.lower()
    # remove hyperlinks
    data_str = url_re.sub(' ', data_str)
    # remove @mentions
    data_str = mention_re.sub(' ', data_str)
    # remove puncuation
    data_str = punc_re.sub(' ', data_str)
    # remove numeric 'words'
    data_str = num_re.sub(' ', data_str)
    # remove non a-z 0-9 characters and words shorter than 3 characters
    list_pos = 0
    cleaned_str = ''
    for word in data_str.split():
        if list_pos == 0:
            if alpha_num_re.match(word) and len(word) > 2:
                cleaned_str = word
            else:
                cleaned_str = ' '
        else:
            if alpha_num_re.match(word) and len(word) > 2:
                cleaned_str = cleaned_str + ' ' + word
            else:
                cleaned_str += ' '
        list_pos += 1
    return cleaned_str

In [17]:
#stop words removal
def remove_stops(data_str):
    # expects a string
    stops = set(stopwords.words("english"))
    list_pos = 0
    cleaned_str = ''
    text = data_str.split()
    for word in text:
        if word not in stops:
            # rebuild cleaned_str
            if list_pos == 0:
                cleaned_str = word
            else:
                cleaned_str = cleaned_str + ' ' + word
            list_pos += 1
    return cleaned_str

In [18]:
#tagging text
def tag_and_remove(data_str):
    cleaned_str = ' '
    # noun tags
    nn_tags = ['NN', 'NNP', 'NNP', 'NNPS', 'NNS']
    # adjectives
    jj_tags = ['JJ', 'JJR', 'JJS']
    # verbs
    vb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    nltk_tags = nn_tags + jj_tags + vb_tags

    # break string into 'words'
    text = data_str.split()

    # tag the text and keep only those with the right tags
    tagged_text = pos_tag(text)
    for tagged_word in tagged_text:
        if tagged_word[1] in nltk_tags:
            cleaned_str += tagged_word[0] + ' '

    return cleaned_str

In [19]:
#lemmatization
def lemmatize(data_str):
    # expects a string
    list_pos = 0
    cleaned_str = ''
    lmtzr = WordNetLemmatizer()
    text = data_str.split()
    tagged_words = pos_tag(text)
    for word in tagged_words:
        if 'v' in word[1].lower():
            lemma = lmtzr.lemmatize(word[0], pos='v')
        else:
            lemma = lmtzr.lemmatize(word[0], pos='n')
        if list_pos == 0:
            cleaned_str = lemma
        else:
            cleaned_str = cleaned_str + ' ' + lemma
        list_pos += 1
    return cleaned_str

In [33]:
# Register all the functions in Preproc with Spark Context
check_lang_udf = udf(check_lang, StringType())
remove_stops_udf = udf(remove_stops, StringType())
remove_features_udf = udf(remove_features, StringType())
tag_and_remove_udf = udf(tag_and_remove, StringType())
lemmatize_udf = udf(lemmatize, StringType())
check_blanks_udf = udf(check_blanks, StringType())

In [24]:
import pyspark
from pyspark.sql import SQLContext

# create spark contexts
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)

In [34]:
# Load a text file and convert each line to a Row.
data_rdd = sc.textFile("../data/raw_data.txt")
parts_rdd = data_rdd.map(lambda l: l.split("\t"))

# Filter bad rows out
garantee_col_rdd = parts_rdd.filter(lambda l: len(l) == 3)
typed_rdd = garantee_col_rdd.map(lambda p: (p[0], p[1], float(p[2])))

#Create DataFrame
data_df = sqlContext.createDataFrame(typed_rdd, ["text", "id", "label"])

# get the raw columns
raw_cols = data_df.columns

#data_df.show()
data_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- id: string (nullable = true)
 |-- label: double (nullable = true)



In [35]:
data_df.show(5)

+--------------------+------------------+-----+
|                text|                id|label|
+--------------------+------------------+-----+
|Fresh install of ...|        1018769417|  1.0|
|Well. Now I know ...|       10284216536|  1.0|
|"Literally six we...|       10298589026|  1.0|
|Mitsubishi i MiEV...|109017669432377344|  1.0|
|'Cheap Eats in SL...|109642968603963392|  1.0|
+--------------------+------------------+-----+
only showing top 5 rows



In [36]:
lang_df = data_df.withColumn("lang", check_lang_udf(data_df["text"]))
en_df = lang_df.filter(lang_df["lang"] == "en")
en_df.show(4)

+----+---+-----+----+
|text| id|label|lang|
+----+---+-----+----+
+----+---+-----+----+



In [37]:
lang_df.show(5)

+--------------------+------------------+-----+----+
|                text|                id|label|lang|
+--------------------+------------------+-----+----+
|Fresh install of ...|        1018769417|  1.0|  NA|
|Well. Now I know ...|       10284216536|  1.0|  NA|
|"Literally six we...|       10298589026|  1.0|  NA|
|Mitsubishi i MiEV...|109017669432377344|  1.0|  NA|
|'Cheap Eats in SL...|109642968603963392|  1.0|  NA|
+--------------------+------------------+-----+----+
only showing top 5 rows



In [39]:
rm_stops_df = data_df.select(raw_cols)\
                   .withColumn("stop_text", remove_stops_udf(en_df["text"]))
rm_stops_df.show(4)

+--------------------+------------------+-----+--------------------+
|                text|                id|label|           stop_text|
+--------------------+------------------+-----+--------------------+
|Fresh install of ...|        1018769417|  1.0|Fresh install XP ...|
|Well. Now I know ...|       10284216536|  1.0|Well. Now I know ...|
|"Literally six we...|       10298589026|  1.0|"Literally six we...|
|Mitsubishi i MiEV...|109017669432377344|  1.0|Mitsubishi MiEV -...|
+--------------------+------------------+-----+--------------------+
only showing top 4 rows



In [40]:
rm_features_df = rm_stops_df.select(raw_cols+["stop_text"])\
                            .withColumn("feat_text", \
                            remove_features_udf(rm_stops_df["stop_text"]))
rm_features_df.show(4)

+--------------------+------------------+-----+--------------------+--------------------+
|                text|                id|label|           stop_text|           feat_text|
+--------------------+------------------+-----+--------------------+--------------------+
|Fresh install of ...|        1018769417|  1.0|Fresh install XP ...|fresh install  ne...|
|Well. Now I know ...|       10284216536|  1.0|Well. Now I know ...|well now  know   ...|
|"Literally six we...|       10298589026|  1.0|"Literally six we...|literally six wee...|
|Mitsubishi i MiEV...|109017669432377344|  1.0|Mitsubishi MiEV -...|mitsubishi miev w...|
+--------------------+------------------+-----+--------------------+--------------------+
only showing top 4 rows



In [41]:
tagged_df = rm_features_df.select(raw_cols+["feat_text"]) \
                          .withColumn("tagged_text", \
                           tag_and_remove_udf(rm_features_df.feat_text))

tagged_df.show(4)

+--------------------+------------------+-----+--------------------+--------------------+
|                text|                id|label|           feat_text|         tagged_text|
+--------------------+------------------+-----+--------------------+--------------------+
|Fresh install of ...|        1018769417|  1.0|fresh install  ne...| fresh install ne...|
|Well. Now I know ...|       10284216536|  1.0|well now  know   ...| know want knives...|
|"Literally six we...|       10298589026|  1.0|literally six wee...| weeks take ssc c...|
|Mitsubishi i MiEV...|109017669432377344|  1.0|mitsubishi miev w...| mitsubishi miev ...|
+--------------------+------------------+-----+--------------------+--------------------+
only showing top 4 rows



In [42]:
lemm_df = tagged_df.select(raw_cols+["tagged_text"]) \
                   .withColumn("lemm_text", lemmatize_udf(tagged_df["tagged_text"]))
lemm_df.show(4)

+--------------------+------------------+-----+--------------------+--------------------+
|                text|                id|label|         tagged_text|           lemm_text|
+--------------------+------------------+-----+--------------------+--------------------+
|Fresh install of ...|        1018769417|  1.0| fresh install ne...|fresh install new...|
|Well. Now I know ...|       10284216536|  1.0| know want knives...|know want knife c...|
|"Literally six we...|       10298589026|  1.0| weeks take ssc c...|week take ssc cha...|
|Mitsubishi i MiEV...|109017669432377344|  1.0| mitsubishi miev ...|mitsubishi miev w...|
+--------------------+------------------+-----+--------------------+--------------------+
only showing top 4 rows



In [43]:
#remove blank rows and duplicates
check_blanks_df = lemm_df.select(raw_cols+["lemm_text"])\
                             .withColumn("is_blank", check_blanks_udf(lemm_df["lemm_text"]))
# remove blanks
no_blanks_df = check_blanks_df.filter(check_blanks_df["is_blank"] == "False")

# drop duplicates
dedup_df = no_blanks_df.dropDuplicates(['text', 'label'])

dedup_df.show(4)

+--------------------+-----------+-----+--------------------+--------+
|                text|         id|label|           lemm_text|is_blank|
+--------------------+-----------+-----+--------------------+--------+
|Fresh install of ...| 1018769417|  1.0|fresh install new...|   False|
|"Did I really nee...|12358025545|  1.0|do need learn bou...|   False|
|"Literally six we...|10298589026|  1.0|week take ssc cha...|   False|
|hi all - i'm goin...| 1208319583|  1.0|go tweet thing lo...|   False|
+--------------------+-----------+-----+--------------------+--------+
only showing top 4 rows



In [44]:
#add unique id
from pyspark.sql.functions import monotonically_increasing_id
# Create Unique ID
dedup_df = dedup_df.withColumn("uid", monotonically_increasing_id())
dedup_df.show(4)

+--------------------+-----------+-----+--------------------+--------+------------+
|                text|         id|label|           lemm_text|is_blank|         uid|
+--------------------+-----------+-----+--------------------+--------+------------+
|Fresh install of ...| 1018769417|  1.0|fresh install new...|   False|231928233984|
|"Did I really nee...|12358025545|  1.0|do need learn bou...|   False|343597383680|
|"Literally six we...|10298589026|  1.0|week take ssc cha...|   False|695784701952|
|hi all - i'm goin...| 1208319583|  1.0|go tweet thing lo...|   False|927712935936|
+--------------------+-----------+-----+--------------------+--------+------------+
only showing top 4 rows



In [45]:
#creating final data
data = dedup_df.select('uid','id', 'text','label')
data.show(4)

+------------+-----------+--------------------+-----+
|         uid|         id|                text|label|
+------------+-----------+--------------------+-----+
|231928233984| 1018769417|Fresh install of ...|  1.0|
|343597383680|12358025545|"Did I really nee...|  1.0|
|695784701952|10298589026|"Literally six we...|  1.0|
|927712935936| 1208319583|hi all - i'm goin...|  1.0|
+------------+-----------+--------------------+-----+
only showing top 4 rows



In [46]:
# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = data.randomSplit([0.6, 0.4])

In [47]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.feature import CountVectorizer

# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and nb.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
# vectorizer = CountVectorizer(inputCol= "words", outputCol="rawFeatures")
idf = IDF(minDocFreq=3, inputCol="rawFeatures", outputCol="features")

# Naive Bayes model
nb = NaiveBayes()

# Pipeline Architecture
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

In [48]:
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("text", "label", "prediction").show(5,False)

+-------------------------------------------------------------------------------------+-----+----------+
|text                                                                                 |label|prediction|
+-------------------------------------------------------------------------------------+-----+----------+
|Fresh install of XP on new computer. Sweet relief! fuck vista                        |1.0  |0.0       |
|'Cheap Eats in SLP' - http://t.co/4w8gRp7                                            |1.0  |0.0       |
|Well. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl|1.0  |0.0       |
|Teenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW         |1.0  |0.0       |
+-------------------------------------------------------------------------------------+-----+----------+



In [49]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.0

# References:
    
https://runawayhorse001.github.io/LearningApacheSpark/textmining.html