In [3]:
# Importing the relevant libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import desc
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
MAX_MEMORY = "5G"
spark = SparkSession.builder.appName('twitter').master("local[*]").config("spark.executor.memoryOverhead",MAX_MEMORY).config("spark.executor.memory", MAX_MEMORY).config("spark.driver.memory", MAX_MEMORY).getOrCreate()



In [5]:
#Remember to add the path to your Amazon s3 bucket where you uploaded your dataset
df = spark.read \
	.format("csv") \
 	.option("header","true") \
 	.option("mode", "DROPMALFORMED") \
 	.option("inferSchema", "true") \
 	.load("s3a://...your bucket name.../..your file name.csv")

In [6]:
# count after drop malformed records
print("Rows count: {0}".format(df.count()))

Rows count: 17426


In [7]:
df.show()

+---------+-------+-----------+------------------+-----------------+-------+-----------------+----------+---------------------+--------------+--------------------+----------+-----------+----------+---------------+---------------+--------------------+-------------+-------------+--------------------+-----------+-----------+--------------+----------+--------------------+--------------------+
| _unit_id|_golden|_unit_state|_trusted_judgments|_last_judgment_at| gender|gender:confidence|profile_yn|profile_yn:confidence|       created|         description|fav_number|gender_gold|link_color|           name|profile_yn_gold|        profileimage|retweet_count|sidebar_color|                text|tweet_coord|tweet_count| tweet_created|  tweet_id|      tweet_location|       user_timezone|
+---------+-------+-----------+------------------+-----------------+-------+-----------------+----------+---------------------+--------------+--------------------+----------+-----------+----------+---------------+---

In [8]:
df = df.filter(col("gender").isin(['male','female','brand'])).select("_unit_id","gender","description")

In [9]:
df.show()
# count after drop malformed records
print("Rows count after filtering unknown gender: {0}".format(df.count()))

+---------+------+--------------------+
| _unit_id|gender|         description|
+---------+------+--------------------+
|815719226|  male|i sing my own rhy...|
|815719227|  male|I'm the author of...|
|815719228|  male|louis whining and...|
|815719229|  male|Mobile guy.  49er...|
|815719230|female|Ricky Wilson The ...|
|815719231|female|  you don't know me.|
|815719232| brand|A global marketpl...|
|815719233|  male|The secret of get...|
|815719234|female|Pll Fan // Crazy ...|
|815719235|female|Renaissance art h...|
|815719236| brand|Clean food that t...|
|815719237| brand|highly extraordin...|
|815719238|female|Senior '16 . XI-X...|
|815719239| brand|Come join the fas...|
|815719240|female|im just here for ...|
|815719241|female|                null|
|815719242|female|           JMKM�_ҕ��|
|815719243|  male|Over enthusiastic...|
|815719244|  male|                null|
|815719246|female|Artisan specializ...|
+---------+------+--------------------+
only showing top 20 rows

Rows count aft

In [10]:
print("Rows count per gender")
df.groupby("gender").count().show()

Rows count per gender
+------+-----+
|gender|count|
+------+-----+
|female| 5867|
| brand| 5175|
|  male| 5370|
+------+-----+



In [11]:
# dropping the rows if description is null
final_df = df.filter(col("description").isNotNull())

In [12]:
print("**** RAW dataframe ****")
final_df.show()
# count after drop malformed records
print("Rows count after filtering invalid description: {0}".format(final_df.count()))

**** RAW dataframe ****
+---------+------+--------------------+
| _unit_id|gender|         description|
+---------+------+--------------------+
|815719226|  male|i sing my own rhy...|
|815719227|  male|I'm the author of...|
|815719228|  male|louis whining and...|
|815719229|  male|Mobile guy.  49er...|
|815719230|female|Ricky Wilson The ...|
|815719231|female|  you don't know me.|
|815719232| brand|A global marketpl...|
|815719233|  male|The secret of get...|
|815719234|female|Pll Fan // Crazy ...|
|815719235|female|Renaissance art h...|
|815719236| brand|Clean food that t...|
|815719237| brand|highly extraordin...|
|815719238|female|Senior '16 . XI-X...|
|815719239| brand|Come join the fas...|
|815719240|female|im just here for ...|
|815719242|female|           JMKM�_ҕ��|
|815719243|  male|Over enthusiastic...|
|815719246|female|Artisan specializ...|
|815719247|female|He bled and died ...|
|815719248|female|        union j xxxx|
+---------+------+--------------------+
only showing top

In [13]:
# add indexer to transfrom genders into numerical values. Here String Indexer is used
indexer = StringIndexer(inputCol="gender", outputCol="labels").fit(final_df)

In [14]:
indexed = indexer.transform(final_df)

In [15]:
print("**** RAW dataframe with String indexer ****")
indexed.show()

**** RAW dataframe with String indexer ****
+---------+------+--------------------+------+
| _unit_id|gender|         description|labels|
+---------+------+--------------------+------+
|815719226|  male|i sing my own rhy...|   1.0|
|815719227|  male|I'm the author of...|   1.0|
|815719228|  male|louis whining and...|   1.0|
|815719229|  male|Mobile guy.  49er...|   1.0|
|815719230|female|Ricky Wilson The ...|   0.0|
|815719231|female|  you don't know me.|   0.0|
|815719232| brand|A global marketpl...|   2.0|
|815719233|  male|The secret of get...|   1.0|
|815719234|female|Pll Fan // Crazy ...|   0.0|
|815719235|female|Renaissance art h...|   0.0|
|815719236| brand|Clean food that t...|   2.0|
|815719237| brand|highly extraordin...|   2.0|
|815719238|female|Senior '16 . XI-X...|   0.0|
|815719239| brand|Come join the fas...|   2.0|
|815719240|female|im just here for ...|   0.0|
|815719242|female|           JMKM�_ҕ��|   0.0|
|815719243|  male|Over enthusiastic...|   1.0|
|815719246|femal

In [16]:
TOTAL = indexed.count()
TRAIN = 0.75
TEST = 0.25

In [17]:
train = indexed.limit(int(TOTAL*TRAIN))
test = indexed.orderBy(desc("_unit_id")).limit(int(TOTAL*TEST))

In [18]:
print("Total data count {0}".format(TOTAL))
print("Training data count {0}".format(train.count()))
print("Test data count {0}".format(test.count()))

Total data count 13431
Training data count 10073
Test data count 3357


In [19]:
# RegexTokenizer allows more advanced tokenization based on regular expression (regex) matching
regexTokenizer = RegexTokenizer(inputCol="description", outputCol="words", pattern="\\W")

In [20]:
regexTokenized = regexTokenizer.transform(train)
print("**** RAW dataframe with tokenized ****")
regexTokenized.show()

**** RAW dataframe with tokenized ****
+---------+------+--------------------+------+--------------------+
| _unit_id|gender|         description|labels|               words|
+---------+------+--------------------+------+--------------------+
|815719226|  male|i sing my own rhy...|   1.0|[i, sing, my, own...|
|815719227|  male|I'm the author of...|   1.0|[i, m, the, autho...|
|815719228|  male|louis whining and...|   1.0|[louis, whining, ...|
|815719229|  male|Mobile guy.  49er...|   1.0|[mobile, guy, 49e...|
|815719230|female|Ricky Wilson The ...|   0.0|[ricky, wilson, t...|
|815719231|female|  you don't know me.|   0.0|[you, don, t, kno...|
|815719232| brand|A global marketpl...|   2.0|[a, global, marke...|
|815719233|  male|The secret of get...|   1.0|[the, secret, of,...|
|815719234|female|Pll Fan // Crazy ...|   0.0|[pll, fan, crazy,...|
|815719235|female|Renaissance art h...|   0.0|[renaissance, art...|
|815719236| brand|Clean food that t...|   2.0|[clean, food, tha...|
|81571923

Stop words are words which should be excluded from the input, typically because the words appear frequently and don’t carry as much meaning.
StopWordsRemover takes as input a sequence of strings (e.g. the output of a Tokenizer) and drops all the stop words from the input sequences.

In [21]:
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

In [22]:
removed = stop_words_remover.transform(regexTokenized)
print("**** RAW dataframe after emoving stop words ****")
removed.show()

**** RAW dataframe after emoving stop words ****
+---------+------+--------------------+------+--------------------+--------------------+
| _unit_id|gender|         description|labels|               words|      filtered_words|
+---------+------+--------------------+------+--------------------+--------------------+
|815719226|  male|i sing my own rhy...|   1.0|[i, sing, my, own...|      [sing, rhythm]|
|815719227|  male|I'm the author of...|   1.0|[i, m, the, autho...|[m, author, novel...|
|815719228|  male|louis whining and...|   1.0|[louis, whining, ...|[louis, whining, ...|
|815719229|  male|Mobile guy.  49er...|   1.0|[mobile, guy, 49e...|[mobile, guy, 49e...|
|815719230|female|Ricky Wilson The ...|   0.0|[ricky, wilson, t...|[ricky, wilson, b...|
|815719231|female|  you don't know me.|   0.0|[you, don, t, kno...|              [know]|
|815719232| brand|A global marketpl...|   2.0|[a, global, marke...|[global, marketpl...|
|815719233|  male|The secret of get...|   1.0|[the, secret, o

HashingTF is a Transformer which takes sets of terms and converts those sets into fixed-length feature vectors. In text processing, a “set of terms” might be a bag of words. HashingTF utilizes the hashing trick. A raw feature is mapped into an index (term) by applying a hash function

In [23]:
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features").setNumFeatures(11000)

In [24]:
ht_df = hashing_tf.transform(removed)
ht_df.show()

+---------+------+--------------------+------+--------------------+--------------------+--------------------+
| _unit_id|gender|         description|labels|               words|      filtered_words|        raw_features|
+---------+------+--------------------+------+--------------------+--------------------+--------------------+
|815719226|  male|i sing my own rhy...|   1.0|[i, sing, my, own...|      [sing, rhythm]|(11000,[8809,9451...|
|815719227|  male|I'm the author of...|   1.0|[i, m, the, autho...|[m, author, novel...|(11000,[873,978,2...|
|815719228|  male|louis whining and...|   1.0|[louis, whining, ...|[louis, whining, ...|(11000,[1000,2314...|
|815719229|  male|Mobile guy.  49er...|   1.0|[mobile, guy, 49e...|[mobile, guy, 49e...|(11000,[464,929,9...|
|815719230|female|Ricky Wilson The ...|   0.0|[ricky, wilson, t...|[ricky, wilson, b...|(11000,[1040,1215...|
|815719231|female|  you don't know me.|   0.0|[you, don, t, kno...|              [know]|(11000,[9779],[1.0])|
|815719232

 IDF is an Estimator which is fit on a dataset and produces an IDFModel. 
 The IDFModel takes feature vectors (generally created from HashingTF or CountVectorizer) and scales each feature. 
 Intuitively, it down-weights features which appear frequently in a corpus.

In [25]:
idf = IDF(inputCol="raw_features", outputCol="features")

In [26]:
idfmodel = idf.fit(ht_df)
idf_df = idfmodel.transform(ht_df)
print("**** RAW dataframe after adding idf and hashingtf ****")
idf_df.show()

**** RAW dataframe after adding idf and hashingtf ****
+---------+------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
| _unit_id|gender|         description|labels|               words|      filtered_words|        raw_features|            features|
+---------+------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|815719226|  male|i sing my own rhy...|   1.0|[i, sing, my, own...|      [sing, rhythm]|(11000,[8809,9451...|(11000,[8809,9451...|
|815719227|  male|I'm the author of...|   1.0|[i, m, the, autho...|[m, author, novel...|(11000,[873,978,2...|(11000,[873,978,2...|
|815719228|  male|louis whining and...|   1.0|[louis, whining, ...|[louis, whining, ...|(11000,[1000,2314...|(11000,[1000,2314...|
|815719229|  male|Mobile guy.  49er...|   1.0|[mobile, guy, 49e...|[mobile, guy, 49e...|(11000,[464,929,9...|(11000,[464,929,9...|
|815719230|female|Ricky Wils

In [27]:
# Train a RandomForest model.
rf = RandomForestClassifier(
		labelCol="labels", 
		featuresCol="features", 
		impurity='gini',
		maxDepth=12,
		numTrees=256,
		featureSubsetStrategy='auto',
		seed=5043)

In [28]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=indexer.labels)

In [29]:
# create pipeline of all the transformations 
pipeline = Pipeline(stages=[regexTokenizer, stop_words_remover, hashing_tf, idf, rf, labelConverter])

In [30]:
# Train model.  This also runs the indexers.
model=pipeline.fit(train)

In [31]:
# Make predictions.
predictions = model.transform(test)

In [32]:

print("**** Prediction on test data with MODEL RandomForestClassifier ****")
predictions.select("_unit_id","gender","description","prediction","predictedLabel").show()

print("**** Accuracy Metrics ****")
evaluator = MulticlassClassificationEvaluator(
    labelCol="labels", predictionCol="prediction")
accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
f1_score = evaluator.setMetricName("f1").evaluate(predictions)
weighted_precesion = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
weighted_recall = evaluator.setMetricName("weightedRecall").evaluate(predictions)


print("Accuracy: {0}".format(accuracy))
print("f1_score: {0}".format(f1_score))
print("weighted_precesion: {0}".format(weighted_precesion))
print("weighted_recall: {0}".format(weighted_recall))

**** Prediction on test data with MODEL RandomForestClassifier ****
+---------+------+--------------------+----------+--------------+
| _unit_id|gender|         description|prediction|predictedLabel|
+---------+------+--------------------+----------+--------------+
|815757985|female|Teamwork makes th...|       0.0|        female|
|815757921|female|Anti-statist; I h...|       0.0|        female|
|815757830|  male|#TeamBarcelona .....|       0.0|        female|
|815757681|  male|Whatever you like...|       0.0|        female|
|815757572|female|                (rp)|       0.0|        female|
|815756767|female|I Love Me...Mysel...|       0.0|        female|
|815756700|  male|Head Chef, Chez B...|       0.0|        female|
|815756642| brand|Reviews of delect...|       0.0|        female|
|815756542| brand|When families go ...|       1.0|          male|
|815756417|  male|Houston Chronicle...|       0.0|        female|
|815756332|female|You can find me w...|       0.0|        female|
|8157562