In [2]:
from pyspark import SparkContext, SparkConf
# Run on YARN with the driver in client deploy mode. The combined
# "yarn-client" master string has been deprecated since Spark 2.0 and is
# rejected by Spark 3.x, so master and deploy mode are configured separately.
conf = (
    SparkConf()
    .setAppName("jupyter_Spark")
    .setMaster("yarn")
    .set("spark.submit.deployMode", "client")
)
# getOrCreate() keeps this cell re-runnable: constructing a second
# SparkContext in the same kernel raises "Cannot run multiple SparkContexts".
# NOTE: if a context already exists, it is returned and `conf` is not applied.
sc = SparkContext.getOrCreate(conf=conf)
sc

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import desc
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Obtain the SparkSession used for all DataFrame operations below.
spark = (
    SparkSession.builder
    .appName('twitter')
    .getOrCreate()
)


In [7]:
# Remember to substitute your own Amazon S3 bucket path and file name below.
# The CSV is read with a header row and inferred column types; the
# DROPMALFORMED mode discards records that cannot be parsed instead of
# failing the whole read.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .option("inferSchema", "true")
    .load("s3a://...your s3 bucket name.../...your file name.csv")
)

In [8]:
# Number of rows in the frame as loaded (the reader runs in DROPMALFORMED mode).
print("Rows count : {0}".format(df.count()))

Rows count : 17426


In [9]:
# Preview the loaded frame with all of its original columns.
df.show()

+---------+-------+-----------+------------------+-----------------+-------+-----------------+----------+---------------------+--------------+--------------------+----------+-----------+----------+---------------+---------------+--------------------+-------------+-------------+--------------------+-----------+-----------+--------------+----------+--------------------+--------------------+
| _unit_id|_golden|_unit_state|_trusted_judgments|_last_judgment_at| gender|gender:confidence|profile_yn|profile_yn:confidence|       created|         description|fav_number|gender_gold|link_color|           name|profile_yn_gold|        profileimage|retweet_count|sidebar_color|                text|tweet_coord|tweet_count| tweet_created|  tweet_id|      tweet_location|       user_timezone|
+---------+-------+-----------+------------------+-----------------+-------+-----------------+----------+---------------------+--------------+--------------------+----------+-----------+----------+---------------+---

In [10]:
# Keep only rows labelled with one of the three known gender classes and
# narrow the frame to the id, the label and the profile description text.
df = (
    df
    .filter(col("gender").isin(['male', 'female', 'brand']))
    .select("_unit_id", "gender", "description")
)

In [11]:
# Preview the three retained columns; null descriptions are still present here.
df.show()

+---------+------+--------------------+
| _unit_id|gender|         description|
+---------+------+--------------------+
|815719226|  male|i sing my own rhy...|
|815719227|  male|I'm the author of...|
|815719228|  male|louis whining and...|
|815719229|  male|Mobile guy.  49er...|
|815719230|female|Ricky Wilson The ...|
|815719231|female|  you don't know me.|
|815719232| brand|A global marketpl...|
|815719233|  male|The secret of get...|
|815719234|female|Pll Fan // Crazy ...|
|815719235|female|Renaissance art h...|
|815719236| brand|Clean food that t...|
|815719237| brand|highly extraordin...|
|815719238|female|Senior '16 . XI-X...|
|815719239| brand|Come join the fas...|
|815719240|female|im just here for ...|
|815719241|female|                null|
|815719242|female|           JMKM�_ҕ��|
|815719243|  male|Over enthusiastic...|
|815719244|  male|                null|
|815719246|female|Artisan specializ...|
+---------+------+--------------------+
only showing top 20 rows



In [12]:
# Row count after keeping only the male/female/brand gender classes
# (malformed CSV records were already dropped when the file was read).
print("Rows count after filtering unknown gender: {0}".format(df.count()))

Rows count after filtering unknown gender: 16412


In [13]:
# Class balance: number of rows for each gender value.
print("Rows count per gender")
df.groupby("gender").count().show()

Rows count per gender
+------+-----+
|gender|count|
+------+-----+
|female| 5867|
| brand| 5175|
|  male| 5370|
+------+-----+



In [14]:
# Discard rows with a null description; the description column is the only
# text that the tokenizer consumes later on.
final_df = df.filter(col("description").isNotNull())

In [15]:
# Preview the frame once rows with a null description have been removed.
print("**** RAW dataframe ****")
final_df.show()
# Row count after dropping null descriptions (this is the description filter,
# not the malformed-record handling done at read time).
print("Rows count after filtering invalid description: {0}".format(final_df.count()))

**** RAW dataframe ****
+---------+------+--------------------+
| _unit_id|gender|         description|
+---------+------+--------------------+
|815719226|  male|i sing my own rhy...|
|815719227|  male|I'm the author of...|
|815719228|  male|louis whining and...|
|815719229|  male|Mobile guy.  49er...|
|815719230|female|Ricky Wilson The ...|
|815719231|female|  you don't know me.|
|815719232| brand|A global marketpl...|
|815719233|  male|The secret of get...|
|815719234|female|Pll Fan // Crazy ...|
|815719235|female|Renaissance art h...|
|815719236| brand|Clean food that t...|
|815719237| brand|highly extraordin...|
|815719238|female|Senior '16 . XI-X...|
|815719239| brand|Come join the fas...|
|815719240|female|im just here for ...|
|815719242|female|           JMKM�_ҕ��|
|815719243|  male|Over enthusiastic...|
|815719246|female|Artisan specializ...|
|815719247|female|He bled and died ...|
|815719248|female|        union j xxxx|
+---------+------+--------------------+
only showing top

In [16]:
# Learn the gender-string -> numeric-index mapping from the cleaned frame.
# The fitted model is kept because its `labels` list is needed later to turn
# predictions back into gender strings.
indexer = (
    StringIndexer(inputCol="gender", outputCol="labels")
    .fit(final_df)
)
# Append the numeric class index as the "labels" column.
indexed = indexer.transform(final_df)

In [17]:
# Preview the frame with the numeric "labels" column added by the indexer.
print("**** RAW dataframe with String indexer ****")
indexed.show()

**** RAW dataframe with String indexer ****
+---------+------+--------------------+------+
| _unit_id|gender|         description|labels|
+---------+------+--------------------+------+
|815719226|  male|i sing my own rhy...|   1.0|
|815719227|  male|I'm the author of...|   1.0|
|815719228|  male|louis whining and...|   1.0|
|815719229|  male|Mobile guy.  49er...|   1.0|
|815719230|female|Ricky Wilson The ...|   0.0|
|815719231|female|  you don't know me.|   0.0|
|815719232| brand|A global marketpl...|   2.0|
|815719233|  male|The secret of get...|   1.0|
|815719234|female|Pll Fan // Crazy ...|   0.0|
|815719235|female|Renaissance art h...|   0.0|
|815719236| brand|Clean food that t...|   2.0|
|815719237| brand|highly extraordin...|   2.0|
|815719238|female|Senior '16 . XI-X...|   0.0|
|815719239| brand|Come join the fas...|   2.0|
|815719240|female|im just here for ...|   0.0|
|815719242|female|           JMKM�_ҕ��|   0.0|
|815719243|  male|Over enthusiastic...|   1.0|
|815719246|femal

In [18]:
# Size of the labelled data set; printed next to the split sizes further down.
TOTAL = indexed.count()
# Fractions passed to randomSplit for the training set and the held-out test set.
TRAIN, TEST = 0.75, 0.25

In [19]:
# Split the labelled rows into training and test sets. A fixed seed is passed
# so the split -- and therefore every count and metric reported further down --
# is reproducible across kernel restarts; without it randomSplit picks a new
# random seed on each run.
train, test = indexed.randomSplit([TRAIN, TEST], seed=42)


In [20]:
# Compare the sizes of the two splits with the total row count. randomSplit
# weights are approximate, so the ratio is close to, not exactly, 75/25.
print("Total data count {0}".format(TOTAL))
print("Training data count {0}".format(train.count()))
print("Test data count {0}".format(test.count()))

Total data count 13431
Training data count 10051
Test data count 3380


In [21]:
# RegexTokenizer tokenizes by regular-expression matching. Here the pattern
# "\W" is used, i.e. the description is cut at every non-word character, so
# "I'm" ends up as the two tokens "i" and "m" (see the preview further down).
regexTokenizer = RegexTokenizer(
    inputCol="description",
    outputCol="words",
    pattern="\\W",
)

In [22]:
# Tokenize the training split so the intermediate result can be inspected.
# This frame only feeds the exploratory cells; the pipeline fitted later runs
# the tokenizer on `train` itself.
regexTokenized = regexTokenizer.transform(train)

In [23]:
# Preview the "words" column produced by the tokenizer.
print("**** RAW dataframe with tokenized ****")
regexTokenized.show()


**** RAW dataframe with tokenized ****
+---------+------+--------------------+------+--------------------+
| _unit_id|gender|         description|labels|               words|
+---------+------+--------------------+------+--------------------+
|815719226|  male|i sing my own rhy...|   1.0|[i, sing, my, own...|
|815719227|  male|I'm the author of...|   1.0|[i, m, the, autho...|
|815719228|  male|louis whining and...|   1.0|[louis, whining, ...|
|815719229|  male|Mobile guy.  49er...|   1.0|[mobile, guy, 49e...|
|815719230|female|Ricky Wilson The ...|   0.0|[ricky, wilson, t...|
|815719233|  male|The secret of get...|   1.0|[the, secret, of,...|
|815719234|female|Pll Fan // Crazy ...|   0.0|[pll, fan, crazy,...|
|815719235|female|Renaissance art h...|   0.0|[renaissance, art...|
|815719236| brand|Clean food that t...|   2.0|[clean, food, tha...|
|815719237| brand|highly extraordin...|   2.0|[highly, extraord...|
|815719238|female|Senior '16 . XI-X...|   0.0|[senior, 16, xi, ...|
|81571923

In [24]:
# Filter stop words out of the token lists. No stop-word list is passed, so the
# remover's built-in default list is used.
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
# Exploratory application on the tokenized training split; the pipeline fitted
# later applies this stage on its own.
removed = stop_words_remover.transform(regexTokenized)
# Heading typo fixed ("emoving" -> "removing").
print("**** RAW dataframe after removing stop words ****")
removed.show()

**** RAW dataframe after emoving stop words ****
+---------+------+--------------------+------+--------------------+--------------------+
| _unit_id|gender|         description|labels|               words|      filtered_words|
+---------+------+--------------------+------+--------------------+--------------------+
|815719226|  male|i sing my own rhy...|   1.0|[i, sing, my, own...|      [sing, rhythm]|
|815719227|  male|I'm the author of...|   1.0|[i, m, the, autho...|[m, author, novel...|
|815719228|  male|louis whining and...|   1.0|[louis, whining, ...|[louis, whining, ...|
|815719229|  male|Mobile guy.  49er...|   1.0|[mobile, guy, 49e...|[mobile, guy, 49e...|
|815719230|female|Ricky Wilson The ...|   0.0|[ricky, wilson, t...|[ricky, wilson, b...|
|815719233|  male|The secret of get...|   1.0|[the, secret, of,...|[secret, getting,...|
|815719234|female|Pll Fan // Crazy ...|   0.0|[pll, fan, crazy,...|[pll, fan, crazy,...|
|815719235|female|Renaissance art h...|   0.0|[renaissance, a

In [25]:
# Hash the filtered tokens into a fixed-width term-frequency vector.
# NOTE(review): the width of 11000 buckets is not justified anywhere in the
# notebook -- confirm it is a deliberate choice.
hashing_tf = (
    HashingTF(inputCol="filtered_words", outputCol="raw_features")
    .setNumFeatures(11000)
)

In [26]:
# Exploratory step: compute the hashed term-frequency vectors for the training
# tokens and preview the resulting "raw_features" column.
ht_df = hashing_tf.transform(removed)
ht_df.show()

+---------+------+--------------------+------+--------------------+--------------------+--------------------+
| _unit_id|gender|         description|labels|               words|      filtered_words|        raw_features|
+---------+------+--------------------+------+--------------------+--------------------+--------------------+
|815719226|  male|i sing my own rhy...|   1.0|[i, sing, my, own...|      [sing, rhythm]|(11000,[8809,9451...|
|815719227|  male|I'm the author of...|   1.0|[i, m, the, autho...|[m, author, novel...|(11000,[873,978,2...|
|815719228|  male|louis whining and...|   1.0|[louis, whining, ...|[louis, whining, ...|(11000,[1000,2314...|
|815719229|  male|Mobile guy.  49er...|   1.0|[mobile, guy, 49e...|[mobile, guy, 49e...|(11000,[464,929,9...|
|815719230|female|Ricky Wilson The ...|   0.0|[ricky, wilson, t...|[ricky, wilson, b...|(11000,[1040,1215...|
|815719233|  male|The secret of get...|   1.0|[the, secret, of,...|[secret, getting,...|(11000,[6607,7192...|
|815719234

In [27]:
# IDF estimator that rescales the raw term frequencies into the "features"
# column consumed by the classifier. This same (unfitted) estimator object is
# reused as a pipeline stage later.
idf = IDF(inputCol="raw_features", outputCol="features")

# Exploratory fit/transform on the training vectors only, to inspect the
# output; the pipeline fits its own IDF model when pipeline.fit(train) runs.
idfmodel = idf.fit(ht_df)
idf_df = idfmodel.transform(ht_df)

In [28]:
# Preview the training frame with both the hashed term frequencies
# ("raw_features") and the IDF-weighted vectors ("features").
print("**** RAW dataframe after adding idf and hashingtf ****")
idf_df.show()


**** RAW dataframe after adding idf and hashingtf ****
+---------+------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
| _unit_id|gender|         description|labels|               words|      filtered_words|        raw_features|            features|
+---------+------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|815719226|  male|i sing my own rhy...|   1.0|[i, sing, my, own...|      [sing, rhythm]|(11000,[8809,9451...|(11000,[8809,9451...|
|815719227|  male|I'm the author of...|   1.0|[i, m, the, autho...|[m, author, novel...|(11000,[873,978,2...|(11000,[873,978,2...|
|815719228|  male|louis whining and...|   1.0|[louis, whining, ...|[louis, whining, ...|(11000,[1000,2314...|(11000,[1000,2314...|
|815719229|  male|Mobile guy.  49er...|   1.0|[mobile, guy, 49e...|[mobile, guy, 49e...|(11000,[464,929,9...|(11000,[464,929,9...|
|815719230|female|Ricky Wils

In [29]:
# Configure (not yet train) a multinomial Naive Bayes classifier that reads the
# "features" vectors and the numeric "labels" column. Despite the variable name
# `rf`, this is NOT a random forest; the name is kept because the pipeline cell
# refers to it.
rf = NaiveBayes(
		labelCol="labels", 
		featuresCol="features",
		smoothing=2.0, modelType="multinomial")

In [30]:
# Translate numeric predictions back into the original gender strings, using
# the label order learned by the fitted StringIndexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=indexer.labels,
)
# Assemble every transformation plus the classifier into a single pipeline:
# tokenize -> remove stop words -> hash to TF -> IDF -> classify -> label names.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        stop_words_remover,
        hashing_tf,
        idf,
        rf,
        labelConverter,
    ]
)


In [31]:
# Fit the whole pipeline on the training split. This fits the estimator stages
# in the pipeline (the IDF weights and the Naive Bayes classifier). The
# StringIndexer is not a pipeline stage: the "labels" column was already added
# to the data before it was split into train and test.
model = pipeline.fit(train)

In [32]:
# Run the fitted pipeline on the held-out test split.
predictions = model.transform(test)
print("**** Prediction on test data with NaiveBayes Model ****")
# Show the true gender next to the numeric prediction and its decoded label.
predictions.select("_unit_id","gender","description","prediction","predictedLabel").show()

**** Prediction on test data with NaiveBayes Model ****
+---------+------+--------------------+----------+--------------+
| _unit_id|gender|         description|prediction|predictedLabel|
+---------+------+--------------------+----------+--------------+
|815719231|female|  you don't know me.|       0.0|        female|
|815719232| brand|A global marketpl...|       2.0|         brand|
|815719247|female|He bled and died ...|       1.0|          male|
|815719250|  male|BSc economics gra...|       1.0|          male|
|815719251|female|Wife to my Coach....|       0.0|        female|
|815719255|  male|RL/writer | Lewd ...|       0.0|        female|
|815719271|  male|[ Krothedj@gmail....|       2.0|         brand|
|815719273|  male|Just Living Life ...|       1.0|          male|
|815719276|  male|Home Office & Bus...|       2.0|         brand|
|815719278| brand|#Pc #xbox #playst...|       1.0|          male|
|815719283| brand|You Can have What...|       2.0|         brand|
|815719294|female|wa

In [33]:
# One evaluator is reused for all four metrics: it compares the numeric
# "labels" column with the "prediction" column, and its metric name is switched
# before each evaluation. The generator is consumed in order by the tuple
# unpacking, so the metrics are computed in the sequence listed.
# (`weighted_precesion` keeps its original spelling because the print cell
# below reads that exact name.)
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="labels")
accuracy, f1_score, weighted_precesion, weighted_recall = (
    evaluator.setMetricName(metric_name).evaluate(predictions)
    for metric_name in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

In [34]:
# Report the four evaluation metrics computed on the test-set predictions.
print("**** Accuracy Metrics ****")

print("Accuracy: {0}".format(accuracy))
print("f1_score: {0}".format(f1_score))
print("weighted_precesion: {0}".format(weighted_precesion))
print("weighted_recall: {0}".format(weighted_recall))

**** Accuracy Metrics ****
Accuracy: 0.55325443787
f1_score: 0.551805753707
weighted_precesion: 0.551008077286
weighted_recall: 0.55325443787
