In [122]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os


from pyspark.ml.feature import CountVectorizer,HashingTF
from pyspark.ml.feature import IDF
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC,LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
# The Spark version and the Spark NLP version should be aligned (compatible).

In [2]:
# Start (or reuse) the local Spark session used by every cell below.
# Settings: 11 local worker threads, 10G driver memory, driver result-size
# limit set to "0", and a 2000M upper bound for the Kryo serializer buffer.
spark = (
    SparkSession.builder
    .appName("nlp")
    .master("local[11]")
    .config("spark.driver.memory", "10G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .getOrCreate()
)

# Only log errors from here on so Spark chatter does not flood cell output.
spark.sparkContext.setLogLevel("ERROR")

22/06/16 20:41:19 WARN Utils: Your hostname, winware resolves to a loopback address: 127.0.1.1; using 172.30.234.12 instead (on interface eth0)
22/06/16 20:41:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/06/16 20:41:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [78]:
# Load every parquet file under the relative data/ directory.
df = spark.read.format("parquet").load("data/")

In [79]:
# Peek at the first ten rows to check the schema and the value ranges.
df.show(n=10)

+--------------------+--------------------+------------------+
|               value|          pos_tokens|            gender|
+--------------------+--------------------+------------------+
|he began his flig...|[begin, transitio...| 0.999500249875062|
|he built and flew...|[build, fly, spec...| 0.999500249875062|
|his aircraft cons...|[revolutionize, f...| 0.999000999000999|
|the german pilot ...|[german, win, fly...| 0.999000999000999|
|she was a member ...|[communist, name,...|-0.999500249875062|
|she is best known...|     [know, british]|-0.999500249875062|
|he currently work...|              [work]| 0.999000999000999|
|he was recruited ...|[recruit, play, w...| 0.999666777740753|
|on the back of th...|           [recruit]| 0.999000999000999|
|starcevich made h...|  [starcevich, make]| 0.999000999000999|
+--------------------+--------------------+------------------+
only showing top 10 rows



In [80]:
# Binary target: 1 when the gender score is strictly positive, 0 otherwise.
df = df.withColumn(
    "label",
    (df["gender"] > 0).cast("integer"),
)

In [None]:
# 80/20 train/test split. A fixed seed keeps the split (and therefore every
# metric computed below) reproducible across kernel restarts and re-runs.
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [133]:
# Report the size of each split (each count triggers a Spark job).
print('Train {} , Test {}'.format(train.count(), test.count()))

[Stage 3659:>                                                     (0 + 11) / 11]

Train 1043650 , Test 261077


                                                                                

In [154]:
# Classification pipeline: term counts -> IDF re-weighting -> linear SVM.
vectorize = CountVectorizer(
    inputCol="pos_tokens",
    outputCol="TF",
    vocabSize=10_000,
    minTF=0.03,
    maxDF=0.90,
)
tfidf = IDF(inputCol="TF", outputCol="features")
svm = LinearSVC()
pipeline = Pipeline(stages=[vectorize, tfidf, svm])

                                                                                

In [141]:
# Fit every pipeline stage on the training split, then score the held-out split.
model = pipeline.fit(dataset=train)
result = model.transform(dataset=test)

                                                                                

In [146]:
# Share of test rows predicted as class 1. The average is computed inside
# Spark so only a single aggregated row is collected to the driver, instead
# of pulling the whole prediction column into pandas first.
result.agg(F.avg("prediction").alias("prediction")).toPandas().iloc[0]

                                                                                

prediction    1.0
dtype: float64

In [174]:
# Score the held-out predictions with the weighted F-measure.
evaluator = MulticlassClassificationEvaluator(metricName="weightedFMeasure")
print(evaluator.evaluate(result))

# Pair every vocabulary term with the coefficient the SVM learned for it.
vectorizer = model.stages[0]   # first pipeline stage: fitted CountVectorizer
_vocab = vectorizer.vocabulary
_model = model.stages[-1]      # last pipeline stage: fitted LinearSVC
_betas = _model.coefficients

# Convert the Spark vector to a NumPy array explicitly so pandas receives a
# plain float column, then sort ascending: most negative coefficients first,
# most positive last.
vocab_df = pd.DataFrame(
    {"vocab": _vocab, "score": _betas.toArray()}
).sort_values(by="score")

                                                                                

0.7661864478953011


In [175]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# NOTE(review): each parameter has a single candidate value, so this grid
# holds exactly one combination; cross-validation here estimates the
# performance of that one setting rather than searching over alternatives.
# Add more values to the lists below to turn this into an actual search.
paramGrid = (
    ParamGridBuilder()
    .addGrid(vectorize.minDF, [10_000])
    .addGrid(svm.regParam, [0.01])
    .build()
)

# A fixed seed makes the fold assignment, and therefore the selected model
# and its score, reproducible across runs.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)
cvModel = cv.fit(train)

# Evaluate the refitted best model on the held-out split.
predictions = cvModel.transform(test)
evaluator.evaluate(predictions)

                                                                                

0.7661864478953011

In [167]:
# Re-score the cross-validated predictions with the evaluator's default metric.
evaluator = MulticlassClassificationEvaluator()
print(evaluator.evaluate(predictions))


# Inspect the best pipeline found by cross-validation.
pipeline_model = cvModel.bestModel

print(pipeline_model.stages)
vectorizer = pipeline_model.stages[0]   # first stage: fitted CountVectorizer
_vocab = vectorizer.vocabulary
_model = pipeline_model.stages[-1]      # last stage: fitted LinearSVC
_betas = _model.coefficients

# Convert the Spark vector to a NumPy array explicitly so pandas receives a
# plain float column, then sort ascending: most negative coefficients first,
# most positive last.
vocab_df = pd.DataFrame(
    {"vocab": _vocab, "score": _betas.toArray()}
).sort_values(by="score")



0.788123849132682
[CountVectorizerModel: uid=CountVectorizer_e5efb5612c55, vocabularySize=10000, IDFModel: uid=IDF_39a049c8a75e, numDocs=1043650, numFeatures=10000, LinearSVCModel: uid=LinearSVC_9e42cf496448, numClasses=2, numFeatures=10000]


                                                                                

In [172]:
# Ten terms with the lowest (most negative) coefficients.
vocab_df.iloc[:10]

Unnamed: 0,vocab,score
367,female,-0.309803
563,daughter,-0.289785
747,peak,-0.273862
1089,sister,-0.25758
1105,universe,-0.255272
1293,itf,-0.249274
1314,woman,-0.249006
1433,model,-0.244861
1411,lesbian,-0.244041
1621,mother,-0.240923


In [171]:
# Ten terms with the highest (most positive) coefficients.
vocab_df.iloc[-10:]

Unnamed: 0,vocab,score
5892,automated,0.19333
5887,countryman,0.194652
5599,nathaniel,0.195567
5088,slovenia,0.198976
4774,repatriate,0.201093
4529,greatgrandson,0.202268
5661,optimal,0.209567
3518,atp,0.211054
5771,trainer,0.241347
8223,munnetra,0.35758
