In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os


from pyspark.ml.feature import CountVectorizer,HashingTF
from pyspark.ml.feature import IDF
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
# The Spark version and the Spark NLP version should be aligned (compatible with each other).

In [2]:
# Create (or reuse) the Spark session for this notebook with master "local[11]",
# a 10G driver, driver.maxResultSize set to "0", and a 2000M Kryo buffer ceiling.
spark = (
    SparkSession.builder
    .appName("nlp")
    .master("local[11]")
    .config("spark.driver.memory", "10G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .getOrCreate()
)

# Only surface ERROR-level messages from Spark in the cell outputs.
spark.sparkContext.setLogLevel("ERROR")

22/06/16 16:53:20 WARN Utils: Your hostname, winware resolves to a loopback address: 127.0.1.1; using 172.30.234.12 instead (on interface eth0)
22/06/16 16:53:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/06/16 16:53:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Read the parquet dataset stored under the relative "data/" directory.
df = spark.read.format("parquet").load("data/")

In [4]:
# Add the integer target column "label": the boolean test `gender > 0` cast to integer.
df = df.withColumn(
    "label",
    (F.col("gender") > 0).cast("integer"),
)

In [5]:
# Stage 1: term counts from the "pos_tokens" column, written to "TF".
vectorize = CountVectorizer(inputCol="pos_tokens", outputCol="TF")
# Stage 2: IDF re-weighting of "TF", written to "features".
tfidf = IDF(inputCol="TF", outputCol="features")
# Stage 3: linear SVM classifier (50 iterations max, regularisation 0.1).
svm = LinearSVC(maxIter=50, regParam=0.1)

# Chain the three stages so they are fitted and applied together.
pipeline = Pipeline(stages=[vectorize, tfidf, svm])

In [6]:
# Fixed seed so the 80/20 train/test split -- and every metric computed from it --
# comes out the same on each fresh run of the notebook instead of changing per run.
SPLIT_SEED = 42
train, test = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [7]:
# Fit every stage of the pipeline on the training split.
model = pipeline.fit(train)

                                                                                

In [8]:
# Run the fitted pipeline over the held-out test split.
result = model.transform(test)

In [9]:
# "f1" is the evaluator's default metric; naming it makes the reported score unambiguous.
evaluator = MulticlassClassificationEvaluator(metricName="f1")

In [10]:
# Compute the evaluator's metric on the transformed test split (displayed as the cell output).
evaluator.evaluate(result)

                                                                                

0.7869575332608502

In [11]:
# First fitted stage of the pipeline; the pipeline was built with the CountVectorizer first.
vectorizer = model.stages[0]

In [12]:
# Vocabulary held by the fitted vectorizer stage.
# NOTE(review): presumably ordered by feature index, so position i matches feature i -- confirm.
vocab = vectorizer.vocabulary

In [13]:
# Last fitted stage of the pipeline; the pipeline was built with the LinearSVC last.
svm_model = model.stages[-1]

In [14]:
# Coefficients of the fitted SVM stage.
# NOTE(review): presumably one weight per vocabulary term, in vocabulary order -- confirm.
betas = svm_model.coefficients

In [15]:
# Pair each vocabulary entry with its SVM coefficient in a pandas frame for inspection.
vocab_df = pd.DataFrame(
    dict(
        vocab=vocab,
        score=betas,
    )
)

In [17]:
# Show the 20 rows with the highest "score", listed in ascending order.
vocab_df.sort_values("score", ascending=True).iloc[-20:]

Unnamed: 0,vocab,score
88990,gülüstü,0.22833
42606,rafferty,0.231426
107980,chudinov,0.235383
73675,fennville,0.236
45044,wiebe,0.242517
53676,johnpaul,0.242753
36835,caserta,0.255213
76038,swaran,0.256366
47244,cachaco,0.256549
51395,macropsychic,0.261617
