In [4]:
!pip install spark
!pip install pyspark
!pip install warnings
!pip install langdetect

Collecting spark
  Downloading spark-0.2.1.tar.gz (41 kB)
[K     |████████████████████████████████| 41 kB 30 kB/s  eta 0:00:011
[?25hBuilding wheels for collected packages: spark
  Building wheel for spark (setup.py) ... [?25ldone
[?25h  Created wheel for spark: filename=spark-0.2.1-py3-none-any.whl size=58738 sha256=343d718b20945e68e4f3b65d333db179c2f080fe9c74c058565c37d51c153e76
  Stored in directory: /root/.cache/pip/wheels/4e/0e/f1/164619f9920fb447d294afaae11a7715bd442ded7225953d72
Successfully built spark
Installing collected packages: spark
Successfully installed spark-0.2.1
Collecting pyspark
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 27 kB/s s eta 0:00:01
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 46.3 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[

In [2]:
import pyspark as ps
import warnings
from pyspark.sql import SQLContext
from textblob import TextBlob
from langdetect import detect
import re

ModuleNotFoundError: No module named 'pyspark'

In [7]:
try:
    sc = ps.SparkContext('local[10]')
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")

Just created a SparkContext


In [8]:
df = sqlContext.read.format('com.databricks.spark.csv').options(header='false', inferschema='false').load('../input/sentiment140/training.1600000.processed.noemoticon.csv')
type(df)

pyspark.sql.dataframe.DataFrame

In [9]:
df.show(5)

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+---+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [10]:
from pyspark.sql.functions import *

df = df.select('_c5','_c0')
df = df.na.drop()
df = df.withColumn('_c5', trim(col('_c5')))
df = df.filter(col('_c5')!="")
df.show()

+--------------------+---+
|                 _c5|_c0|
+--------------------+---+
|@switchfoot http:...|  0|
|is upset that he ...|  0|
|@Kenichan I dived...|  0|
|my whole body fee...|  0|
|@nationwideclass ...|  0|
|@Kwesidei not the...|  0|
|          Need a hug|  0|
|@LOLTrish hey  lo...|  0|
|@Tatiana_K nope t...|  0|
|@twittera que me ...|  0|
|spring break in p...|  0|
|I just re-pierced...|  0|
|@caregiving I cou...|  0|
|@octolinz16 It it...|  0|
|@smarrison i woul...|  0|
|@iamjazzyfizzle I...|  0|
|Hollis' death sce...|  0|
| about to file taxes|  0|
|@LettyA ahh ive a...|  0|
|@FakerPattyPattz ...|  0|
+--------------------+---+
only showing top 20 rows



In [11]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf, col

@udf(returnType=IntegerType())
def langdetect(tweet):
    try:
        if detect(tweet)=='en':
            return 1
        else:
            return 0
    except:
        return 0
@udf(returnType=StringType())
def clean_tweet(tweet): 
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
    
@udf(returnType=FloatType())
def sentiment_score(tweet):
    return TextBlob(tweet).sentiment.polarity

@udf(returnType=IntegerType())
def get_sentiment(score): 
        if score > 0: 
            return 4
        elif score == 0: 
            return 2
        else: 
            return 0

In [12]:
df=df.withColumn("englishornot", langdetect(col("_c5")))
df = df.filter(col('englishornot')!=0)
df=df.withColumn("cleantweet", clean_tweet(col("_c5")))
df=df.withColumn("polarity", sentiment_score(col("cleantweet")))
df=df.withColumn("output", get_sentiment(col("polarity")))
df.show()

+--------------------+---+------------+--------------------+-----------+------+
|                 _c5|_c0|englishornot|          cleantweet|   polarity|output|
+--------------------+---+------------+--------------------+-----------+------+
|@switchfoot http:...|  0|           1|Awww that s a bum...|        0.2|     4|
|is upset that he ...|  0|           1|is upset that he ...|        0.0|     2|
|@Kenichan I dived...|  0|           1|I dived many time...|        0.5|     4|
|my whole body fee...|  0|           1|my whole body fee...|        0.2|     4|
|@nationwideclass ...|  0|           1|no it s not behav...|     -0.625|     0|
|@Kwesidei not the...|  0|           1|  not the whole crew|        0.2|     4|
|          Need a hug|  0|           1|          Need a hug|        0.0|     2|
|@LOLTrish hey  lo...|  0|           1|hey long time no ...| 0.27333334|     4|
|@Tatiana_K nope t...|  0|           1|K nope they didn ...|        0.0|     2|
|spring break in p...|  0|           1|s

In [14]:
df.show()

+--------------------+---+
|          cleantweet|_c0|
+--------------------+---+
|Awww that s a bum...|  0|
|is upset that he ...|  0|
|I dived many time...|  0|
|my whole body fee...|  0|
|no it s not behav...|  0|
|  not the whole crew|  0|
|          Need a hug|  0|
|hey long time no ...|  0|
|K nope they didn ...|  0|
|spring break in p...|  0|
|I just re pierced...|  0|
|I couldn t bear t...|  0|
|It it counts idk ...|  0|
|i would ve been t...|  0|
|I wish I got to w...|  0|
|Hollis death scen...|  0|
| about to file taxes|  0|
|ahh ive always wa...|  0|
|Oh dear Were you ...|  0|
|i was out most of...|  0|
+--------------------+---+
only showing top 20 rows



In [None]:
df.count()

In [13]:
df=df.select('cleantweet','_c0')

In [15]:
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed = 2000)

In [16]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="cleantweet", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "_c0", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

+----------+---+-----+--------------------+--------------------+-----+
|cleantweet|_c0|words|                  tf|            features|label|
+----------+---+-----+--------------------+--------------------+-----+
|          |  0|   []|(65536,[52572],[1...|(65536,[52572],[7...|  0.0|
|          |  0|   []|(65536,[52572],[1...|(65536,[52572],[7...|  0.0|
|          |  0|   []|(65536,[52572],[1...|(65536,[52572],[7...|  0.0|
|          |  0|   []|(65536,[52572],[1...|(65536,[52572],[7...|  0.0|
|          |  0|   []|(65536,[52572],[1...|(65536,[52572],[7...|  0.0|
+----------+---+-----+--------------------+--------------------+-----+
only showing top 5 rows



In [1]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

ModuleNotFoundError: No module named 'pyspark'

In [None]:
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
accuracy