In [148]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover
import pyspark as ps
import warnings
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark import SparkContext
import pandas as pd

In [150]:
# Stop a SparkContext left over from a previous run so the next cell can
# create a fresh one. On a fresh kernel `sc` has not been defined yet, so the
# NameError is expected and ignored.
try:
    sc.stop()
except NameError:
    pass


In [151]:
# Create a local SparkContext with two worker threads and the SQLContext
# built on top of it. Both `sc` and `sqlContext` are used by later cells.
try:
    sc = ps.SparkContext('local[2]')
    print("Just created a SparkContext")
except ValueError:
    # A context is already running: warn as before, but reuse the running
    # context so that `sc` and `sqlContext` are still bound afterwards.
    warnings.warn("SparkContext already exists in this scope")
    sc = ps.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)


Just created a SparkContext


In [152]:
# Read the labelled reviews file: tab-delimited, first row is the header,
# column types inferred by the reader.
# NOTE(review): this is an absolute path on one specific machine; it must be
# adjusted before the notebook can run anywhere else.
reviews_path = 'C:/Users/Suraj  kumar/Desktop/Forsk_Lab/Sentimental_Analysis/labeledTrainData.tsv'
csv_reader = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true', delimiter='\t')
)
df = csv_reader.load(reviews_path)
type(df)

pyspark.sql.dataframe.DataFrame

In [153]:
# Preview the first five rows of the raw frame.
df.show(n=5)

+------+---------+--------------------+
|    id|sentiment|              review|
+------+---------+--------------------+
|5814_8|        1|With all this stu...|
|2381_9|        1|"The Classic War ...|
|7759_3|        0|The film starts w...|
|3630_4|        0|It must be assume...|
|9495_8|        1|Superbly trashy a...|
+------+---------+--------------------+
only showing top 5 rows



In [154]:
# Keep only the review text and the sentiment, with the sentiment cast to an
# integer and renamed to "label" (the column the classifier is trained on).
label_column = col("sentiment").cast("Int").alias("label")
data = df.select("review", label_column)
data.show(n=5)

+--------------------+-----+
|              review|label|
+--------------------+-----+
|With all this stu...|    1|
|"The Classic War ...|    1|
|The film starts w...|    0|
|It must be assume...|    0|
|Superbly trashy a...|    1|
+--------------------+-----+
only showing top 5 rows



In [164]:
# Split the data 80/20 into training and testing sets.
# A fixed seed is passed so that the split, and therefore the row counts and
# the accuracy reported further down, are stable from one run to the next.
SPLIT_SEED = 42
dividedData = data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
train_Data = dividedData[0]  # index 0 = training data
test_Data = dividedData[1]  # index 1 = testing data
train_rows = train_Data.count()
test_rows = test_Data.count()
print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

Training data rows: 19976 ; Testing data rows: 5024


In [165]:
# Split each review into a list of words, stored in "SentimentWords".
# The same `tokenizer` instance is reused later on the test data.
tokenizer = Tokenizer(
    inputCol="review",
    outputCol="SentimentWords",
)
tokenizedTrain = tokenizer.transform(train_Data)
tokenizedTrain.show(n=5)
type(tokenizedTrain)


+--------------------+-----+--------------------+
|              review|label|      SentimentWords|
+--------------------+-----+--------------------+
|A Turkish Bat...|    1|[a, turkish, ...|
|!!!! POSSIBLE MIL...|    0|[!!!!, possible, ...|
|!!!!! OF COURSE T...|    0|[!!!!!, of, cours...|
|" Så som i himmel...|    1|[", så, som, i, h...|
|"54" is a film ba...|    0|["54", is, a, fil...|
+--------------------+-----+--------------------+
only showing top 5 rows



pyspark.sql.dataframe.DataFrame

In [166]:
# Stop-word remover: reads the tokenizer's output column and writes the
# filtered word lists to "MeaningfulWords".
tokens_column = tokenizer.getOutputCol()
swr = StopWordsRemover(inputCol=tokens_column, outputCol="MeaningfulWords")
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(n=5)

+--------------------+-----+--------------------+--------------------+
|              review|label|      SentimentWords|     MeaningfulWords|
+--------------------+-----+--------------------+--------------------+
|A Turkish Bat...|    1|[a, turkish, ...|[a, turkish, ...|
|!!!! POSSIBLE MIL...|    0|[!!!!, possible, ...|[!!!!, possible, ...|
|!!!!! OF COURSE T...|    0|[!!!!!, of, cours...|[!!!!!, course, s...|
|" Så som i himmel...|    1|[", så, som, i, h...|[", så, som, himm...|
|"54" is a film ba...|    0|["54", is, a, fil...|["54", film, base...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



In [167]:
# Convert the word lists into numerical feature vectors with HashingTF
# (term-frequency hashing; per the original note, Austin Appleby's
# MurmurHash 3 algorithm).
hashTF = HashingTF(
    inputCol=swr.getOutputCol(),
    outputCol="features",
)
hashedTrain = hashTF.transform(SwRemovedTrain)
numericTrainData = hashedTrain.select('label', 'MeaningfulWords', 'features')
numericTrainData.show(n=5)

+-----+--------------------+--------------------+
|label|     MeaningfulWords|            features|
+-----+--------------------+--------------------+
|    1|[a, turkish, ...|(262144,[1536,232...|
|    0|[!!!!, possible, ...|(262144,[571,1536...|
|    0|[!!!!!, course, s...|(262144,[1536,559...|
|    1|[", så, som, himm...|(262144,[1998,208...|
|    0|["54", film, base...|(262144,[14,6286,...|
+-----+--------------------+--------------------+
only showing top 5 rows



In [177]:
# Fit a logistic-regression classifier on the hashed training features
# (10 iterations at most, regularisation parameter 0.01).
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=10,
    regParam=0.01,
)
model = lr.fit(numericTrainData)
print ("Training is done!")

Training is done!


In [169]:
# Testing data: run the test split through the same tokenizer, stop-word
# remover and hashing stages that were used for the training data.
tokenizedTest = tokenizer.transform(test_Data)
SwRemovedTest = swr.transform(tokenizedTest)
# The column is named "label" (lower case) when it is created; select it by
# its real name rather than relying on case-insensitive column resolution.
numericTest = hashTF.transform(SwRemovedTest).select(
    'label', 'MeaningfulWords', 'features')
numericTest.show(5)

+-----+--------------------+--------------------+
|Label|     MeaningfulWords|            features|
+-----+--------------------+--------------------+
|    0|[!!!!!, possible,...|(262144,[991,1536...|
|    0|["a, town, called...|(262144,[1294,199...|
|    1|["a, little, nons...|(262144,[6500,718...|
|    1|["a, truly, nice,...|(262144,[571,1492...|
|    1|["addictive", adj...|(262144,[14,571,6...|
+-----+--------------------+--------------------+
only showing top 5 rows



In [170]:
# Predict on the testing data and keep the words, the predicted class and
# the true label side by side.
prediction = model.transform(numericTest)
# "label" is spelled in lower case to match the column name used when the
# label column is created and when the model is trained.
predictionFinal = prediction.select(
    "MeaningfulWords", "prediction", "label")
predictionFinal.show(5)

+--------------------+----------+-----+
|     MeaningfulWords|prediction|Label|
+--------------------+----------+-----+
|[!!!!!, possible,...|       0.0|    0|
|["a, town, called...|       0.0|    0|
|["a, little, nons...|       1.0|    1|
|["a, truly, nice,...|       1.0|    1|
|["addictive", adj...|       1.0|    1|
+--------------------+----------+-----+
only showing top 5 rows



In [171]:
# Accuracy score: fraction of test rows whose predicted class equals the
# true label.
correctPrediction = predictionFinal.filter(
    predictionFinal['prediction'] == predictionFinal['label']).count()
totalData = predictionFinal.count()
# Guard against an empty test set, which would otherwise raise
# ZeroDivisionError; accuracy is reported as NaN in that case.
accuracy = correctPrediction / totalData if totalData else float("nan")
print("correct prediction:", correctPrediction, ", total data:", totalData,
      ", accuracy:", accuracy)

correct prediction: 4305 , total data: 5024 , accuracy: 0.8568869426751592


In [None]:
# Stops the SparkContext.
# NOTE(review): the cells below still call sqlContext.createDataFrame, and
# sqlContext was built on this `sc`. Running this cell in top-to-bottom order
# therefore stops the context those cells depend on; it likely belongs at the
# very end of the notebook.
sc.stop()

In [174]:
# Build a one-row DataFrame from a free-text comment and run it through the
# same tokenizer / stop-word / hashing stages used for training.
# StructType, StructField and StringType come from the star import in the
# notebook's first cell, so they are not re-imported here.
comment = "good movie"
cSchema = StructType([StructField("review", StringType())])
test_list = [[comment]]
new_data = sqlContext.createDataFrame(test_list, schema=cSchema)
new_data.show()
# NOTE: the three names below overwrite the test-set frames created earlier;
# the next cell reads `numericTest` and expects this single-comment version.
tokenizedTest = tokenizer.transform(new_data)
SwRemovedTest = swr.transform(tokenizedTest)
numericTest = hashTF.transform(SwRemovedTest).select('features')
numericTest.show()
type(new_data)

+----------+
|    review|
+----------+
|good movie|
+----------+

+--------------------+
|            features|
+--------------------+
|(262144,[113432,1...|
+--------------------+



pyspark.sql.dataframe.DataFrame

In [175]:
# Score the current `numericTest` frame with the trained model and keep only
# the predicted class.
prediction = model.transform(numericTest)
predictionFinal = prediction.select(prediction["prediction"])
predictionFinal.show()
type(predictionFinal)

+----------+
|prediction|
+----------+
|       1.0|
+----------+



pyspark.sql.dataframe.DataFrame

In [176]:
# Convert the Spark prediction frame to a pandas DataFrame for display.
predictionFinal.toPandas()

Unnamed: 0,prediction
0,1.0


pyspark.sql.dataframe.DataFrame