**Conducting a simple sentiment analysis by identifying positive and negative words using the Bing Liu Opinion Lexicon.**

In [1]:
import findspark
findspark.init()

In [2]:
# Importing SparkContext, StreamingContext, and SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [4]:
# Load the tweets CSV, the same dataset that was written by the last cell of the Twitter app.
# The first line of the file is treated as the header row.
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
data = (
    spark.read
    .format("csv")
    .option("header","true")
    .load("/Users/rashidesai/Downloads/streamoftweetsoutput.csv")
)

In [5]:
# Converting the DataFrame into an RDD so that plain Python functions can be mapped over its rows
data_rdd = data.rdd

In [6]:
# Keeping only the first field of every row, so each element becomes a bare value instead of a row.
# NOTE(review): assumes the first CSV column holds the tweet text; confirm against the file.
data_rdd_tweet_only = data_rdd.map(lambda x: x[0])

In [7]:
# Previewing the first extracted element as a sanity check
data_rdd_tweet_only.first()

'RT @SRuhle: There are some very large Trump donors who sit on hospital boards'

In [8]:
# Defining the list of punctuation tokens we will be removing before calculating the sentiment score.
# NOTE(review): 'Ä' and "Ä¶" look like mis-decoded (mojibake) characters from the tweet export; confirm.
punctuations = ["(", "[", ",", ".", "!", "?", ":", ";", "]", ")", "@", "^",'#','&', 'Ä', "Ä¶"]

In [9]:
def eliminate_punctuations(text):
    """Return ``text`` with every token listed in ``punctuations`` removed.

    Each entry of the notebook-level ``punctuations`` list is treated as a
    substring and deleted wherever it occurs, so entries longer than one
    character (e.g. ``"Ä¶"``) are honoured as well.

    Parameters
    ----------
    text : str or None
        Raw tweet text. ``None`` is accepted because a CSV reader may yield
        nulls for empty fields.

    Returns
    -------
    str
        The text without the listed tokens; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    # Longest tokens first: a multi-character entry must be removed as a unit
    # before any of its single-character prefixes is stripped out of it.
    for token in sorted(punctuations, key=len, reverse=True):
        text = text.replace(token, "")
    return text

In [10]:
# Strip the punctuation from every tweet, then lower-case the whole tweet.
clean = (
    data_rdd_tweet_only
    .map(eliminate_punctuations)
    .map(lambda tweet: tweet.lower())
)

In [11]:
# Previewing the first cleaned tweet
clean.first()

'rt sruhle there are some very large trump donors who sit on hospital boards'

In [12]:
# Importing the text file containing the positive words as an RDD of its lines
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
positive = spark.sparkContext.textFile('/Users/rashidesai/Downloads/positive-words.txt')

In [13]:
# Importing the text file containing the negative words as an RDD of its lines
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
negative = spark.sparkContext.textFile('/Users/rashidesai/Downloads/negative-words.txt')

In [14]:
# Build a key-value pair RDD: key = the cleaned tweet, value = its whitespace-separated words.
clean_pair = clean.map(lambda tweet: (tweet, tweet.split()))

In [15]:
# Previewing the first (tweet, words) pair
clean_pair.first()

('rt sruhle there are some very large trump donors who sit on hospital boards',
 ['rt',
  'sruhle',
  'there',
  'are',
  'some',
  'very',
  'large',
  'trump',
  'donors',
  'who',
  'sit',
  'on',
  'hospital',
  'boards'])

In [16]:
# Extracting the positive words into one Python list on the driver,
# so that sentiment_score can run plain membership tests against it
positive_list = positive.collect()
# Extracting the negative words into another Python list on the driver
negative_list = negative.collect()

In [17]:
def sentiment_score(x, positive_words=None, negative_words=None):
    """Return the lexicon-based sentiment score of one tokenised tweet.

    The score is the number of tokens found in the positive lexicon minus the
    number of tokens found in the negative lexicon. The positive lexicon is
    checked first, so a token present in both lexicons counts as positive.

    Parameters
    ----------
    x : iterable of str
        The words of a single tweet.
    positive_words : collection of str, optional
        Positive lexicon. Defaults to the notebook-level ``positive_list``.
    negative_words : collection of str, optional
        Negative lexicon. Defaults to the notebook-level ``negative_list``.

    Returns
    -------
    int
        ``positive matches - negative matches``; ``0`` for an empty tweet.
    """
    if positive_words is None:
        positive_words = positive_list
    if negative_words is None:
        negative_words = negative_list

    # One pass to build hash-based lookups replaces a linear scan of the
    # whole lexicon for every single token of the tweet.
    positive_lookup = frozenset(positive_words)
    negative_lookup = frozenset(negative_words)

    positive_count = 0
    negative_count = 0
    for word in x:
        if word in positive_lookup:
            positive_count += 1
        elif word in negative_lookup:
            negative_count += 1
    return positive_count - negative_count

In [18]:
# Score every tweet: the key (tweet text) is kept and the word list is replaced by its sentiment score.
ans = clean_pair.mapValues(sentiment_score)

In [19]:
# Collecting the (tweet, sentiment score) pairs into a Python list on the driver.
lis = ans.collect()

In [20]:
import pandas as pd   
# Converting the list of (tweet, score) pairs to a DataFrame.
# The columns are named at construction time so that an empty result still
# yields a well-formed two-column frame instead of a frame with no columns.
df = pd.DataFrame(lis, columns=['Tweet_content', 'Sentiment Score'])

In [21]:
# Label the two columns: the tweet text and its sentiment score.
df.columns = ['Tweet_content', 'Sentiment Score']

In [22]:
# Write the final output (tweet and score) to a UTF-8 CSV, leaving out the pandas index.
df.to_csv(
    '/Users/rashidesai/Downloads/result.csv',
    encoding='utf-8',
    index=False,
)

### At least 1000 tweets are analysed for a sentiment score
#### Twitter_Read.ipynb - The Tweet Listener code
#### Spark_Demo.ipynb - Spark Streaming Application 
#### Sentiment_Analysis.ipynb - Report the sentiment score of each tweet 
#### result.csv - Output file with Tweet_content and Sentiment score