<h1>Identify the Sentiments</h1>

Sentiment analysis is contextual mining of text which identifies and extracts subjective information in source material, and helping a business to understand the social sentiment of their brand, product or service while monitoring online conversations. Brands can use this data to measure the success of their products in an objective manner. In this project, you are provided with tweet data to predict sentiment on electronic products of netizens.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType,IntegerType
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pattern.en import sentiment, mood, modality
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import WordNetLemmatizer
import re
import numpy as np
import time

In [2]:
train=spark.read.csv("train_2kmZucJ - Copy.csv",header=True)


In [3]:
train.show(5)

+---+------+--------------------+
| id|target|               tweet|
+---+------+--------------------+
|  1|     0|#fingerprint #Pre...|
|  2|     0|Finally a transpa...|
|  3|     0|We love this! Wou...|
|  4|     0|I'm wired I know ...|
|  5|     1|What amazing serv...|
+---+------+--------------------+
only showing top 5 rows



In [4]:
%%time
stopWords = stopwords.words('english')+['mr', 'mrs', 'come', 'go', 'get',
                             'tell', 'listen', 'one', 'two', 'three',
                             'four', 'five', 'six', 'seven', 'eight',
                             'nine', 'zero', 'join', 'find', 'make',
                             'say', 'ask', 'tell', 'see', 'try', 'back',
                             'also','oneplus','apple','android','iphone','samsung','sony']

def tokenize_text(text):
    tokens = word_tokenize(text) 
    tokens = [token.strip() for token in tokens]
    return tokens

def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopWords]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text


def sent_TokenizeFunct(x):
    return nltk.sent_tokenize(x)

def word_TokenizeFunct(x):
    splitted = [word for line in x for word in line.split()]
    return splitted

def removeStopWordsFunct(x):
    from nltk.corpus import stopwords
    stop_words=set(stopwords.words('english'))
    filteredSentence = [w for w in x if not w in stop_words]
    return filteredSentence


sent_udf=udf(sent_TokenizeFunct)
#sent_udf=udf(lambda x: sent_tokenize(x),StringType())
word_udf=udf(word_TokenizeFunct)
stop_udf=udf(remove_stopwords)
#senti_udf=udf(lambda x: sentiment(x),StringType())
#mod_udf=udf(lambda x: modality(x),StringType())
#upp_udf=udf(lambda x: len([wrd for wrd in x.split() if wrd.isupper()],StringType())
commas_udf=udf(lambda x: x.count(','),IntegerType())
hash_udf=udf(lambda x: x.count('#'),IntegerType())
exlametary_udf=udf(lambda x: x.count('!'),IntegerType())
aderate_udf=udf(lambda x: x.count('@'),IntegerType())
question_udf=udf(lambda x: x.count('?'),IntegerType())
wordnet_lemmatizer = WordNetLemmatizer()
lem_udf= udf(lambda x: ' '.join([wordnet_lemmatizer.lemmatize(l) for l in x.split()]),StringType())
words_only_udf=udf(lambda x: re.sub(r'[^\w\s]','', x.lower()),StringType())
single_words_udf=udf(lambda x: re.sub(r'\b\w{1,1}\b', '', x.lower()),StringType())
length_udf = udf(lambda x: len(x),IntegerType())
#get number of words
words_count_udf = udf(lambda x: len(x.split(' ')),IntegerType())
#df['word_density'] = df['length'] / (df['words']+1)
words_not_stopword_udf = udf(lambda x: len([t for t in x.split(' ') if t not in stopWords]),IntegerType())
#get the average word length
avg_word_length_udf = udf(lambda x: np.mean([len(t) for t in x.split(' ') if t not in stopWords]) if len([len(t) for t in x.split(' ') if t not in stopWords]) > 0 else 0,IntegerType())
stopword_count_udf = udf(lambda x: len([wrd for wrd in x.split() if wrd.lower() in stopWords]),IntegerType())
    
    
train_df_1= train.withColumn('tweet', regexp_replace('tweet', '$&@*#', 'profaneword'))
train_df_1= train_df_1.withColumn('tweet', regexp_replace('tweet', "http\S+", "link"))
train_df_1=train_df_1.withColumn("comma_count", commas_udf("tweet"))
train_df_1=train_df_1.withColumn("hash_count", hash_udf("tweet"))
train_df_1=train_df_1.withColumn("exlametary_count", exlametary_udf("tweet"))
train_df_1=train_df_1.withColumn("aderate_count", aderate_udf("tweet"))
train_df_1=train_df_1.withColumn("question_count", question_udf("tweet"))

train_df_1=train_df_1.withColumn("text", lem_udf("tweet"))
train_df_1=train_df_1.withColumn("text", words_only_udf("text"))
train_df_1=train_df_1.withColumn("text", single_words_udf("text"))
train_df_1= train_df_1.withColumn('text', regexp_replace('text', "\d+", ""))
train_df_1=train_df_1.withColumn("length", length_udf("text"))
train_df_1=train_df_1.withColumn("words_count", words_count_udf("text"))
#train_df_1=train_df_1.withColumn("words_density",train_df_1.length /train_df_1.length.words)
train_df_1=train_df_1.withColumn("words_not_stopword", words_not_stopword_udf("text"))
#train_df_1=train_df_1.withColumn("avg_word_length", avg_word_length_udf("text"))
train_df_1=train_df_1.withColumn("stopword_count", stopword_count_udf("text"))


train_df_1.show(5)


+---+------+--------------------+-----------+----------+----------------+-------------+--------------+--------------------+------+-----------+------------------+--------------+
| id|target|               tweet|comma_count|hash_count|exlametary_count|aderate_count|question_count|                text|length|words_count|words_not_stopword|stopword_count|
+---+------+--------------------+-----------+----------+----------------+-------------+--------------+--------------------+------+-----------+------------------+--------------+
|  1|     0|#fingerprint #Pre...|          0|        11|               0|            0|             0|fingerprint pregn...|   100|         13|                11|             2|
|  2|     0|Finally a transpa...|          0|         5|               0|            0|             0|finally  transpar...|    89|         17|                14|             3|
|  3|     0|We love this! Wou...|          0|         8|               1|            0|             1|we love this 

In [5]:
indexer = StringIndexer(inputCol="target", outputCol="label")
indexed = indexer.fit(train_df_1).transform(train_df_1)
#indexed.show()

In [6]:
train_df_2=indexed.select([c for c in indexed.columns if c not in ['id','target','tweet']])

In [7]:
(train_set, val_set, test_set) = train_df_2.randomSplit([0.98, 0.01, 0.01], seed = 2000)

In [8]:
%%time
from pyspark.ml.evaluation import BinaryClassificationEvaluator

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

lsvc=LinearSVC(maxIter=5)
pipeline = Pipeline(stages=[tokenizer, hashtf, idf,lsvc])
pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(val_set)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("SVM Accuracy Score: {0:.4f}".format(accuracy))
print("SVM ROC-AUC: {0:.4f}".format(roc_auc))

SVM Accuracy Score: 0.8824
SVM ROC-AUC: 0.9510
Wall time: 1min 39s


In [9]:
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[tokenizer, hashtf, idf,lr])
pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(val_set)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Logistic Accuracy Score: {0:.4f}".format(accuracy))
print("Logistic ROC-AUC: {0:.4f}".format(roc_auc))

Logistic Accuracy Score: 0.8235
Logistic ROC-AUC: 0.8781
