In [1]:
spark

In [2]:
sc

In [3]:

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover, IDF

In [4]:
tweet_csv=spark.read.csv("file:///home/alish/pyspark codes/Twitter Data.csv", inferSchema=True, header=True)
tweet_csv.show(n=3)

+-----------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+----------+-------------------+-------------+--------------------+-----------+---------------+--------------+--------------------+
|   tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|      name|negativereason_gold|retweet_count|                text|tweet_coord|  tweet_created|tweet_location|       user_timezone|
+-----------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+----------+-------------------+-------------+--------------------+-----------+---------------+--------------+--------------------+
|5.70306E+17|          neutral|                           1|          null|                     null|Virgin America|                  null|   cairdin|               null|            0|@Virgi

In [5]:
data=tweet_csv.select(
    "text",col("airline_sentiment").cast("String").alias("label"))
data.show(truncate=False, n=5)

+----------------------------------------------------------------------------------------------------------------------------------+--------+
|text                                                                                                                              |label   |
+----------------------------------------------------------------------------------------------------------------------------------+--------+
|@VirginAmerica What @dhepburn said.                                                                                               |neutral |
|@VirginAmerica plus you've added commercials to the experience... tacky.                                                          |positive|
|@VirginAmerica I didn't today... Must mean I need to take another trip!                                                           |neutral |
|"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse"|negative|
|@Virg

In [6]:
label_mapping = {"neutral": 1, "positive": 2, "negative": 0}

data = data.withColumn("encoded_label", 
                          when(col("label") == "neutral", label_mapping["neutral"])
                          .when(col("label") == "positive", label_mapping["positive"])
                          .when(col("label") == "negative", label_mapping["negative"]))

# Show the result
data.show()

+--------------------+--------+-------------+
|                text|   label|encoded_label|
+--------------------+--------+-------------+
|@VirginAmerica Wh...| neutral|            1|
|@VirginAmerica pl...|positive|            2|
|@VirginAmerica I ...| neutral|            1|
|"@VirginAmerica i...|negative|            0|
|@VirginAmerica an...|negative|            0|
|@VirginAmerica se...|negative|            0|
|                null|    null|         null|
|@VirginAmerica ye...|positive|            2|
|@VirginAmerica Re...| neutral|            1|
|@virginamerica We...|positive|            2|
|@VirginAmerica it...|positive|            2|
|@VirginAmerica di...| neutral|            1|
|@VirginAmerica I ...|positive|            2|
|@VirginAmerica Th...|positive|            2|
|@VirginAmerica @v...|positive|            2|
|@VirginAmerica Th...|positive|            2|
|@VirginAmerica SF...|negative|            0|
|@VirginAmerica So...|positive|            2|
|@VirginAmerica  I...|negative|   

In [7]:
data = data.dropna()
data=data.drop('label')

In [8]:
data.show()

+--------------------+-------------+
|                text|encoded_label|
+--------------------+-------------+
|@VirginAmerica Wh...|            1|
|@VirginAmerica pl...|            2|
|@VirginAmerica I ...|            1|
|"@VirginAmerica i...|            0|
|@VirginAmerica an...|            0|
|@VirginAmerica se...|            0|
|@VirginAmerica ye...|            2|
|@VirginAmerica Re...|            1|
|@virginamerica We...|            2|
|@VirginAmerica it...|            2|
|@VirginAmerica di...|            1|
|@VirginAmerica I ...|            2|
|@VirginAmerica Th...|            2|
|@VirginAmerica @v...|            2|
|@VirginAmerica Th...|            2|
|@VirginAmerica SF...|            0|
|@VirginAmerica So...|            2|
|@VirginAmerica  I...|            0|
|I ❤️ flying @Virg...|            2|
|@VirginAmerica yo...|            2|
+--------------------+-------------+
only showing top 20 rows



In [9]:
#Split training and Testing
split_data=data.randomSplit([0.7,0.3])
train=split_data[0]

#label in test renamed to true label
test=split_data[1].withColumnRenamed("encoded_label","true_label")
train_rows=train.count()
test_rows=test.count()

print("Total train :",train_rows)
print("Total test :", test_rows)

Total train : 10240
Total test : 4392


In [10]:
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
tokenizedTrain = tokenizer.transform(train)
tokenizedTrain.show(n=5)

+--------------------+-------------+--------------------+
|                text|encoded_label|              tokens|
+--------------------+-------------+--------------------+
|"""LOL you guys a...|            2|["""lol, you, guy...|
|".@AmericanAir @U...|            0|[".@americanair, ...|
|".@united You may...|            0|[".@united, you, ...|
|"@AmericanAir  so...|            0|["@americanair, ,...|
|"@AmericanAir ""A...|            2|["@americanair, "...|
+--------------------+-------------+--------------------+
only showing top 5 rows



In [11]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), 
                       outputCol="MeaningfulWords")
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show( n=5)

+--------------------+-------------+--------------------+--------------------+
|                text|encoded_label|              tokens|     MeaningfulWords|
+--------------------+-------------+--------------------+--------------------+
|"""LOL you guys a...|            2|["""lol, you, guy...|["""lol, guys, it...|
|".@AmericanAir @U...|            0|[".@americanair, ...|[".@americanair, ...|
|".@united You may...|            0|[".@united, you, ...|[".@united, may, ...|
|"@AmericanAir  so...|            0|["@americanair, ,...|["@americanair, ,...|
|"@AmericanAir ""A...|            2|["@americanair, "...|["@americanair, "...|
+--------------------+-------------+--------------------+--------------------+
only showing top 5 rows



In [12]:
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
numericTrain = hashTF.transform(SwRemovedTrain).select(
    'encoded_label', 'MeaningfulWords', 'features')
numericTrain.show(n=3)

+-------------+--------------------+--------------------+
|encoded_label|     MeaningfulWords|            features|
+-------------+--------------------+--------------------+
|            2|["""lol, guys, it...|(262144,[2040,274...|
|            0|[".@americanair, ...|(262144,[767,4806...|
|            0|[".@united, may, ...|(262144,[467,3251...|
+-------------+--------------------+--------------------+
only showing top 3 rows



In [13]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Training Model
lr = LogisticRegression(labelCol="encoded_label", featuresCol="features",maxIter=20, regParam=0.01)
model = lr.fit(numericTrain)
print ("Training Done")

2023-12-11 15:12:35,348 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2023-12-11 15:12:35,349 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


Training Done


In [14]:
#Prepare Testing data
tokenizedTest = tokenizer.transform(test)
SwRemovedTest = swr.transform(tokenizedTest)
numericTest = hashTF.transform(SwRemovedTest)
numericTest.show( n=2)

+--------------------+----------+--------------------+--------------------+--------------------+
|                text|true_label|              tokens|     MeaningfulWords|            features|
+--------------------+----------+--------------------+--------------------+--------------------+
|".@JetBlue ooooka...|         0|[".@jetblue, oooo...|[".@jetblue, oooo...|(262144,[1512,138...|
|".@united It's wo...|         0|[".@united, it's,...|[".@united, worth...|(262144,[41407,51...|
+--------------------+----------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [15]:
#Prediction
raw_prediction = model.transform(numericTest)
raw_prediction.printSchema()

root
 |-- text: string (nullable = true)
 |-- true_label: integer (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- MeaningfulWords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [16]:
Final_prediction = raw_prediction.select("MeaningfulWords", "prediction", "true_label")
Final_prediction.show(n=20)

+--------------------+----------+----------+
|     MeaningfulWords|prediction|true_label|
+--------------------+----------+----------+
|[".@jetblue, oooo...|       1.0|         0|
|[".@united, worth...|       0.0|         0|
|["@americanair, ,...|       0.0|         0|
|["@americanair, -...|       0.0|         0|
|["@americanair, @...|       0.0|         0|
|["@americanair, a...|       0.0|         0|
|["@americanair, a...|       0.0|         0|
|["@americanair, a...|       0.0|         0|
|["@americanair, d...|       0.0|         0|
|["@americanair, h...|       0.0|         1|
|["@americanair, "...|       0.0|         0|
|["@americanair, u...|       0.0|         0|
|["@americanair, o...|       0.0|         0|
|["@americanair, l...|       0.0|         0|
|["@americanair, t...|       0.0|         0|
|["@americanair, h...|       0.0|         0|
|["@americanair, l...|       0.0|         0|
|["@americanair, e...|       0.0|         0|
|["@americanair, f...|       0.0|         0|
|["@americ

In [17]:
Total_True=Final_prediction.filter(Final_prediction['prediction']==Final_prediction['true_label']).count()
Alldata=Final_prediction.count()
Accuracy=Total_True/Alldata
print("Accuracy Score is:", Accuracy*100, '%')

                                                                                

Accuracy Score is: 73.86156648451731 %


In [18]:
# Create a confusion matrix
confusion_matrix = Final_prediction.groupBy("true_label", "prediction").count()
confusion_matrix.orderBy(col("true_label").asc()).show()

[Stage 46:>                                                         (0 + 1) / 1]

+----------+----------+-----+
|true_label|prediction|count|
+----------+----------+-----+
|         0|       2.0|   88|
|         0|       1.0|  270|
|         0|       0.0| 2426|
|         1|       0.0|  393|
|         1|       1.0|  419|
|         1|       2.0|  101|
|         2|       1.0|   96|
|         2|       2.0|  399|
|         2|       0.0|  200|
+----------+----------+-----+



                                                                                

In [None]:
#Real-time prediction:
input_sentence = "I hate this movie"

# Create a DataFrame with a single column named 'text'
data = [(0, input_sentence)]

df = spark.createDataFrame(data, ["id", "text"])

tokenizedTest = tokenizer.transform(df)
SwRemovedTest = swr.transform(tokenizedTest)
numericTest = hashTF.transform(SwRemovedTest)
raw_prediction_test = model.transform(numericTest)
Final_prediction_test = raw_prediction_test.select("MeaningfulWords", "prediction")
label_mapping_test= { "0.0": "negative", "1.0": "neutral", "2.0": "positive"}

final_pred = Final_prediction_test.withColumn("prediction", 
                          when(col("prediction") == "0.0", label_mapping_test["0.0"])
                          .when(col("prediction") == "1.0", label_mapping_test["1.0"])
                          .when(col("prediction") == "2.0", label_mapping_test["2.0"]))

final_pred.show()

In [None]:
from pywebio.input import *
from pywebio.output import *
from pywebio.session import run_js



def SentimentAnalysis(input_sentence):
    data = [(0, input_sentence)]
    df = spark.createDataFrame(data, ["id", "text"])

    tokenizedTest = tokenizer.transform(df)
    SwRemovedTest = swr.transform(tokenizedTest)
    numericTest = hashTF.transform(SwRemovedTest)
    raw_prediction_test = model.transform(numericTest)
    Final_prediction_test = raw_prediction_test.select("MeaningfulWords", "prediction")
    label_mapping_test= { "0.0": "Negative", "1.0": "Neutral", "2.0": "Positive"}
    final_pred = Final_prediction_test.withColumn("prediction", 
                          when(col("prediction") == "0.0", label_mapping_test["0.0"])
                          .when(col("prediction") == "1.0", label_mapping_test["1.0"])
                          .when(col("prediction") == "2.0", label_mapping_test["2.0"]))
    
    final_value = final_pred.select("prediction").first()[0]

    return final_value
    

def main():
    put_button("ReUpload_images",onclick=lambda: run_js('window.location.reload()'))
    while True:
        input_sentence = input("Please Input Sentence：", type = 'text')
        final_value = SentimentAnalysis(input_sentence)
        if final_value=='Positive':
            put_table([
                ['Your sentence:', input_sentence],
                ['Sentiment analysis result:', final_value]
            ]).style('color: green; font-size: 24px')
        elif final_value == 'Negative':
            put_table([
                ['Your sentence:', input_sentence],
                ['Sentiment analysis result:', final_value]
            ]).style('color: red; font-size: 24px')
        else:
            put_table([
                ['Your sentence:', input_sentence],
                ['Sentiment analysis result:', final_value]
            ]).style('color: gray; font-size: 24px')
        
if __name__ == '__main__':
    main()