## CA 2 - Big Data & Advanced Analytics

### Using Pyspark for data exploration

In [1]:
# clear the cache on the spark session
spark.catalog.clearCache()

In [2]:
# what version of pyspark is running on the computer using SparkContext
sc

In [3]:
# sc master - running locally
sc.master

'local[*]'

In [4]:
# Import regex module
import re
from operator import add

# Import Pyspark
import pyspark
from pyspark.sql import SparkSession

# Other Libraries 
import findspark
findspark.init()

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler

#### Import File from the Hadoop Directory

<b>References</b>
1. https://sparkbyexamples.com/pyspark/pyspark-read-csv-file-into-dataframe/
2. https://medium.com/@ashutoshkumar2048/spark-connect-apache-spark-3-4-9846c40484d0
3. 

In [5]:
# start the spark session for CA2

spark = SparkSession.builder.appName("ca2").getOrCreate()

In [6]:
# Create a structure to hold the data, name and define data types
schema = StructType([
    StructField("target", IntegerType(), True),
    StructField("id", StringType(), True),
    StructField("date", StringType(), True),
    StructField("query", StringType(), True),
    StructField("author", StringType(), True),
    StructField("tweet", StringType(), True)])


In [7]:
# Read input file from hadoop directory on the local drive
# import the csv file from hadoop
path = "/user1/twitter_DS_1yr.csv"

df = spark.read.csv(path, header=False, inferSchema=True, schema=schema)


In [8]:
# Import check - show the first 10 rows of the imported table

#df.show(10)

#### Exploring the dataframe

In [9]:
# Get the number of rows
#num_rows = df.count()

# Get the number of columns
#num_columns = len(df.columns)

# Print the shape
#print("Number of rows: ", num_rows)
#print("Number of columns: ", num_columns)

                                                                                

Number of rows:  1600000
Number of columns:  6


In [10]:
# look at types of values in the polarity

#print(f"There is {df[df['target']==4].count()} positive values in the dataframe.")
#
#print(f"There is {df[df['target']==0].count()} negative values in the dataframe.")

                                                                                

There is 800000 positive values in the dataframe.




There is 800000 negative values in the dataframe.


                                                                                

In [11]:
# drop duplicate entries

df = df.dropDuplicates()

#### Feature Engineering

In [12]:
# import relevant libaries

from pyspark.sql.functions import udf, regexp_replace, lower
from pyspark.sql.types import StringType


In [13]:
# drop any n/a rows

df.dropna()

DataFrame[target: int, id: string, date: string, query: string, author: string, tweet: string]

In [14]:
# remove noise such as html links, stop words / punctuation / #hashtags etc
# REFERENCE: https://medium.com/towards-artificial-intelligence/large-scale-sentiment-analysis-with-pyspark-bdccf9256e35

def pre_process(text):
    # Remove links
    #text = re.sub('http://\S+|https://\S+', '', text)
    #text = re.sub('http[s]?://\S+', '', text)
    text = re.sub(r'http\S+', '', text)

    # Convert HTML references
    text = re.sub(r'&amp', 'and', text)
    text = re.sub(r'&lt', '<', text)
    text = re.sub(r'&gt', '>', text)
    #text = re.sub(' ', text)

    # Remove new line characters
    text = re.sub(r'[\r\n]+', ' ', text)
    
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)

    # Remove multiple space characters
    text = re.sub(r'\s+',' ', text)
    
    # Convert all text to lowercase
    text = text.lower()
    
    return text

In [15]:
# Register the pre_process function as a UDF (User-Defined Function)

pre_process_udf = udf(pre_process, StringType())

In [16]:
# Apply the UDF to the 'Tweet_details' column and create a new column 'Processed_tweet_details'

df = df.withColumn('processed_tweet', pre_process_udf('tweet'))

In [18]:
# Show the DataFrame with the new column

#df.show(10)

                                                                                

+------+----------+--------------------+--------+---------------+--------------------+--------------------+
|target|        id|                date|   query|         author|               tweet|     processed_tweet|
+------+----------+--------------------+--------+---------------+--------------------+--------------------+
|     0|1468106999|Mon Apr 06 23:45:...|NO_QUERY|    stuartheron|@watko Shockingly...|    shockingly not! |
|     0|1686953810|Sun May 03 07:19:...|NO_QUERY|forevaguitargrl|up ad ready for c...|up ad ready for c...|
|     0|1956161267|Thu May 28 21:18:...|NO_QUERY|      SupaSista|Someone just sent...|someone just sent...|
|     0|1956910895|Thu May 28 23:00:...|NO_QUERY| jeanettiewuvsu|Wtf is wrong with...|wtf is wrong with...|
|     0|1963373312|Fri May 29 12:37:...|NO_QUERY|        XkyRauh|Only two weeks le...|only two weeks le...|
|     0|1972460907|Sat May 30 09:16:...|NO_QUERY|         jiaaaa|You gave me false...|you gave me false...|
|     0|1973029187|Sat May 3

#### Feature Extraction

<b>References</b>
1. https://medium.com/towards-artificial-intelligence/large-scale-sentiment-analysis-with-pyspark-bdccf9256e35

2. https://medium.com/@chris_42047/an-easy-guide-to-basic-twitter-sentiment-analysis-python-tutorial-1630d5213ff6

3. https://www.kaggle.com/code/muhammetzahitaydn/pyspark-sentiment-analysis-with-word2vec-embedding


In [19]:
# Import the relevant libraries tocreate a pipleline

from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer, CountVectorizer, NGram, VectorAssembler, ChiSqSelector



In [20]:
# create a tokenizer 

tokenizer = Tokenizer(inputCol="processed_tweet", outputCol="words")

In [21]:
# HashingTF: Hashing Term Frequency
# REFERENCE - https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.HashingTF.html#pyspark.ml.feature.HashingTF
# Maps a sequence of terms to their term frequencies using the hashing

hashtf = HashingTF(inputCol="words", outputCol='tf')

In [22]:
# pass the hashtf function to the IDF function
# REFERENCE - https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.IDF.html#pyspark.ml.feature.IDF
# Compute the Inverse Document Frequency (IDF) given a collection of documents ie the tweets df

idf = IDF(inputCol='tf', outputCol="features")

In [23]:
# Index labels

label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")

In [24]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression()

pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx, lr])

#### Filter data based on key word - 'weather'

##### REFERENCES 
1. https://towardsdatascience.com/sentiment-analysis-with-pyspark-bc8e83f80c35
2. https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524

In [25]:
# create a variable to search within the dataframe on

search_for = "weather"


In [26]:
df2 = df.filter(df["processed_tweet"].contains(search_for))

In [27]:
#df2.show(10)



+------+----------+--------------------+--------+---------------+--------------------+--------------------+
|target|        id|                date|   query|         author|               tweet|     processed_tweet|
+------+----------+--------------------+--------+---------------+--------------------+--------------------+
|     4|1964587188|Fri May 29 14:28:...|NO_QUERY|       SuperRob|@sparktography We...| weather.com says...|
|     0|2264778973|Sun Jun 21 04:55:...|NO_QUERY|    deirdre7194|@Es94 yay!! i did...| yay!! i didnt no...|
|     4|2060752623|Sat Jun 06 19:30:...|NO_QUERY|     HailHorror|Finishing my pack...|finishing my pack...|
|     0|2229093418|Thu Jun 18 15:16:...|NO_QUERY| nocheapthrillz|@afreshmusic hey ...| hey d! my bad, i...|
|     0|2191448262|Tue Jun 16 05:17:...|NO_QUERY|     tippielove|PS for those intr...|ps for those intr...|
|     0|2222121192|Thu Jun 18 06:31:...|NO_QUERY|  abitheamazing|so its nice weath...|so its nice weath...|
|     0|1992186304|Mon Jun 0

                                                                                

In [28]:
# Get the number of rows in the new df#
#num_rows = df2.count()

# Get the number of columns
#num_columns = len(df2.columns)



                                                                                

In [29]:
# Print the shape
#print("Number of rows: ", num_rows)
#print("Number of columns: ", num_columns)

Number of rows:  10342
Number of columns:  7


### **************************************************************



#### Sentiment extraction using TextBlob

##### REFERENCES

1. https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524
2. https://towardsdatascience.com/sentiment-analysis-with-pyspark-bc8e83f80c35
3. https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524

In [30]:
from pyspark.sql import SparkSession
from textblob import TextBlob


In [31]:
# Define a UDF to apply sentiment analysis using TextBlob

def get_sentiment(processed_tweet):
    analysis = TextBlob(processed_tweet)
    
    sentiment = analysis.sentiment.polarity

    return sentiment



In [32]:

# Register the UDF
get_sentiment_udf = spark.udf.register("get_sentiment", get_sentiment)

In [33]:
# Apply the UDF to the 'tweet' column and create a new column 'sentiment'
df = df.withColumn('sentiment', get_sentiment_udf('processed_tweet'))


In [34]:

# Show the DataFrame with the 'tweet' and 'sentiment' columns
df.select('processed_tweet', 'sentiment').show(truncate=False)

[Stage 16:>                                                         (0 + 1) / 2]

KeyboardInterrupt: 

#### Save Weather Data to a new Dataframe

In [None]:
# REFERENCE - https://sparkbyexamples.com/pyspark/pyspark-write-dataframe-to-csv-file/

#rename the dataframe

weather_tweets2 = df2

In [None]:
# save the file to a csv file

#weather_tweets.write.csv("hdfs://localhost:9000/user1/weather_tweets2")

# commented out as file already exists on Hadop

### Time Series

##### References

1. https://towardsdatascience.com/end-to-end-time-series-interpolation-in-pyspark-filling-the-gap-5ccefc6b7fc9
2. https://medium.com/@y.s.yoon/scalable-time-series-forecasting-in-spark-prophet-cnn-lstm-and-sarima-a5306153711e
3. https://medium.com/delaware-pro/interpolate-big-data-time-series-in-native-pyspark-d270d4b592a1
4. https://www.fast.ai/    and   https://github.com/fastai
6. https://docs.fast.ai/tutorial.text.html

****
Fastai is a deep learning library that was used to complete the analsysis.  This was the selected algorithm as it can be used with pyspark.  Finally the library can work with tabular and text data so can be used to complete a time series and a sentiment analysis. 

In [35]:
# Import Libraries 

import sys
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fastai.tabular import *
import six

from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType, IntegerType, FloatType

from pyspark.ml.regression import LinearRegression



In [36]:
# put the data in chronological order by date

weather_tweets2 = weather_tweets2.orderBy('date')

NameError: name 'weather_tweets2' is not defined

In [None]:
# convert weather_tweets 2 from a spark df to a Pandas df for processing
pdf = weather_tweets2.toPandas()

In [None]:
# display the resulting pandas dataframe

pdf.info()



#### Preparing for the ML Algorithm

In [None]:
# Prepare to use fastai library to complete the analysis 
# Step 1 - defining variables fpr the analysis
# REFERNCE - https://docs.fast.ai/quick_start.html


target_var = 'processed_tweet'
categorical_vars = ['target', 'author']
date_var = ['date']

In [None]:
# a list of preprocessing operations used on the data for training
# imported as part of fastai

procs = [Categorify, FillMissing, Normalize]

In [None]:
# Split data randomly, adjust according to your needs
splits = RandomSplitter(valid_pct=0.2)(range_of(pdf)) 

In [None]:
# Step 2 - build the model

to = TabularPandas(pdf, procs, cat_names, cont_names, y_names=dep_var, splits=splits)


In [None]:
# 

# Choose a batch size that fits your memory

dls = to.dataloaders(bs=64)  

In [None]:
learn = tabular_learner(dls, layers=[200,100], metrics=accuracy)
learn.fit_one_cycle(1)