# Exploring data

Importing libraries, starting session and reading file

In [1]:
### Importing necessary libraries

import time
from datetime import datetime
import pandas as pd
import numpy as np
import json

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F

import re
from pyspark.sql.types import StringType, TimestampType, FloatType

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
## Starting Spark session
# Memory limits are core (JVM-level) settings. They only take effect when they
# are handed to the builder *before* the session is created; changing them via
# spark.conf.set() on a running session does not resize the JVM, and Spark 3
# rejects such calls unless a legacy flag is flipped.
# NOTE: getOrCreate() reuses an already running session in this kernel, in
# which case these values are not re-applied - restart the kernel to change them.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Greta")
    .config("spark.sql.debug.maxToStringFields", 500)
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "4g")
    .getOrCreate()
)

In [None]:
## Load the newline-delimited JSON file from HDFS into a dataframe
df = spark.read.format("json").load("hdfs://localhost:9000/ca2/Greta/greta.ndjson")

In [None]:
## Print the inferred schema, then preview the first 20 rows (cells truncated)
df.printSchema()
df.show(n=20, truncate=True)

In [None]:
## Counting the number of rows (tweets) in the raw dataframe
df.count()

In [None]:
## Printing the dataframe's top-level column names, one per line
# The loop variable is deliberately not named `col`: that would overwrite the
# `col` function pulled in by `from pyspark.sql.functions import *`.
for column_name in df.columns:
    print(column_name)

For the univariate analysis, only the timestamp and the text of each tweet are extracted.

In [None]:
## Preview the two columns of interest (first 10 rows, long cells truncated)
df.select(["created_at", "full_text"]).show(n=10, truncate=True)

In [None]:
## Show a single row without truncation to inspect the complete tweet text
df.select(["created_at", "full_text"]).show(n=1, truncate=False)

# Cleaning data

Fixing timestamps, cleaning text, reducing data for further analysis

In [None]:
## Working dataframe holding only the timestamp and text columns
df_work = df.select(F.col("created_at"), F.col("full_text"))

In [None]:
## Creating function for cleaning texts
import re
def clean_text(text):
    """Normalise a raw tweet into a string of lowercase alphabetic tokens.

    Processing order: lowercase the text, blank out the words 'greta' and
    'thunberg', remove @mentions and URLs, replace every non-letter with a
    space, and finally keep only tokens longer than two characters.

    Args:
        text: Raw tweet text, or None (a null cell reaches a UDF as None).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        the input is None or nothing survives the cleaning.
    """
    # A null value would otherwise raise AttributeError on .lower().
    if text is None:
        return ''
    text = text.lower()
    text = re.sub(r'greta', ' ', text)
    text = re.sub(r'thunberg', ' ', text)
    text = re.sub(r'@[a-zA-Z0-9_]+', ' ', text)
    text = re.sub(r'https?://[A-Za-z0-9./]+', ' ', text)
    # The dot is escaped so only real "www." prefixes match (an unescaped dot
    # also swallowed words such as "awwwesome"); \S stops at any whitespace,
    # so a URL at the end of a line no longer eats the start of the next line.
    text = re.sub(r'www\.\S+', ' ', text)
    text = re.sub(r'[a-zA-Z0-9]*www[a-zA-Z0-9]*com[a-zA-Z0-9]*', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # split() already collapses runs of whitespace and never yields empty tokens.
    return ' '.join(token for token in text.split() if len(token) > 2)

In [None]:
## Wrap clean_text as a string-returning Spark UDF so it can be applied to a column
cleanUDF = F.udf(clean_text, StringType())

In [None]:
## Add a 'Text' column holding the cleaned version of 'full_text'
df_work = df_work.withColumn('Text', cleanUDF(df_work['full_text']))

In [None]:
## Creating function for reshaping timestamp
def createTimestamp(created_at):
    """Reformat a tweet's ``created_at`` string as ``YYYY-MM-DD HH:MM:SS``.

    Args:
        created_at: Timestamp text in the form
            ``'%a %b %d %H:%M:%S +0000 %Y'`` (for example
            ``'Wed Oct 10 20:19:24 +0000 2018'``), or None.

    Returns:
        The reformatted timestamp string, or None when the input is None.

    Raises:
        ValueError: If the string does not match the expected format.
    """
    # A null cell arrives as None; strptime would raise TypeError on it.
    if created_at is None:
        return None
    parsed = datetime.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')
    return parsed.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
## Wrap createTimestamp as a string-returning Spark UDF so it can be applied to a column
timestampUDF = F.udf(createTimestamp, StringType())

In [None]:
## Add a 'Timestamp' column holding the reformatted 'created_at' value
df_work = df_work.withColumn('Timestamp', timestampUDF(df_work['created_at']))

In [None]:
## Remove the raw source columns now that their cleaned counterparts exist
df_work = df_work.drop('created_at', 'full_text')

In [None]:
## Keep only rows whose cleaned text still contains at least one letter
df_work = df_work.where(df_work['Text'].rlike('[a-zA-Z]'))

In [None]:
## Preview the cleaned dataset (first 20 rows, long cells truncated)
df_work.show(n=20, truncate=True)

An index column is created via the RDD API, and the result is then converted back to a dataframe.

In [None]:
## Pair every row with a running index and prepare the matching schema
from pyspark.sql.types import LongType, StructField, StructType

# Same fields as df_work, preceded by a nullable long 'rowId' column.
new_schema = StructType(
    [StructField('rowId', LongType(), True), *df_work.schema.fields]
)
# Each RDD element becomes a (row, zero_based_index) pair.
zip_rdd = df_work.rdd.zipWithIndex()

In [None]:
## Flatten each (row, index) pair into [index + 1, *row values] so ids start at 1
new_rdd = zip_rdd.map(lambda pair: [pair[1] + 1, *pair[0]])

In [None]:
## Rebuild the working dataframe from the indexed RDD using the extended schema
df_work = spark.createDataFrame(data=new_rdd, schema=new_schema)

In [None]:
## Preview the first 5 rows of the indexed dataset
df_work.show(n=5)

In [None]:
## Keep every 10th row only, because the available hardware cannot handle the full row count
df_work = df_work.filter(F.col('rowId') % 10 == 0)

In [None]:
## Getting the number of rows left after down-sampling
df_work.count()

# Sentiment analysis

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
## Collect the cleaned text column to the driver as a pandas DataFrame
blob = df_work.select(df_work['Text']).toPandas()

In [None]:
## Build a word cloud of the 50 most frequent words across all cleaned tweets
wc = WordCloud(
    background_color='white',
    stopwords=set(STOPWORDS),
    max_words=50,
    random_state=42,  # fixed seed
)
# The cloud is generated from all tweets joined into one space-separated string.
wc.generate(' '.join(blob['Text'].tolist()))

In [None]:
## Render the word cloud on a 10x10 inch figure
plt.figure(figsize=(10,10))
plt.imshow(wc)

In [None]:
## Create the sentiment analyser that the polarity scoring below relies on
# NOTE(review): presumably needs the NLTK 'vader_lexicon' resource to be
# downloaded beforehand - confirm on a fresh environment.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [None]:
def polarity(text):
    """Return the 'compound' sentiment score the module-level analyser gives *text*."""
    scores = analyser.polarity_scores(text)
    return scores['compound']

In [None]:

## Wrap polarity as a float-returning Spark UDF so it can be applied to a column
polarityUDF = F.udf(polarity, FloatType())

In [None]:
## Add a 'polarity' column with the sentiment score of each cleaned tweet
df_work = df_work.withColumn('polarity', polarityUDF(df_work['Text']))

In [None]:
## Preview the first 5 rows including the new polarity column
df_work.show(n=5)