The purpose of this notebook is to pre-process all our data once and save the results to Google Drive, so that we do not have to re-import the raw files every time we start a new instance of the main notebook. This makes working in the main notebook much more time-efficient.

In [None]:
%env PYTHONHASHSEED 3
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark

env: PYTHONHASHSEED=3


In [None]:
from math import sqrt
import pyspark
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName('TwitterData') \
    .config('spark.driver.memory', '16g') \
    .config('spark.executor.memory', '16g') \
    .getOrCreate()
from pyspark.sql.functions import input_file_name
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
!mkdir -p /content/drive/MyDrive/test/

Mounted at /content/drive


In [None]:
# The following gets our dataset directly from the download link.
!wget https://snap.stanford.edu/data/twitter.tar.gz
!tar -xvzf twitter.tar.gz

data_path = '/content/twitter/'


#Warning - these imports do take a long time.

# Wrapping all code necessary to generate all RDDs for each different record from the Twitter file.
def getting_files(extension):
  """Load every file in ``data_path`` with the given extension into one RDD.

  Args:
    extension: File extension to select, with or without the leading dot
      (for example "edges" or ".edges").

  Returns:
    The RDD behind a DataFrame holding one row per line of text, with the text
    in the ``value`` column and the originating file in the ``file_path`` column.

  Raises:
    FileNotFoundError: If no file in ``data_path`` has the requested extension.
  """
  # Compare the actual extension instead of testing for a substring, so that
  # "feat" does not also pick up ".egofeat" and ".featnames" files.
  suffix = '.' + extension.lstrip('.')

  # Sorted so the set of input files is handed to Spark in a deterministic order.
  file_paths = sorted(
      os.path.join(data_path, file_name)
      for file_name in os.listdir(data_path)
      if os.path.splitext(file_name)[1] == suffix
  )
  if not file_paths:
    raise FileNotFoundError(
        f"No '{suffix}' files found in {data_path!r}; has the dataset been extracted?")

  # Read all the files in a single pass and record each row's source file.
  # Passing the whole list to the reader avoids building one DataFrame per file
  # and chaining them together with a long series of unions.
  df = spark.read.text(file_paths).withColumn('file_path', input_file_name())
  return df.rdd

# Build one RDD from every file selected by "edges", then save it to Google Drive so the
# main notebook can read it back instead of repeating this import.
edges_rdd = getting_files("edges")
edges_rdd.saveAsTextFile('/content/drive/MyDrive/twitter_analysis/edges_rdd.txt') # Saves the RDD just generated, so it can be read into the main notebook.
# Note: saveAsTextFile writes plain text (the string form of each RDD element), not a
# binary format, and the 'edges_rdd.txt' path it creates is a directory of part files.

# The "feat" and "featnames" imports below are currently disabled; while they stay commented
# out, this cell does not create feat_rdd or featnames_rdd.
#feat_rdd = getting_files("feat")
#feat_rdd.saveAsTextFile('/content/drive/MyDrive/twitter_analysis/feat_rdd.txt')

#featnames_rdd = getting_files("featnames")
#featnames_rdd.saveAsTextFile('/content/drive/MyDrive/twitter_analysis/featnames_rdd.txt')

--2023-05-10 03:37:35--  https://snap.stanford.edu/data/twitter.tar.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22339604 (21M) [application/x-gzip]
Saving to: ‘twitter.tar.gz.6’


2023-05-10 03:37:38 (6.53 MB/s) - ‘twitter.tar.gz.6’ saved [22339604/22339604]

twitter/
twitter/21028234.feat
twitter/30031265.circles
twitter/745823.feat
twitter/66804457.featnames
twitter/14372486.feat
twitter/1435461.featnames
twitter/19283723.featnames
twitter/398994309.feat
twitter/11681802.edges
twitter/86560711.featnames
twitter/13747362.egofeat
twitter/143344048.egofeat
twitter/48132655.circles
twitter/266464616.featnames
twitter/31317273.edges
twitter/232706326.featnames
twitter/120459837.egofeat
twitter/54331626.egofeat
twitter/17135931.edges
twitter/317313520.egofeat
twitter/51775432.edges
twitter/121258930.featnames
twitter/258447233.cir

In [None]:
# feat_rdd and featnames_rdd have to exist before they can be saved; build them here so
# this cell no longer stops with a NameError.
feat_rdd = getting_files("feat")
featnames_rdd = getting_files("featnames")

# Save each RDD to Google Drive so it can be read into the main notebook.
# Note: saveAsTextFile writes plain text (the string form of each RDD element), not a
# binary format, and it raises an error if the target path already exists - remove old
# output before re-running this cell.
edges_rdd.saveAsTextFile('/content/drive/MyDrive/DATA301project/edges_rdd.txt')

feat_rdd.saveAsTextFile('/content/drive/MyDrive/DATA301project/feat_rdd.txt')

featnames_rdd.saveAsTextFile('/content/drive/MyDrive/DATA301project/featnames_rdd.txt')

NameError: ignored