The purpose of this notebook is to pre-process all our data once, so that we do not have to import it again every time we start a new instance of the main notebook. This makes working in the main notebook much more time-efficient.

In [None]:
# Pin the Python string-hash seed for this session.
# NOTE(review): presumably needed so string hashing is consistent across Spark's Python workers - confirm.
%env PYTHONHASHSEED 3
# Install a headless Java 8 JDK quietly (stdout is discarded).
# NOTE(review): presumably the Java runtime PySpark needs - confirm.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q pyspark

env: PYTHONHASHSEED=3
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
def remove_excess(list_of_str):
  """Strip trailing punctuation from each string and return the distinct results.

  Args:
    list_of_str: iterable of strings (e.g. raw feature names).

  Returns:
    A set containing every input string with any trailing run of the
    characters apostrophe, colon, exclamation mark, period, backslash,
    comma or slash removed. Duplicates collapse into one entry.

  The input is left unmodified.
  """
  # The backslash is escaped explicitly: the unescaped "\," form is an invalid
  # escape sequence that newer Python versions warn about. The resulting set
  # of stripped characters is exactly the same as before.
  problem_chars = "':!.\\,/"
  # Build the result directly instead of overwriting the caller's list in place.
  return {text.rstrip(problem_chars) for text in list_of_str}

In [None]:
from math import sqrt
import pyspark
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


# Create (or reuse) a Spark session that runs locally on all available cores,
# with 1g of executor memory and the Spark UI on port 4050.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Twitter Analysis')
    .config("spark.executor.memory", "1g")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)
# Low-level entry point used for the RDD API.
sc = spark.sparkContext

# Mount Google Drive at /content/drive; the input file read further down
# in this cell is located under that mount point.
from google.colab import drive
drive.mount("/content/drive")

# Ids (as strings) of the users whose features are kept; all other users are dropped.
influential_users = ['15924858', '14719129', '53235381', '11928542', '215824411', '90880254', '18996905', '262310943', '100318079', '24542441']


def _parse_featname_line(line):
    """Turn one line of featnames_rdd.txt into a (user_id, feature_name) pair.

    The line is cut at ", ". The feature name is the second space-separated
    token of the first part; the user id is the text between 'r/' and the
    next '.' in the second part.
    NOTE(review): the exact file layout is not visible here - presumably the
    second part is a path ending in "<user_id>.featnames"; confirm.
    """
    fields = line.split(", ")
    user_id = fields[1].split('r/')[1].split('.')[0]
    feature_name = fields[0].split(" ")[1]
    return (user_id, feature_name)


# Frozen set so the membership test inside the Spark filter is a hash lookup.
_influential_user_ids = frozenset(influential_users)

# Result: RDD of (user_id, set_of_cleaned_feature_names), one record per kept user.
feats = (
    sc.textFile('/content/drive/My Drive/twitter_analysis/featnames_rdd.txt')
    .map(_parse_featname_line)
    # Filter BEFORE grouping so that only the selected users' records are shuffled.
    .filter(lambda pair: pair[0] in _influential_user_ids)
    .groupByKey()
    # groupByKey yields an iterable per key; materialise it as a list before cleaning.
    .mapValues(lambda names: remove_excess(list(names)))
)

Mounted at /content/drive


In [None]:
import math

def length(vector):
    """Return the Euclidean norm of ``vector``.

    Every component is converted with ``int()`` before it is squared, so
    numeric strings such as "1" are accepted. An empty vector gives 0.0.
    """
    squares = (int(component) ** 2 for component in vector)
    return math.sqrt(sum(squares))

def cosine_similarity(u, v):
    """Return the cosine similarity of ``u`` and ``v``.

    Two input forms are supported:

    * Both arguments are sets (or frozensets) of hashable items, e.g. sets of
      feature names. Each set is treated as a binary indicator vector, so the
      result is ``|u & v| / sqrt(|u| * |v|)``.
    * Otherwise the arguments are treated as ordered vectors whose components
      are convertible with ``int()``. The dot product pairs components
      positionally with ``zip``, which stops at the shorter input.

    Returns 0.0 when either input has zero norm (including empty inputs)
    instead of raising ZeroDivisionError.
    """
    set_types = (set, frozenset)
    if isinstance(u, set_types) and isinstance(v, set_types):
        # Sets have no order and may hold non-numeric items, so positional
        # zip/int() arithmetic is meaningless here; use the overlap instead.
        if not u or not v:
            return 0.0
        return len(u & v) / math.sqrt(len(u) * len(v))

    dotprod = sum(int(a) * int(b) for a, b in zip(u, v))
    norm_product = length(u) * length(v)
    if norm_product == 0:
        return 0.0
    return dotprod / norm_product


In [None]:
#feats.count()
#feats = feats.map(lambda x: (x.split(",")[1], x.split(",")[0]).take(10)
#feats#.map(lambda x: (x.file_path.split('r/')[1].split('.')[0], x.value.split()[1])).groupByKey().mapValues(list)

In [None]:
# Spark action: fetch up to 10 (user_id, feature-name set) records for a quick visual check.
feats.take(10)

[('333881828',
  {'#1',
   '#FF',
   '#RT',
   '@AlyMew',
   '@B_Loner',
   '@ChillTreehugger',
   '@ClubbyDubby',
   '@DanielHyon',
   '@DavidVonderhaar',
   '@EMPTECLIP5E',
   '@EatMyDiction1',
   '@FuzzyOtterBalls',
   '@FxyMxy',
   '@GoldGloveTV',
   '@IMortaLTechNyQ',
   '@KittyRavage',
   '@Klutch7',
   '@LethalFrag',
   '@Mal_Mal2011',
   '@Meatholl',
   '@MediocreMadame',
   '@MistaCheapSkate',
   '@MousePena',
   '@MrAran551',
   '@MrErnestLe',
   '@MrsExile',
   '@MsArtemis01',
   '@NasaFromNYC',
   '@OhShit_ItsBerlo',
   '@Peopleschamp330',
   '@Pewdie',
   '@Phantom_Legend',
   '@Puddock3000',
   '@Pwnedbyagiirl',
   '@QuakeBeatZ',
   '@Sarcasmx',
   '@Schoolyy',
   '@ShiftHappensBro',
   '@Shino775',
   '@Siggyv',
   '@Sinumatic',
   '@SoCalMeg',
   '@SwagChica',
   '@TheKingNappy',
   '@The_SilviaKid',
   '@Tiffababyy',
   '@TmarTn',
   '@UnlawfulExile',
   '@VegasJamie',
   '@WGUnite',
   '@Yeousch',
   '@YouTube',
   '@Zerosion',
   '@_TwitTwit',
   '@aWhitebrownie',
  

In [None]:
def _is_ordered_pair(pair):
    """True when the left user id compares lower than the right one.

    Keeps each unordered pair of distinct users once and drops self-pairs.
    """
    (left_id, _), (right_id, _) = pair
    return left_id < right_id


def _score_pair(pair):
    """Map a pair of (user_id, features) records to (left_id, right_id, similarity)."""
    (left_id, left_feats), (right_id, right_feats) = pair
    return (left_id, right_id, cosine_similarity(left_feats, right_feats))


# All combinations of two user records, reduced to one entry per distinct pair.
user_pairs_rdd = feats.cartesian(feats).filter(_is_ordered_pair)
# (user_a, user_b, cosine similarity of their feature collections)
similarities_rdd = user_pairs_rdd.map(_score_pair)

In [None]:
# Spark action: fetch up to 5 user pairs to inspect the structure of the pair RDD.
user_pairs_rdd.take(5)