In [1]:
%%configure -f
{
    "conf": {
        "spark.pyspark.python": "python3",
        "spark.pyspark.virtualenv.enabled": "true",
        "spark.pyspark.virtualenv.type":"native",
        "spark.pyspark.virtualenv.bin.path":"/usr/bin/virtualenv"
    }
}

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover, CountVectorizer, IDF, SQLTransformer, Tokenizer
from pyspark.ml.clustering import LDA
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.ml.feature import SQLTransformer
import boto3
import mysql.connector

# Get Spark context
spark = SparkSession.builder.getOrCreate()

def run_topic_model(df):
    '''
    Generate topic models for a dataframe.
    '''
    # Concatenate title and body:
    concatenation_expr = "SELECT *, CONCAT(title, body) AS content FROM __THIS__"
    sql_transformer = SQLTransformer(statement=concatenation_expr)
    c_df = sql_transformer.transform(df)

    # Tokenize:
    tokenizer = Tokenizer(inputCol='content', outputCol='tokens')
    tokenized_df = tokenizer.transform(c_df)

    # Remove stop words:
    stopwords = StopWordsRemover.loadDefaultStopWords("english")
    additional_stopwords = ['*', '-', '@', '&amp', '&gt', '-', '•', '/', '–', '&amp;', '&gt;']
    stopwords += additional_stopwords
    stopword_remover = StopWordsRemover(inputCol='tokens', outputCol='filtered_tokens', stopWords=stopwords)
    filtered_df = stopword_remover.transform(tokenized_df)

    # TF-IDF:
    cv = CountVectorizer(inputCol='filtered_tokens', outputCol='raw_features')
    cv_model = cv.fit(filtered_df)
    featurized_df = cv_model.transform(filtered_df)
    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_df)
    tfidf_df = idf_model.transform(featurized_df)

    # LDA model:
    num_topics = 10
    lda = LDA(k=num_topics, maxIter=10)
    lda_model = lda.fit(tfidf_df)

    # Get topic distribution:
    transformed_df = lda_model.transform(tfidf_df)

    # Get biggest topics:
    def get_most_dominant_topic(topics):
        max_index = int(topics.argmax())
        return str(max_index)
    udf_get_most_dominant_topic = udf(get_most_dominant_topic, StringType())
    transformed_df = transformed_df.withColumn('dominant_topic', udf_get_most_dominant_topic('topicDistribution'))
    topic_indices = range(num_topics)
    word_indices = cv_model.vocabulary
    topic_words = lda_model.describeTopics(maxTermsPerTopic=10)
    topics = topic_words.rdd.map(lambda row: row['termIndices']).collect()
    topic_word_list = []
    for topic in topics:
        words = [word_indices[i] for i in topic]
        topic_word_list.append(words)

    # Print each topic:
    for topic_idx, words in enumerate(topic_word_list):
        print(f"Topic {topic_idx}: {', '.join(words[1:])}")
    return
        
        
def compare_diversities(posts_df, diversity_scores_df, scale):
    '''
    Split data along diversity scale and generate two topic models.
    '''
    
    assert type(scale) == str

    # Get 2 dfs to compare:
    # Posts that rank the best on that axis
    lowest_scale_df = posts_df.join(diversity_scores_df, 'reddit_handle')\
        .filter(diversity_scores_df[scale] < 100)
    print(f'Topics for cities with the best rank in {scale}:')
    run_topic_model(lowest_scale_df)
    # Posts that rank the worst on that axis:
    highest_scale_df = posts_df.join(diversity_scores_df, 'reddit_handle')\
        .filter(diversity_scores_df[scale] > 400)
    print(f'\nTopics for cities with the worst rank in {scale}:')
    run_topic_model(highest_scale_df)

    return

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9,application_1685038600819_0013,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Fetch data:
rds_name = 'rds_project'
# Gather Reddit handles:
rds = boto3.client('rds', region_name='us-east-1')
db = rds.describe_db_instances()['DBInstances'][0]
ENDPOINT = db['Endpoint']['Address']
PORT = db['Endpoint']['Port']
conn =  mysql.connector.connect(host=ENDPOINT,
                                user="username",
                                passwd="password", 
                                port=PORT, 
                                database=rds_name)
cur = conn.cursor()

query = f"""
    SELECT column_name
    FROM information_schema.columns
    WHERE table_schema='{rds_name}'
    AND table_name='diversity_scores';
"""
cur.execute(query)
column_names = [i[0] for i in cur.fetchall()]
query = """
    SELECT * FROM diversity_scores
"""
cur.execute(query)
diversity_scores_df = spark.createDataFrame(cur.fetchall(), column_names)

query = f"""
    SELECT column_name
    FROM information_schema.columns
    WHERE table_schema='{rds_name}'
    AND table_name='posts';
"""
cur.execute(query)
column_names = [i[0] for i in cur.fetchall()]
query = """
    SELECT * FROM posts;
"""
cur.execute(query)
posts_df = spark.createDataFrame(cur.fetchall(), column_names)
conn.close()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
compare_diversities(posts_df, diversity_scores_df, 'overall_rank')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Topics for cities with the best rank in overall_rank:
Topic 0: anyone, like, know, get, looking, county, people, plant, car
Topic 1: **, pm, unemployment, best, new, &amp;#x200b;, \-, raleigh, day
Topic 2: hair, anyone, looking, good, clubs, vegan, know, get, new
Topic 3: plano,  , rockville, atlanta?, meet, goods, comments, report, links
Topic 4: , boston, cop, pizza, cinco, best, mayo, passes, local
Topic 5: vote, trash, voting, ballot, get, drop, like, around, know
Topic 6: like, looking, know, anyone, get, i’m, new, good, one
Topic 7: thai, georgia, o, \-, donuts, study, nba, anyone, beef
Topic 8: pm, ages, library), film, (riverfront, taxi, presidential, square, ride
Topic 9: vet, show, grand, comedy, know, ice, anyone, tempe, get

Topics for cities with the worst rank in overall_rank:
Topic 0: city, looking, council, anyone, know, like, good, place, places
Topic 1: gary, power, storm, guide, wyoming, king, rock, :, new
Topic 2: star, dead, de, wars, , episode, rainbow, purple, de

In [5]:
compare_diversities(posts_df, diversity_scores_df, 'socioeconomic_diversity')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Topics for cities with the best rank in socioeconomic_diversity:
Topic 0: know, anyone, dog, looking, new, like, good, best, one
Topic 1: golf, \-, speed, police, air, city, router, new, &amp;#x200b;
Topic 2: , massage, election, place, curly, gyms, therapist, presidential, salon
Topic 3: talk, friday,, franklin, warrant, covid-19, traffic,, vent, members, free
Topic 4: city, community, county, looking, may, property, housing, know, get
Topic 5: killed, police, boxing, dog, man, arrested, new, anyone, olathe
Topic 6: like, anyone, get, looking, know, i’m, good, one, people
Topic 7: [click, looking, food, meet, good, get, tattoo, new, best
Topic 8: boston, daily, best, discord, dearborn, chat, questions, things, fox
Topic 9: need, new, looking, weekly, anyone, help, rogers, get, find

Topics for cities with the worst rank in socioeconomic_diversity:
Topic 0: p.m., macon, --, school, art, &amp;#x200b;, west, beer, water
Topic 1: city, voting, ballot, need, vote, anyone, good, know, looki

In [10]:
compare_diversities(posts_df, diversity_scores_df, 'cultural_diversity')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Topics for cities with the best rank in cultural_diversity:
Topic 0: , anyone, open, taxi, war, new, commission, venue, water
Topic 1: like, know, looking, anyone, get, i’m, good, people, one
Topic 2: place, plano, bellevue, new, [click, pm, good, city, reporter
Topic 3: comedy, police, dental, suspect, mayor, show, yoga, crystal, officers
Topic 4: new, looking, anyone, pizza, get, i’m, gym, know, rent
Topic 5: fireworks, \-, discord, amtrak, ri, armed, wood, tea, like
Topic 6: springfield, lake, [springfield, ^the, ^to, events,, events, know, view
Topic 7: ages, library), roller, cake, kent, pm, (riverfront, coronavirus, county
Topic 8: best, fresh, food, place, chicken, daily, good, valley, sushi
Topic 9: new, city, voting, vote, anyone, county, like, get, may

Topics for cities with the worst rank in cultural_diversity:
Topic 0: looking, know, like, anyone, help, good, find, dog, please
Topic 1: fireworks, mississippi, christmas, judge, cases, #, holiday, among, proposed
Topic 2: , 

In [11]:
compare_diversities(posts_df, diversity_scores_df, 'economic_diversity')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Topics for cities with the best rank in economic_diversity:
Topic 0: school, waldorf, like, anyone, looking, new, board, play, good
Topic 1: fireworks, dining, waldorf, newark, parade, southwest, carson, looking, lake
Topic 2: conflict, , survival, chicken, art, pm, peace, glen, challenge
Topic 3: like, know, looking, anyone, get, i’m, good, people, one
Topic 4: miami, get, city, like, looking, league, laredo, new, de
Topic 5: cat, grand, pool, power, anyone, know, get, looking, man
Topic 6: unemployment, rate, figures, employment, labor, individual, positions, dental, force
Topic 7: police, radio, clarksville, neighbors, photo, morning, trivia, shots, tn**
Topic 8: kent, ca, tank, here’s, beaches, y, anyone, jackson, laid
Topic 9:  , eledge, donate, women’s, hair, recommendations?, looking, smash, ms.

Topics for cities with the worst rank in economic_diversity:
Topic 0: anyone, know, around, like, brunch, louis, day, find, fire
Topic 1: jazz, oil, river, anyone, nail, [best, lenexa, 

In [12]:
compare_diversities(posts_df, diversity_scores_df, 'household_diversity')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Topics for cities with the best rank in household_diversity:
Topic 0: looking, questions, anyone, good, apartments, know, like, grand, open
Topic 1: a/c, cases, antonio, moon, eastern, san, anyone, submissions, know
Topic 2: pizza, ?, man, police, know, pa, dental, game, city
Topic 3: **, anyone, know, **sat, join, ne, new, pm, food
Topic 4: get, like, looking, know, anyone, people, i’m, new, want
Topic 5: hair, shelter, ice, paterson, fence, anyone, stylist, snow, ski
Topic 6: anyone, know, best, like, around, bar, coffee, get, day
Topic 7: unemployment, anyone, looking, style, get, know, good, racine, new
Topic 8: , best, food, &amp;#x200b;, arrested, chicken, cuban, justice, sunset
Topic 9: des, salon, station, dance, moines, dancing, nail, best, trash

Topics for cities with the worst rank in household_diversity:
Topic 0: hair, unemployment, bars, good, new, looking, anyone, play, sc
Topic 1: council, city, need, weekly, new, park, tickets, help, date
Topic 2: looking, like, get, a

In [13]:
compare_diversities(posts_df, diversity_scores_df, 'religious_diversity')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Topics for cities with the best rank in religious_diversity:
Topic 0: looking, anyone, plymouth, raleigh, new, good, ice, music, play
Topic 1: events, montgomery, comments, voting, sure, please, upcoming, meet, area
Topic 2: like, know, anyone, get, looking, people, i’m, one, go
Topic 3: vote, oklahoma, coeur, gay, d'alene, register, nic, anyone, brick
Topic 4: trash, movie, open, get, greenway, clubs, black, market, i’m
Topic 5: looking, like, get, know, anyone, good, place, city, i’m
Topic 6: , mobile, rivers, sc, [my, barber, curly, recommendations, stylist
Topic 7: weekly, sioux, thread, [click, falls, like, want, talk, make
Topic 8: unemployment, rate, figures, new, bus, individual, labor, positions, workers
Topic 9: anyone, taxi, cleaning, housing, woman, ,, car, property, weekly

Topics for cities with the worst rank in religious_diversity:
Topic 0: looking, good, know, anyone, places, get, best, place, like
Topic 1: storm, propane, abortion, prison, need, clinic, dentist, i’m, 