In [1]:
import pandas as pd
import numpy as np
import json
import os
import multiprocessing as mp
from time import time
import socket
from pathlib import Path
import re
import unicodedata
import sys

import warnings
warnings.filterwarnings('ignore')

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from py4j.java_gateway import java_import
from functools import reduce
from pyspark.sql import DataFrame
from pyspark import SparkContext

# 0. Init

In [3]:
memory = '10g'
pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [4]:
try:
    spark
except NameError:
    print('Create Local SparkSession')
    spark=SparkSession.builder.config("spark.driver.host", "localhost").appName("extract-timelines").getOrCreate()
    
# IgnoreCorruptFiles
spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

sc = spark.sparkContext

Create Local SparkSession


In [5]:
# Paths to data
path_to_data = "../data/"
path_to_external_data = os.path.join(path_to_data, "external-data/")
path_to_parquets = os.path.join(path_to_data,'chunks','IDF-departments-to-analyze')
parquet_files = sorted([x for x in Path(path_to_parquets).glob("**/*.parquet")])

In [6]:
print('List files to be processed...')

fs=spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
list_status=fs.listStatus(spark._jvm.org.apache.hadoop.fs.Path(path_to_parquets))

paths=[file.getPath().toString() for file in list_status]
np.random.seed(0)
paths=np.random.permutation(sorted(parquet_files))

print('# Files:', len(paths))

List files to be processed...
# Files: 16


In [7]:
tweets=spark.read.option("encoding", "UTF-8").parquet(os.path.join(path_to_data,'chunks','IDF-departments-to-analyze'))

In [8]:
print("Number of tweets : %d" % tweets.count())
print("Number of unique users : %d" % tweets.select('user_id').distinct().count())

Number of tweets : 29647175
Number of unique users : 30651


# 1. Data cleaning

In [9]:
# UNIDECODE : remove accents
def make_trans():
    matching_string = ""
    replace_string = ""

    for i in range(ord(" "), sys.maxunicode):
        name = unicodedata.name(chr(i), "")
        if "WITH" in name:
            try:
                base = unicodedata.lookup(name.split(" WITH")[0])
                matching_string += chr(i)
                replace_string += base
            except KeyError:
                pass

    return matching_string, replace_string

def clean_text(c):
    matching_string, replace_string = make_trans()
    return F.translate(
        F.regexp_replace(c, "\p{M}", ""), 
        matching_string, replace_string
    ).alias(c)

In [10]:
def clean_dataset(df):
    
    df=df.select(
            'user_id',
            F.date_format(F.col('created_at'),"yyyy-MM-dd").alias('day').cast("date"),
            F.lower(F.col('full_text')).alias('full_text'),
            'lang',
            'Department',
            #'Code department'
            )
    
    df = df.repartition(160)

    # remove rt
    df = df.filter(~ df.full_text.startswith('rt'))
   
    # remove user ids and urls
    df = df.withColumn('full_text', F.regexp_replace('full_text', r'@[A-Za-z0-9-_]+','@mention'))
    df = df.withColumn('full_text', F.regexp_replace('full_text', 'https?://[A-Za-z0-9./]+','<url>'))
    
    # language : french
    df = df.filter(df.lang=='fr')
    
    return df

In [11]:
tweets = clean_dataset(tweets)
tweets = tweets.select('user_id', 'day', clean_text('full_text'), 'lang', 'Department')
#tweets = tweets.select('user_id', 'day', clean_text('full_text'), 'lang', 'Department', 'Code department')

In [12]:
# Cleaning: remove @mention [url] # and punctuation-smileys
#tweets = tweets.withColumn('full_text', F.regexp_replace('full_text','@mention', ''))
#tweets = tweets.withColumn('full_text', F.regexp_replace('full_text', '<url>', ''))
#tweets = tweets.withColumn('full_text', F.regexp_replace('full_text', '#', ''))
tweets = tweets.withColumn('full_text', F.regexp_replace('full_text', '[^\sa-zA-Z0-9@]', ' '))

# Remove heading and trailing whitespace
tweets = tweets.withColumn('full_text', F.ltrim(tweets.full_text))
tweets = tweets.withColumn('full_text', F.rtrim(tweets.full_text))

# 2. Symptoms analysis

In [13]:
tweets = tweets.withColumn('covid', F.col('full_text').rlike('covid|corona |coronavirus'))
tweets = tweets.withColumn('confinement', F.col('full_text').rlike('confin|quarantaine'))
tweets = tweets.withColumn('RestezChezVous', F.col('full_text').rlike('je reste chez moi|jerestechezmoi|restezchezvous|restez chez vous'))

In [14]:
print("Number of tweets mentioning COVID : %d" % tweets.filter(tweets.covid==1).count())
print("Number of tweets mentioning lockdown/quarantine : %d" % tweets.filter(tweets.confinement==1).count())

Number of tweets mentioning COVID : 150280
Number of tweets mentioning lockdown/quarantine : 158260


In [15]:
symptoms_dict_fr = {'cough' : ['toux', 'tousse'],
                   'sore_throat' : ['maux de gorge', 'mal de gorge', 'mal a la gorge'],
                   'fever' : ['fievre', 'de la temperature'],
                    #'mal de tête' : ['mal de tête','mal de crâne','mal à la tête','mal de tete','mal de crane','mal à la tete'],
                   'loss_taste' : ['perte du gout', "perte de lodorat","perte de l odorat","perdu l odorat","perdu lodorat",
                                   "perdu le gout","plus de gout","plus dodeur","plus d odeur"],
                   'skin_symptom' : ['engelures'],
                   'symptoms' : ['symptome'],
                   'breathing_difficulties' : ['difficultes a respirer', 'difficultes respiratoires', 'difficulte a respirer',
                                               'mal a respirer']
                   #'hospitalisation' : ['hôpital','hopital','hospital','réanim','reanim']
                   }

for symptom in symptoms_dict_fr.keys():
    tweets = tweets.withColumn(symptom, F.col('full_text').rlike(('|').join(symptoms_dict_fr.get(symptom))))
tweets = tweets.withColumn('nb_symptoms', sum(tweets[c].cast('long') for c in list(symptoms_dict_fr.keys())))
tweets = tweets.withColumn('contains_symptom', F.col('nb_symptoms')>=1)

In [16]:
print("Number of tweets mentionning symptoms : %d" % tweets.filter(tweets.contains_symptom==1).count())

Number of tweets mentionning symptoms : 8506


In [17]:
tweets = tweets.withColumn('full_text2', F.regexp_replace('full_text','@mention', ''))

In [18]:
# NEW FILTERS : if contains a pronum (marker of lived experience), or tweet startswith a symptom, 
# AND tweet does not contain hashtag

# Indicators of feeling
pronums = ['g','j a','j ai','jai','m a','m ai','je','me', 'mes', 'l a', 'l ai','mon','ma','son','sa','jsui','j sui','j suis','jtousse']

tweets = tweets.withColumn('pronum', 
                           (F.col('full_text').rlike(' |'.join(['^'+s for s in pronums]))) |
                          (F.col('full_text').rlike((' | ').join(pronums))))

# Startswith symptom
tweets = tweets.withColumn('full_text2', F.regexp_replace('full_text','@mention', ''))
tweets = tweets.withColumn('full_text2', F.ltrim(tweets.full_text2)) \
                .withColumn('start_symptom', (F.col('full_text2').rlike('|'.join(['^'+s for s in list(symptoms_dict_fr.keys())]))))

# Hashtags
tweets = tweets.withColumn('hashtag', F.col('full_text').contains('#'))

# Classif
tweets = tweets.withColumn('has_symptom', ((F.col('pronum')==1) | (F.col('start_symptom')==1)) & 
                           (F.col('hashtag')==0) & (F.col('contains_symptom')==1))

In [19]:
print("Number of tweets of people having symptoms : %d" % tweets.filter(tweets.has_symptom==1).count())

Number of tweets of people having symptoms : 3941


In [20]:
tweets.filter(tweets.has_symptom==1).select('full_text').show(n=20, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|full_text                                                                                                                                                                                                                                                             |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|jarrete pas de tousser ca m empeche de dormir et j ai trop peur de de reveiller ma mere                                                                                                                     