In [None]:
# Extracting the information from the JSON file into a CSV
# Author of code: Samsung Lim
# Date: 17th October 2021
# Modified by Kathleen Shalini Rome

import pandas as pd

# all
df_all = pd.read_json('tweets_melb_earthquake_test.json', encoding='utf-8', lines=True)

df_all.to_csv('Melb__all.csv', encoding='utf-8')

# Extracting column data into CSV files.
# user
user = pd.json_normalize(df_all['user'])
user.to_csv('Melb__user.csv', encoding='utf-8')

#entities
entities = pd.json_normalize(df_all['entities'])
entities.to_csv('Melb__entities.csv', encoding='utf-8')


In [9]:
# To deal with the Type error - argument of type ;float; is not iterable for extended tweet 
# Source: https://coderedirect.com/questions/166716/how-to-json-normalize-a-column-with-nans
def flatten_json(nested_json, exclude=['']):
    """Flatten json object with nested keys into a single level.
        Args:
            nested_json: A nested json object.
            exclude: Keys to exclude from output.
        Returns:
            The flattened json object if successful, None otherwise.
    """
    out = {}

    def flatten(x, name='', exclude=exclude):
        if type(x) is dict:
            for a in x:
                if a not in exclude: flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

extended_tweets = pd.DataFrame([flatten_json(x) for x in df_all['extended_tweet']])

extended_tweets.to_csv('Melb__extended_tweet.csv', encoding='utf-8')


In [11]:
extended_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107286 entries, 0 to 107285
Columns: 733 entries,  to extended_entities_media_3_description
dtypes: float64(373), object(360)
memory usage: 600.0+ MB


# Merge Columns

In [14]:
df_col_merged = pd.concat([df_all, user, entities, extended_tweets], axis=1)

# df_col_merged.to_csv('Seroja__col_merged.csv', encoding='utf-8')
 # Can improve it by adding user.id etc to make it more specific where it came from. 
#df_col_merged.to_csv('Seroja__2_col_merged.csv', encoding='utf-8')
df_col_merged.to_csv('Melb__col_merged.csv', encoding='utf-8') 

# I had to manually delete the extra lang col from the user col


In [18]:
df_col_merged = pd.read_csv('Melb__col_merged.csv')

  interactivity=interactivity, compiler=compiler, result=result)


# Delete duplicate Tweets and tweets that dont have 'en' (english) specified in their lang field. 

In [32]:
# Remove duplicates incase multiple retweets of the same retweet:
df_col_merged.drop_duplicates(subset='text',inplace=True)
# Removing non english tweets
# Source: https://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value
df_col_merged.drop(df_col_merged.loc[df_col_merged['lang']!='en'].index, inplace=True)

In [31]:
df_col_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62851 entries, 0 to 107284
Columns: 815 entries, Unnamed: 0 to extended_entities_media_3_description
dtypes: bool(13), float64(464), int64(10), object(328)
memory usage: 385.8+ MB


In [33]:
df_col_merged.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 41767 entries, 1 to 107284
Columns: 815 entries, Unnamed: 0 to extended_entities_media_3_description
dtypes: bool(13), float64(464), int64(10), object(328)
memory usage: 256.4+ MB


# Extract only the key featured columns required for analysis

In [34]:
cols_2 = ['created_at','id','text','source','name', 'screen_name','location', 'description', 'followers_count', 'friends_count', 'geo_enabled', 'time_zone','geo','coordinates','place','full_text', 'timestamp_ms','lang']

# Just considering 'entities' is useful to have them extracted from the text already. So might want to have those as well. 

df_filtered = df_col_merged[cols_2]
# Saving the reduced dataframe to a csv
df_filtered.to_csv('Melb_filtered_cols.csv', encoding = 'utf-8')

In [35]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41767 entries, 1 to 107284
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   created_at       41767 non-null  object 
 1   id               41767 non-null  float64
 2   text             41767 non-null  object 
 3   source           41767 non-null  object 
 4   name             41767 non-null  object 
 5   screen_name      41767 non-null  object 
 6   location         28963 non-null  object 
 7   description      37563 non-null  object 
 8   followers_count  41767 non-null  int64  
 9   friends_count    41767 non-null  int64  
 10  geo_enabled      41767 non-null  bool   
 11  time_zone        0 non-null      float64
 12  geo              2089 non-null   object 
 13  coordinates      2089 non-null   object 
 14  place            2383 non-null   object 
 15  full_text        14183 non-null  object 
 16  timestamp_ms     41767 non-null  object 
 17  lang       

# Preprocessing Text
Removing urls, @ mentions and Retweets, hashtags, audio and video tags, double space, strip punctuation, remove numbers???
Tokenise
Lemamtize

In [25]:
# Source: https://github.com/bicachu/topic-modeling-health-tweets/blob/master/notebooks/clean_tokenizer.py

import pandas as pd
import re
import gensim
from nltk.stem import WordNetLemmatizer

punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'         # define a string of punctuation symbols

# Functions to clean tweets
def remove_links(tweet):
    """Takes a string and removes web links from it"""
    tweet = re.sub(r'http\S+', '', str(tweet))   # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', str(tweet)) # remove bitly links
    tweet = tweet.strip('[link]')   # remove [links]
    tweet = re.sub(r'pic.twitter\S+','', str(tweet))
    return tweet

def remove_users(tweet):
    """Takes a string and removes retweet and @user information"""
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', str(tweet))  # remove re-tweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', str(tweet))  # remove tweeted at
    return tweet

def remove_hashtags(tweet):
    """Takes a string and removes any hash tags"""
    tweet = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', str(tweet))  # remove hash tags
    return tweet

def remove_av(tweet):
    """Takes a string and removes AUDIO/VIDEO tags or labels"""
    tweet = re.sub('VIDEO:', '', str(tweet))  # remove 'VIDEO:' from start of tweet
    tweet = re.sub('AUDIO:', '', str(tweet))  # remove 'AUDIO:' from start of tweet
    return tweet

def lemmatize(token):
    """Returns lemmatization of a token"""
    return WordNetLemmatizer().lemmatize(token, pos='v')

def tokenize(tweet):
    """Returns tokenized representation of words in lemma form excluding stopwords"""
    result = []
    for token in gensim.utils.simple_preprocess(tweet):
        if token not in gensim.parsing.preprocessing.STOPWORDS \
                and len(token) > 2:  # drops words with less than 3 characters
            result.append(lemmatize(token))
    return result


def preprocess_tweet(tweet):
    """Main master function to clean tweets, stripping noisy characters, and tokenizing use lemmatization"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lower case
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub('\s+', ' ', tweet)  # remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet)  # remove numbers
    tweet_token_list = tokenize(tweet)  # apply lemmatization and tokenization
    tweet = ' '.join(tweet_token_list)
    return tweet

def basic_clean(tweet):
    """Main master function to clean tweets only without tokenization or removal of stopwords"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lower case
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub('\s+', ' ', tweet)  # remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet)  # remove numbers
    tweet = re.sub('📝 …', '', tweet)
    return tweet

def shal_clean(tweet):
    """Main master function to clean tweets, stripping noisy characters, and tokenizing use lemmatization"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_av(tweet)
    tweet = re.sub('[' + punctuation + ']+', ' ', str(tweet))  # strip punctuation
    tweet = re.sub('\s+', ' ', str(tweet))  # remove double spacing
    # tweet = re.sub('([0-9]+)', '', tweet)  # remove numbers
    tweet = re.sub('📝 …', '', str(tweet))
    tweet_token_list = tokenize(tweet)  # apply lemmatization and tokenization
    tweet = ' '.join(tweet_token_list)
    return tweet

def tokenize_tweets(df):
    """Main function to read in and return cleaned and preprocessed dataframe.
    This can be used in Jupyter notebooks by importing this module and calling the tokenize_tweets() function
    Args:
        df = data frame object to apply cleaning to
    Returns:
        pandas data frame with cleaned tokens
    """

    # df['tokens'] = df.tweet.apply(preprocess_tweet)
    df['tokens'] = df['text'].apply(shal_clean)

    num_tweets = len(df)
    print('Complete. Number of Tweets that have been cleaned and tokenized : {}'.format(num_tweets))
    return df

def tokenize_user_desc(df):
    """Main function to read in and return cleaned and preprocessed dataframe.
    This can be used in Jupyter notebooks by importing this module and calling the tokenize_tweets() function
    Args:
        df = data frame object to apply cleaning to
    Returns:
        pandas data frame with cleaned tokens
    """

    # df['tokens'] = df.tweet.apply(preprocess_tweet)
    df ['tokens_desc'] = df['description'].apply(shal_clean)

    num_tweets = len(df)
    print('Complete. Number of descriptions that have been cleaned and tokenized : {}'.format(num_tweets))
    return df

def tokenize_full_text(df):
    """Main function to read in and return cleaned and preprocessed dataframe.
    This can be used in Jupyter notebooks by importing this module and calling the tokenize_tweets() function
    Args:
        df = data frame object to apply cleaning to
    Returns:
        pandas data frame with cleaned tokens
    """

    # df['tokens'] = df.tweet.apply(preprocess_tweet)
    df ['tokens_full_text'] = df['full_text'].apply(shal_clean)

    num_tweets = len(df)
    print('Complete. Number of full texts that have been cleaned and tokenized : {}'.format(num_tweets))
    return df

# ERROR: 
# C:\Users\sy6sh\Anaconda3\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
# warnings.warn(msg)

In [24]:
df_filtered = pd.read_csv('Melb_filtered_cols.csv', encoding = 'utf-8')
df_filtered = tokenize_tweets(df_filtered)
df_filtered = tokenize_user_desc(df_filtered)
df_filtered = tokenize_full_text(df_filtered)
# To account for this error - TypeError: expected string or bytes-like object - the tokenize tweet clean tweets classes were updated to make the tweet field a string. https://stackoverflow.com/questions/43727583/re-sub-erroring-with-expected-string-or-bytes-like-object

df_filtered.to_csv('Melb__tokenised.csv', encoding = 'utf-8')

Complete. Number of Tweets that have been cleaned and tokenized : 41767
Complete. Number of Tweets that have been cleaned and tokenized : 41767
Complete. Number of Tweets that have been cleaned and tokenized : 41767


In [21]:
df_filtered_user_desc = pd.read_csv('Melb_filtered_cols.csv', encoding = 'utf-8')

df_filtered_user_desc = tokenize_tweets(df_filtered_user_desc)
# To account for this error - TypeError: expected string or bytes-like object - the tokenize tweet clean tweets classes were updated to make the tweet field a string. https://stackoverflow.com/questions/43727583/re-sub-erroring-with-expected-string-or-bytes-like-object

df_filtered_user_desc.to_csv('Melb__user_desc_tokenised.csv', encoding = 'utf-8')

Complete. Number of Tweets that have been cleaned and tokenized : 41767


# Dividing the file into training, test and validation sets
Breaking it down into 7000 tweets to run through geograpy3

In [None]:
https://stackoverflow.com/questions/36445193/splitting-one-csv-into-multiple-files

import os


def split(filehandler, delimiter=',', row_limit=15000:
          output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    import csv
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_out_path = os.path.join(
        output_path,
        output_name_template % current_piece
    )
    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
    current_limit = row_limit
    if keep_headers:
        headers = reader.next()
        current_out_writer.writerow(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = os.path.join(
                output_path,
                output_name_template % current_piece
            )
            current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)

split(open('/your/pat/input.csv', 'r'));


In [12]:
csvfile = open('Melb__tokenised.csv', 'r', encoding='utf-8').readlines()
>>> filename = 1
>>> for i in range(len(csvfile)):
...     if i % 14000 == 0: #Dividing 40,000 tweets into 3 files approximately the same size. 
...         open(str(filename) + '.csv', 'w+', encoding='utf-8').writelines(csvfile[i:i+1000])
...         filename += 1

# Resulted in 8 460 lined excels

In [8]:
import pandas as pd


#csv file name to be read in 

in_csv = 'Melb__tokenised.csv'


#get the number of lines of the csv file to be read

number_lines = sum(1 for row in (open(in_csv)))


#size of rows of data to write to the csv, 

#you can change the row size according to your need

rowsize = 7000


#start looping through data writing it to a new file for each set

for i in range(1,number_lines,rowsize):

    df = pd.read_csv(in_csv,

          header=None,

         nrows = rowsize,#number of rows to read at each loop

          skiprows = i)#skip rows that have been read


    #csv to write data to a new file with indexed name. input_1.csv etc.

    out_csv = 'input' + str(i) + '.csv'


    df.to_csv(out_csv,

          index=False,

          header=False,

          mode='a',#append data to csv file

          chunksize=rowsize)#size of data to append for each loop

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 2859: character maps to <undefined>

# Geograpy3

In [7]:
import geograpy3
import pandas as pd

df_filtered = pd.read_csv("Melb__tokenised.csv")

df_filtered['place_context'] = df_filtered['tokens'].apply(geograpy.get_place_context(text = 'tokens'))  


ModuleNotFoundError: No module named 'geograpy'