# Import Libraries

In [1]:
## Import Libraries and Dependencies ##
import glob

import pandas as pd
import numpy as np

# Show full cell contents (e.g. whole tweet text) instead of truncating them.
# None is the supported "no limit" value; the legacy -1 sentinel was deprecated
# in pandas 1.0 and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

# Path

In [2]:
# Root folder of the project datasets. This is an absolute path on the author's
# machine and has to be changed to run the notebook anywhere else.
# Later cells append sub-paths that start with "/" to this value.
INPUT_PATH = "D:/OneDrive - National University of Singapore/NUS MTech KE/MTech KE - FYP - InsureSense/Kang Jiang/Phase 3/System Implementation/scripts/Data Mining & Machine Learning/dataset/"

In [3]:
# Folder that receives the final per-event CSV files: the "cleaned_tweets"
# sub-folder of the dataset root. It is derived from INPUT_PATH so the long
# machine-specific path is written only once and the two cannot drift apart.
# rstrip keeps exactly one separator whether or not INPUT_PATH ends with "/";
# the trailing "/" is kept because file names are appended directly.
OUTPUT_PATH = INPUT_PATH.rstrip("/") + "/cleaned_tweets/"

# Defined Functions

In [4]:
# Function to import user location data for three disaster events
def import_data(folder_name):
    """Load every CSV file of a dataset sub-folder into one DataFrame.

    Used for the per-event user-location folders (Japan flood, Typhoon Jebi,
    Typhoon Mangkhut).

    Parameters
    ----------
    folder_name : str
        Folder path appended verbatim to the global ``INPUT_PATH``,
        e.g. ``"/user_location/Japan Floods"``.

    Returns
    -------
    pandas.DataFrame
        Rows of all ``*.csv`` files in the folder, concatenated in sorted
        file-path order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``*.csv`` file.
    """
    path = INPUT_PATH + folder_name

    # "/" works as a separator on Windows and POSIX alike, whereas a "\\*.csv"
    # pattern only matches on Windows. Sorting makes the row order
    # reproducible, since glob returns files in arbitrary order.
    csv_files = sorted(glob.glob(path + "/*.csv"))
    if not csv_files:
        # Fail with a message that names the folder instead of pandas'
        # generic "No objects to concatenate".
        raise ValueError("No CSV files found in " + path)

    frames = [pd.read_csv(csv_file, header=0, engine='python') for csv_file in csv_files]

    return pd.concat(frames, ignore_index=True)

# Join tweets with user location data

In [5]:
# Load the sentiment-labelled tweets of the event being processed and discard
# the unnamed column that holds the index saved with the file.
# To process another event, swap the file name for
# df_Typhoon_Jebi_sentiment_labelled.csv or df_Typhoon_Mangkhut_sentiment_labelled.csv.
df_sentiment = (
    pd.read_csv(INPUT_PATH + "/sentiment_analysis/df_Japan_Floods_sentiment_labelled.csv")
    .drop('Unnamed: 0', axis=1)
)

In [6]:
# Inspect column dtypes and non-null counts of the loaded sentiment data
df_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36988 entries, 0 to 36987
Data columns (total 14 columns):
tweet_id           36988 non-null float64
user               36988 non-null object
timestamp          36988 non-null object
date               36988 non-null object
events             36988 non-null object
text               36988 non-null object
processed_text     36988 non-null object
likes              36988 non-null int64
replies            36988 non-null int64
retweets           36988 non-null int64
url                36988 non-null object
disaster_flag      36988 non-null int64
disaster_phase     36988 non-null int64
sentiment_final    36988 non-null float64
dtypes: float64(2), int64(5), object(7)
memory usage: 4.0+ MB


In [7]:
# Load the user-location CSVs of the event being processed.
# Folders of the other events: "/user_location/Typhoon Jebi" and
# "/user_location/Typhoon Mangkhut".
df_location = import_data(folder_name="/user_location/Japan Floods")

In [8]:
# Inspect column dtypes and non-null counts of the combined user-location data
df_location.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10747 entries, 0 to 10746
Data columns (total 4 columns):
user         10747 non-null object
location     10747 non-null object
latitude     10747 non-null float64
longitude    10747 non-null float64
dtypes: float64(2), object(2)
memory usage: 335.9+ KB


In [9]:
# Merge the disaster data with user location data
# A left join repeats a tweet once for every matching row on the right, so a
# user listed more than once in the location files would inflate the tweet
# count. Keep one location row per user, and let validate='many_to_one' raise
# if that uniqueness is ever violated.
df_with_user_location_cleaned = df_sentiment.merge(
    df_location.drop_duplicates(subset=['user']),
    on=['user'],
    how='left',
    validate='many_to_one',
)

# Create tweet links column

In [10]:
# Check the merge result: the row count is expected to equal df_sentiment's (left join),
# and the non-null counts of location/latitude/longitude show how many tweets got a location
df_with_user_location_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36988 entries, 0 to 36987
Data columns (total 17 columns):
tweet_id           36988 non-null float64
user               36988 non-null object
timestamp          36988 non-null object
date               36988 non-null object
events             36988 non-null object
text               36988 non-null object
processed_text     36988 non-null object
likes              36988 non-null int64
replies            36988 non-null int64
retweets           36988 non-null int64
url                36988 non-null object
disaster_flag      36988 non-null int64
disaster_phase     36988 non-null int64
sentiment_final    36988 non-null float64
location           24637 non-null object
latitude           24637 non-null float64
longitude          24637 non-null float64
dtypes: float64(4), int64(5), object(8)
memory usage: 5.1+ MB


In [11]:
# create tweet links from url column
# The 'url' column is replaced by the absolute 'tweet_link' column. The guard
# makes the cell safe to re-run: once 'url' is gone, a second run would
# otherwise fail with a KeyError.
if 'url' in df_with_user_location_cleaned.columns:
    df_with_user_location_cleaned = (
        df_with_user_location_cleaned
        .assign(tweet_link=lambda frame: "https://twitter.com" + frame['url'])
        .drop(columns=['url'])
    )

In [12]:
# Preview a handful of rows only: column width is unlimited in this notebook,
# so a large preview bloats the saved output and exposes more user handles
# than a sanity check needs.
df_with_user_location_cleaned.head(10)

Unnamed: 0,tweet_id,user,timestamp,date,events,text,processed_text,likes,replies,retweets,disaster_flag,disaster_phase,sentiment_final,location,latitude,longitude,tweet_link
0,1.005360e+18,@jjwalsh,6/9/2018 7:47,6/9/2018,Japan Floods,"Nice day by the river in #Kobe on this beautiful, sunny weather Saturday \r\r\r\r\nAllowing access to water is a nice feature not often seen in Japan pic.twitter.com/8WCQfGIjbm",nice day river kobe beautiful sunny weather saturday allowing access water nice feature often seen japan pic.twitter.com/8WCQfGIjbm,26,3,3,1,3,0.841667,æ—¥æœ¬,36.574844,139.239418,https://twitter.com/jjwalsh/status/1005355731665629184
1,1.005370e+18,@metalheadbazaar,6/9/2018 8:29,6/9/2018,Japan Floods,Marduk - To Tour Japan In November - Metal Storm http://www.metalstorm.net/events/news_comments.php?news_id=33763ÃÂÃÂÃÂÃÂ ÃÂÃÂ¢ÃÂ¢ÃÂÃÂ¬ÃÂÃÂ¦,marduk tour japan november metal storm,0,0,0,1,3,-0.500000,"London, Greater London, England, SW1A 2DU, UK",51.507322,-0.127647,https://twitter.com/metalheadbazaar/status/1005366203618156546
2,1.005370e+18,@wordwidetroll,6/9/2018 8:45,6/9/2018,Japan Floods,Yestarday storm give me http://www.irvinakatech.comÃÂÃÂÃÂÃÂ \r\r\r\r\n#ads\r\r\r\r\n#adsense\r\r\r\r\n#money #moneyguru\r\r\r\r\n#japan\r\r\r\r\n#germany\r\r\r\r\n#makemoneyonline\r\r\r\r\n#adsensetrick\r\r\r\r\n#adsenseguru\r\r\r\r\n#travelerblogger\r\r\r\r\n#payments\r\r\r\r\n#payment\r\r\r\r\n#ads\r\r\r\r\n#ads_designpic.twitter.com/hHsQ4laLQy,yestarday storm give ad adsense money moneyguru japan germany makemoneyonline adsensetrick adsenseguru travelerblogger payment payment ad ads_designpic .twitter.com/hhsq4lalqy,0,0,0,1,3,-0.500000,,,,https://twitter.com/wordwidetroll/status/1005370364757725184
3,1.005370e+18,@kazuotamakashi,6/9/2018 9:00,6/9/2018,Japan Floods,"Rain tomorrow, at Nara City, Japan! With a high of 22C and a low of 18C.",rain tomorrow nara city japan high 22c low 18c,0,0,0,1,1,-0.460000,"å¥ˆè‰¯çœŒ, è¿‘ç•¿åœ°æ–¹, æ—¥æœ¬",34.296309,135.881682,https://twitter.com/kazuotamakashi/status/1005373968650571778
4,1.005370e+18,@shukyudo_travel,6/9/2018 9:00,6/9/2018,Japan Floods,Rain tomorrow! With a high of 79F and a low of 70F. #japan #osaka #travel #lp,rain tomorrow high 79f low 70f japan osaka travel lp,0,0,0,1,1,-0.460000,"å¤§é˜ªå¸‚, å¤§é˜ªåºœ, è¿‘ç•¿åœ°æ–¹, æ—¥æœ¬",34.693757,135.501454,https://twitter.com/shukyudo_travel/status/1005373973142622210
5,1.005350e+18,@sarahmldnenbr,6/9/2018 7:19,6/9/2018,Japan Floods,"Typical tourists style?! Don't miss some fine days in the rainy season. Rain, rain, rain, sunny, sunny, rain, rain, rain, sunny, sunny... Sort of. The rainy season 'tsuyu' or 'baiu' of Japan (east Asia) is such a thing. Long but it has sunny days.",typical tourist style don't miss fine day rainy season rain rain rain sunny sunny rain rain rain sunny sunny ... sort rainy season tsuyu baiu japan east asia thing long sunny day,3,0,0,1,1,-0.466667,"(possibly), Saint Josen, Town of Rochester, Ulster County, New York, USA",41.757741,-74.219736,https://twitter.com/sarahmldnenbr/status/1005348620235706368
6,1.005350e+18,@Stratus_Fire,6/9/2018 7:18,6/9/2018,Japan Floods,"#LateNightThoughts the cities ravaged in the 2011 Japan Tsunami were able to turn around a rebuild their city better than it looked befoe in 7 years time, yet the damage done by Hurricane Katrina in 2005 still isn't near remotely fixed. Why?",latenightthoughts city ravaged 2011 japan tsunami able turn around rebuild city better looked befoe 7 year time yet damage done hurricane katrina 2005 still isn't near remotely fixed,1,0,0,1,2,-0.350000,"North Carolina, USA",35.672964,-79.039292,https://twitter.com/Stratus_Fire/status/1005348268522594305
7,1.005400e+18,@espgws,6/9/2018 10:49,6/9/2018,Japan Floods,"ÃÂÃÂ£ÃÂ¢ÃÂÃÂ¬ÃÂ¯ÃÂ¿ÃÂ½GWS TVÃÂÃÂ£ÃÂ¢ÃÂÃÂ¬ÃÂ¢ÃÂÃÂ Highly Recommended!! BABYMETAL(@BABYMETAL_JAPAN ) NO RAIN, NO RAINBOW https://youtu.be/pDgqo6fcliYÃÂÃÂÃÂÃÂ @mikio158cm @TakayoshiOhmura @YouTubepic.twitter.com/qnoYIReA2O",gws tv highly recommended babymetal rain rainbow,17,0,6,1,2,-0.420000,,,,https://twitter.com/espgws/status/1005401381086248960
8,1.005390e+18,@Rainolaguer,6/9/2018 9:56,6/9/2018,Japan Floods,There's no free livestream for Aqours 3rd live :(\r\r\r\r\n\r\r\r\r\nThe only way I stay updated is via people tweeting what's currently happening and there's a flood of tweets every few seconds.\r\r\r\r\n\r\r\r\r\nhrngghh take me to Japan now pls pic.twitter.com/RrgobKPLPf,there's free livestream aqours 3rd live way stay updated via people tweeting what's currently happening there's flood tweet every second hrngghh take japan pls pic.twitter.com/rrgobkplpf,4,0,0,1,3,-0.446364,"Benilde, Toral de los Vados, El Bierzo, LeÃ³n, Castilla y LeÃ³n, 24560, EspaÃ±a",42.567351,-6.782827,https://twitter.com/Rainolaguer/status/1005388064506535938
9,1.005260e+18,@okthunderkgirl,6/9/2018 1:22,6/9/2018,Japan Floods,This is amazing! So much love from our thunder brothers and sisters in Japan,amazing much love thunder brother sister japan,0,0,0,1,3,-0.225000,,,,https://twitter.com/okthunderkgirl/status/1005258757692121088


In [13]:
# Export step, left disabled: uncomment the line that matches the event loaded
# above to write the final CSV into OUTPUT_PATH.
# NOTE(review): to_csv writes the index by default, which is presumably how the
# 'Unnamed: 0' column dropped on load gets created - consider index=False.
# df_with_user_location_cleaned.to_csv(OUTPUT_PATH + "df_Japan_Floods_final.csv")
# df_with_user_location_cleaned.to_csv(OUTPUT_PATH + "df_Typhoon_Jebi_final.csv")
# df_with_user_location_cleaned.to_csv(OUTPUT_PATH + "df_Typhoon_Mangkhut_final.csv")