## Data preprocessing and sampling for classification

#### Data Preprocessing

In [1]:
import ast
import collections
import itertools
import json
import re

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

In [2]:
# Read the raw export once, then derive the text-only frame from it.
# df_all keeps every column (it is joined back onto the processed tokens later);
# df holds only the tweet text that the preprocessing loop iterates over.
col_list = ["text"]
df_all = pd.read_csv("Merged240520to200620.csv")
# Same rows and index as df_all, without parsing the file a second time
df = df_all[col_list].copy()
print(len(df))

78606


In [3]:
# Show full column content instead of truncated text.
# None means "no limit"; the older -1 sentinel was deprecated in pandas 1.0
# and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [4]:
# Peek at the first 10 raw tweets
print(df["text"].iloc[:10])


0    #Millennials\nMove to @CityOfMemphis - great place for those who #WorkFromHome\nhttps://t.co/XTkAY3vSPX                                                                                                                                                                                               
1    Billy Porter - Engaging Politically &amp; Creatively During COVID-19 | The Daily Social Distancing Show #digitalmarketing #workfromhome https://t.co/evEDrXTTLU                                                                                                                                       
2    Surprising ! Real job\nhttps://t.co/5079HFQq5x\n#WorkFromHome                                                                                                                                                                                                                                         
3    Exciting ! Appen job !\nhttps://t.co/Rzxwiy1ryb\n#appen \n#WorkFromHome                        

In [5]:
#install tweet preprocessor package to remove URLs, hashtags, mentions
#pip install tweet-preprocessor

import preprocessor as p


In [5]:
# Load the project-specific stop words from file (column "Stopwords")

stop = pd.read_csv("text_stopwords.csv")
print(len(stop))

# Plain Python list so it can be appended to the NLTK stop list later
stop2 = stop["Stopwords"].to_list()
print(stop2)


39
['u', 'amp', 'th', 'via', 'pm', 'e', 'work', 'working', 'home', 'like', 'youre', 'must', 'gt', 'hey', 'k', 'doesnt', 'im', 'wfh', 'dont', 'june', 'many', 'day', 'time', 'week', 'make', 'back', 'may', 'going', 'today', 'thanks', 'friday', 'get', 'want', 'need', 'please', 'must', 'know', 'remote', 'office']


In [27]:
# Start from NLTK's English stop words (already a plain list) and extend
# them with the custom words loaded from file (added on 15 July).
stop_list = nltk.corpus.stopwords.words('english') + stop2

In [29]:
# Per-stage outputs, one entry per tweet (kept so each stage can be inspected)
all_tweets1 = []  # text after tweet-preprocessor cleaning
all_tweets2 = []  # stand-alone numbers removed
all_tweets3 = []  # tokenised
all_tweets4 = []  # lower-cased tokens, stop words kept, no lemmatisation (saved as 'sentiment')
all_tweets5 = []  # stop words removed
all_tweets6 = []  # lemmatised instead of stemmed (saved as 'topic')

lemmatizer = nltk.stem.WordNetLemmatizer()

# Loop invariants hoisted out of the per-tweet loop.
# A set gives constant-time stop-word lookups instead of scanning the whole
# list for every token of every tweet.
stop_words = set(stop_list)
standalone_number = re.compile(r"\b\d+\b")

for row in df["text"]:

    # remove all urls, mentions, hashtags using twitter preprocessing p
    tweets1 = p.clean(row)
    all_tweets1.append(tweets1)

    # remove tokens that consist only of digits
    # NOTE(review): punctuation is not removed here; presumably the gensim
    # tokeniser below drops it - confirm
    tweets2 = standalone_number.sub("", tweets1)
    all_tweets2.append(tweets2)

    # tokenise
    tweets3 = list(gensim.utils.tokenize(tweets2))
    all_tweets3.append(tweets3)

    # convert to lower case
    tweets4 = [word.lower() for word in tweets3]
    all_tweets4.append(tweets4)

    # remove stopwords
    tweets5 = [word for word in tweets4 if word not in stop_words]
    all_tweets5.append(tweets5)

    # lemmatise
    # NOTE(review): stop words are filtered before lemmatisation, so an
    # inflected form whose lemma is a stop word (e.g. "days" -> "day")
    # presumably still ends up in the topic tokens - confirm this is intended
    tweets6 = [lemmatizer.lemmatize(word) for word in tweets5]
    all_tweets6.append(tweets6)


# Attach the two token columns to the full raw frame (column-wise, matched on
# the row index) and save. The token lists are written to the CSV as text.
processed = pd.DataFrame({'sentiment': all_tweets4, 'topic': all_tweets6})
cleaned = pd.concat([df_all, processed], axis=1)
cleaned.to_csv('Cleaned240520to200620_v2.csv', sep=',', index=False)

#### Get word frequency distribution of raw cleaned data

In [23]:
# Per-tweet token lists from the 'topic' column (from cleaned v1 dataset)

token = cleaned['topic'].to_list()

# Flatten the nested list into one long list of words across all tweets
all_words = [word for tweet_tokens in token for word in tweet_tokens]

# Tally how often each word occurs
counts = collections.Counter(all_words)

# Ten most frequent words - candidates for the stop list
counts.most_common(10)

[('home', 17427),
 ('work', 15158),
 ('working', 10779),
 ('new', 7539),
 ('remote', 6306),
 ('amp', 6214),
 ('time', 6028),
 ('office', 5889),
 ('day', 5715),
 ('business', 5454)]

#### Get random tweets for subsequent classification

In [153]:
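# Draw 150 random tweets for manual labelling (8 July). Kept commented out: the draw is unseeded, so re-running it would overwrite sampletest_sentiment.csv with different tweets.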

#sample = cleaned.sample(n=150)
#sample.to_csv('sampletest_sentiment.csv', sep=',', index=False)

In [3]:
# Row identifiers (row_num) of the first sample, read back from the saved file
col_list = ["row_num"]
sample1 = pd.read_csv('sampletest_sentiment.csv', usecols=col_list)
sample1a = sample1["row_num"].to_list()

print(sample1a)


[54429, 48642, 49822, 26453, 46244, 2596, 43279, 68800, 35778, 65330, 42824, 2432, 56470, 69909, 68856, 28665, 54521, 4844, 5706, 62382, 42136, 3909, 60065, 23631, 49677, 78193, 3226, 56794, 51532, 23407, 41324, 9503, 76686, 60066, 806, 13521, 73756, 15933, 42456, 4643, 35079, 76755, 65687, 9302, 60762, 23788, 65983, 51561, 54472, 72700, 61018, 46159, 46459, 25924, 57552, 56462, 74715, 61359, 25551, 10621, 64036, 1860, 17004, 45554, 76836, 30509, 49654, 19663, 27710, 23780, 4699, 38724, 40993, 73701, 39831, 62385, 212, 23639, 33581, 12060, 19492, 44478, 33913, 55270, 44073, 4009, 17385, 37084, 18756, 15126, 58648, 70932, 55161, 49214, 57852, 6354, 28697, 8172, 54342, 63773, 22802, 65052, 57304, 17218, 15266, 31905, 33614, 39644, 21135, 65162, 69073, 67643, 47906, 59184, 46243, 21970, 71839, 66847, 57588, 65117, 19234, 67773, 29718, 51284, 74947, 57892, 3890, 69401, 74302, 54411, 40443, 33892, 58462, 70387, 8194, 54333, 6940, 2772, 8437, 52921, 36406, 30491, 28227, 19424, 73520, 9592, 3

In [4]:
# Pull the cleaned rows that belong to sample 1 and save them

in_sample1 = cleaned["row_num"].isin(sample1a)
sample_train = cleaned.loc[in_sample1]
sample_train.to_csv('sample_train_class.csv', sep=',', index=False)

In [44]:
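# Draw a second 150-tweet sample for testing, excluding the first sample (15 July). Kept commented out: the draw is unseeded, so re-running it would overwrite sample_test_class.csv with different tweets.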

#sample2 = cleaned[~cleaned.row_num.isin(sample1a)]
#print(len(sample2))

#sample_test = sample2.sample(n=150)
#sample_test.to_csv('sample_test_class.csv', sep=',', index=False)

78456


In [4]:
# Row identifiers (row_num) of the test sample, read back from the saved file
col_list = ["row_num"]
sample3 = pd.read_csv('sample_test_class.csv', usecols=col_list)
sample3a = sample3["row_num"].to_list()

print(sample3a)

[32152, 5800, 43971, 66761, 7808, 13845, 28018, 50182, 49585, 47442, 40752, 16848, 35764, 16410, 65450, 9443, 62272, 61049, 6447, 11679, 54763, 56206, 67042, 67505, 58875, 25080, 76693, 65495, 20456, 19922, 3237, 71640, 6343, 12244, 76026, 43707, 61998, 65942, 35654, 76087, 19059, 66882, 4259, 29798, 25805, 890, 26905, 18082, 51951, 12018, 19245, 71791, 17857, 33630, 5604, 9047, 26639, 77005, 28915, 24448, 38281, 44974, 26574, 67271, 18012, 20367, 27876, 43321, 18058, 35243, 54284, 30988, 55327, 56894, 31768, 67545, 78208, 44446, 70580, 57172, 47964, 23691, 69121, 19490, 29971, 13956, 13447, 15083, 28799, 12655, 11003, 61062, 8458, 75840, 56421, 2756, 41450, 56864, 76849, 75093, 47602, 9457, 45296, 17111, 28300, 77299, 63404, 20429, 53364, 42426, 29977, 48899, 32131, 31794, 7853, 16797, 71141, 47427, 41143, 33110, 50688, 25174, 6525, 13763, 29761, 35758, 66852, 27234, 74627, 47911, 44309, 21127, 35064, 13803, 36978, 62911, 60753, 29787, 40291, 38106, 26568, 58894, 49569, 68594, 58721, 

In [5]:
# Draw another 450 tweets for training (16 July), excluding every tweet
# already used in sample 1 or in the test sample.
total_ex = sample1a + sample3a


sample4 = cleaned[~cleaned.row_num.isin(total_ex)]
print(len(sample4))

# Fixed seed: without it, every re-run of this cell would overwrite
# sample_train_class2.csv with a different set of 450 tweets.
# NOTE: the seeded draw will differ from a file produced by an earlier
# unseeded run of this cell.
sample4 = sample4.sample(n=450, random_state=42)
sample4.to_csv('sample_train_class2.csv', sep=',', index=False)

78306


In [6]:
# Row identifiers (row_num) of the second training sample, read back from the saved file
col_list = ["row_num"]
sample4 = pd.read_csv('sample_train_class2.csv', usecols=col_list)
sample4a = sample4["row_num"].to_list()

print(sample4a)

[75470, 15212, 15414, 76976, 47009, 58219, 45035, 20542, 44647, 4876, 75709, 36770, 9762, 53661, 55826, 65346, 76826, 43559, 46949, 74416, 24827, 51988, 36452, 42579, 24542, 13745, 27262, 17396, 24317, 3699, 61249, 28730, 74568, 40793, 20905, 31581, 34498, 12314, 75669, 46885, 41637, 11183, 39509, 32516, 18766, 23953, 40849, 49240, 29897, 5935, 23827, 32854, 49937, 37720, 27503, 36326, 78575, 58801, 9283, 49216, 49178, 49047, 39023, 63759, 14809, 24483, 24492, 49428, 53772, 61880, 70911, 12002, 24803, 12154, 13917, 45493, 75822, 31816, 43549, 14330, 8581, 43949, 59567, 19135, 9603, 33564, 16129, 22253, 29497, 38929, 19219, 38703, 11298, 5127, 62607, 2522, 75615, 59063, 10895, 48085, 34556, 3379, 34772, 48390, 42057, 28485, 55941, 54836, 58068, 20691, 68195, 68575, 78476, 50950, 5429, 31071, 46047, 2666, 40550, 46735, 54057, 18894, 448, 68503, 30493, 29253, 16666, 72639, 45149, 33016, 56772, 30756, 9570, 59609, 40673, 40968, 113, 77141, 57571, 24355, 23920, 74464, 35021, 52410, 27013, 2

In [7]:
# Scoring data: every cleaned row that is not in one of the three samples (i.e. n - 750)

total_ex = [*sample1a, *sample3a, *sample4a]

already_sampled = cleaned["row_num"].isin(total_ex)
classi = cleaned.loc[~already_sampled]
classi.to_csv('data_classification_v2.csv', sep=',', index=False)

print(len(classi))

77856


In [2]:
# Reload the cleaned dataset from disk.
# The 'sentiment' and 'topic' token lists are stored in the CSV as text
# (e.g. "['great', 'place']"), so parse them back into Python lists on load;
# otherwise code that iterates over the tokens would walk single characters.
list_columns = {"sentiment": ast.literal_eval, "topic": ast.literal_eval}
cleaned = pd.read_csv("Cleaned240520to200620_v2.csv", converters=list_columns)

cleaned.head()

Unnamed: 0,week_num,day_num,row_num,userid,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetcreatedts,retweetcount,text,hashtags,tweet_cursor_id,sentiment,topic
0,1,1,1,856662300.0,throckad,"Breast surgical oncologist in Memphis, TN. Van...","Memphis, TN",76,427,1201,1/10/2012 16:48,24/5/2020 23:59,0,#Millennials\nMove to @CityOfMemphis - great p...,"[{'text': 'Millennials', 'indices': [0, 12]}, ...",1.26471e+18,"['to', 'great', 'place', 'for', 'those', 'who']","['great', 'place']"
1,1,1,2,1.25402e+18,sandra_alvareze,Get cutting edge strategies and tips for peopl...,Miami,104,92,664,25/4/2020 12:04,24/5/2020 23:57,0,Billy Porter - Engaging Politically &amp; Crea...,"[{'text': 'digitalmarketing', 'indices': [104,...",1.26471e+18,"['billy', 'porter', 'engaging', 'politically',...","['billy', 'porter', 'engaging', 'politically',..."
2,1,1,3,1.25021e+18,healthadding,"i am affiliate marketing, copy writing",,1129,227,762,14/4/2020 23:48,24/5/2020 23:56,0,Surprising ! Real job\nhttps://t.co/5079HFQq5x...,"[{'text': 'WorkFromHome', 'indices': [46, 59]}]",1.26471e+18,"['surprising', 'real', 'job']","['surprising', 'real', 'job']"
3,1,1,4,1.25021e+18,healthadding,"i am affiliate marketing, copy writing",,1129,227,762,14/4/2020 23:48,24/5/2020 23:56,0,Exciting ! Appen job !\nhttps://t.co/Rzxwiy1ry...,"[{'text': 'appen', 'indices': [47, 53]}, {'tex...",1.26471e+18,"['exciting', 'appen', 'job']","['exciting', 'appen', 'job']"
4,1,1,5,4347212000.0,parkhurstheidi,Iowa Market President at Bank of America. Resi...,"Coal Valley, IL",1844,710,3156,24/11/2015 17:38,24/5/2020 23:56,0,This article brought a smile to my face! Great...,"[{'text': 'WFH', 'indices': [155, 159]}]",1.26471e+18,"['this', 'article', 'brought', 'a', 'smile', '...","['article', 'brought', 'smile', 'face', 'great..."
