# Importing Required Libraries

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import parse
import multiprocessing
from bs4 import BeautifulSoup # Text Cleaning
import re, string # Regular Expressions, String
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # stopwords
from nltk.stem.porter import PorterStemmer # for word stemming
from nltk.stem import WordNetLemmatizer # for word lemmatization
from pad_sequences import pad_sequences_multi
import unicodedata
import html
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Creating a function to get all the app names

In [37]:
path = r'./data'
def getAppNames(path):
    list_of_files = os.listdir(path)
    app_names = []
    for file in list_of_files:
        name = parse.parse('submission_{}.csv',file)
        app_names.append(str(name)[10:-7])
    
    return app_names

appNames = getAppNames(path)
appNames

['candidate', 'jobget', 'jobseeker', 'jobswipe', 'mobile']

##### To create the final_submission.csv file, building a data pipeline function that takes path and app names as input and returns the final dataframe.

In [38]:
def creating_df(path, appNames):
    dfs = []
    for file in os.listdir(path):
        print("Reading "+file)
        data = pd.read_csv(path+'/'+ file)
        df = pd.DataFrame(data)
        print("Current Data Frame shape ")
        print(df.shape)
        df['appName'] = appNames[os.listdir(path).index(file)]
        dfs.append(df)
    print("Total Files found: ", len(dfs))
    final_df = pd.DataFrame()
    for i in range(len(dfs)):
        final_df = pd.concat([final_df, dfs[i]], axis=0)
        
    return final_df
    
final_df = creating_df(path, appNames)
final_df

Reading submission_candidate.csv
Current Data Frame shape 
(143, 10)
Reading submission_jobget.csv
Current Data Frame shape 
(3382, 10)
Reading submission_jobseeker.csv
Current Data Frame shape 
(8909, 10)
Reading submission_jobswipe.csv
Current Data Frame shape 
(697, 10)
Reading submission_mobile.csv
Current Data Frame shape 
(105, 10)
Total Files found:  5


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appName
0,d505016e-3b37-4ba2-97cf-16c9ef54f84c,David Webster,https://play-lh.googleusercontent.com/a-/ACB-R...,Found great bar jobs on here in the past but n...,1,0,3.2.11,2023-02-08 19:41:20,,,candidate
1,eae5efd4-26eb-469d-b793-f9689510ebbf,Joshua Espinoza,https://play-lh.googleusercontent.com/a/AGNmyx...,Try to upload resume and it just stays uploading!,2,0,3.2.6,2022-09-26 04:18:21,,,candidate
2,c7156674-d0bf-497e-924e-07802e9cda9d,Robert Basom,https://play-lh.googleusercontent.com/a-/ACB-R...,"This is one of the worst apps I've ever used,w...",1,0,,2022-07-20 08:11:02,,,candidate
3,520a4e2f-d352-4fe4-a63b-ceea058663bf,Daniel DLS,https://play-lh.googleusercontent.com/a-/ACB-R...,If I cant upload a resume because for whatever...,1,1,3.2.5,2022-06-27 23:48:52,,,candidate
4,254d1260-7a30-4ae1-8c68-53f597c3c84c,Morris Kagunya,https://play-lh.googleusercontent.com/a-/ACB-R...,It's not logging in,1,0,3.2.3,2022-05-16 22:59:26,,,candidate
...,...,...,...,...,...,...,...,...,...,...,...
100,45d71edd-c573-4bc2-a219-ac7e31cb90ad,Alan J,https://play-lh.googleusercontent.com/a-/ACB-R...,Rather tasty,5,7,1.0.1.364,2021-11-14 16:15:49,,,mobile
101,6d98a697-0814-46f1-b8b8-fbbcb3f9f3b3,Tracy Nguyen,https://play-lh.googleusercontent.com/a-/ACB-R...,I am about to do it myself to make it happen a...,5,9,,2021-11-13 03:00:17,,,mobile
102,8bbd9f45-73b4-42e0-8057-d4fc362c121a,Jeremy Yatchmenoff,https://play-lh.googleusercontent.com/a/AGNmyx...,The best,5,5,1.0.1.364,2021-11-03 15:02:20,,,mobile
103,6a62c0a3-c52d-4900-9fd4-32a10c277afe,David Shaw,https://play-lh.googleusercontent.com/a-/ACB-R...,Nice app,5,8,1.0.1.364,2021-10-22 06:44:09,,,mobile


In [39]:
final_df.to_csv(path+'/'+'final_submission.csv', index=False) 
print("Final Submission File SAVED!")
print("Final File Shape: ", final_df.shape)
final_df.head()

Final Submission File SAVED!
Final File Shape:  (13236, 11)


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appName
0,d505016e-3b37-4ba2-97cf-16c9ef54f84c,David Webster,https://play-lh.googleusercontent.com/a-/ACB-R...,Found great bar jobs on here in the past but n...,1,0,3.2.11,2023-02-08 19:41:20,,,candidate
1,eae5efd4-26eb-469d-b793-f9689510ebbf,Joshua Espinoza,https://play-lh.googleusercontent.com/a/AGNmyx...,Try to upload resume and it just stays uploading!,2,0,3.2.6,2022-09-26 04:18:21,,,candidate
2,c7156674-d0bf-497e-924e-07802e9cda9d,Robert Basom,https://play-lh.googleusercontent.com/a-/ACB-R...,"This is one of the worst apps I've ever used,w...",1,0,,2022-07-20 08:11:02,,,candidate
3,520a4e2f-d352-4fe4-a63b-ceea058663bf,Daniel DLS,https://play-lh.googleusercontent.com/a-/ACB-R...,If I cant upload a resume because for whatever...,1,1,3.2.5,2022-06-27 23:48:52,,,candidate
4,254d1260-7a30-4ae1-8c68-53f597c3c84c,Morris Kagunya,https://play-lh.googleusercontent.com/a-/ACB-R...,It's not logging in,1,0,3.2.3,2022-05-16 22:59:26,,,candidate


In [40]:
#creating a nested condition to insert values according to the rating
# if score equal to 3 then neutral
# if score greater than 3 then positive
# if score less than 3 then negative
final_df['sentiment'] = np.where(final_df['score'] == 3, 'neutral', np.where(final_df['score'] <3, 'negative', np.where(final_df['score'] >3, 'positive', np.nan)))
final_df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appName,sentiment
0,d505016e-3b37-4ba2-97cf-16c9ef54f84c,David Webster,https://play-lh.googleusercontent.com/a-/ACB-R...,Found great bar jobs on here in the past but n...,1,0,3.2.11,2023-02-08 19:41:20,,,candidate,negative
1,eae5efd4-26eb-469d-b793-f9689510ebbf,Joshua Espinoza,https://play-lh.googleusercontent.com/a/AGNmyx...,Try to upload resume and it just stays uploading!,2,0,3.2.6,2022-09-26 04:18:21,,,candidate,negative
2,c7156674-d0bf-497e-924e-07802e9cda9d,Robert Basom,https://play-lh.googleusercontent.com/a-/ACB-R...,"This is one of the worst apps I've ever used,w...",1,0,,2022-07-20 08:11:02,,,candidate,negative
3,520a4e2f-d352-4fe4-a63b-ceea058663bf,Daniel DLS,https://play-lh.googleusercontent.com/a-/ACB-R...,If I cant upload a resume because for whatever...,1,1,3.2.5,2022-06-27 23:48:52,,,candidate,negative
4,254d1260-7a30-4ae1-8c68-53f597c3c84c,Morris Kagunya,https://play-lh.googleusercontent.com/a-/ACB-R...,It's not logging in,1,0,3.2.3,2022-05-16 22:59:26,,,candidate,negative
...,...,...,...,...,...,...,...,...,...,...,...,...
100,45d71edd-c573-4bc2-a219-ac7e31cb90ad,Alan J,https://play-lh.googleusercontent.com/a-/ACB-R...,Rather tasty,5,7,1.0.1.364,2021-11-14 16:15:49,,,mobile,positive
101,6d98a697-0814-46f1-b8b8-fbbcb3f9f3b3,Tracy Nguyen,https://play-lh.googleusercontent.com/a-/ACB-R...,I am about to do it myself to make it happen a...,5,9,,2021-11-13 03:00:17,,,mobile,positive
102,8bbd9f45-73b4-42e0-8057-d4fc362c121a,Jeremy Yatchmenoff,https://play-lh.googleusercontent.com/a/AGNmyx...,The best,5,5,1.0.1.364,2021-11-03 15:02:20,,,mobile,positive
103,6a62c0a3-c52d-4900-9fd4-32a10c277afe,David Shaw,https://play-lh.googleusercontent.com/a-/ACB-R...,Nice app,5,8,1.0.1.364,2021-10-22 06:44:09,,,mobile,positive


In [41]:
#Checking if any row has null values
final_df['sentiment'].isnull().values.sum() 

0

# Data Preprocessing

In [42]:
# Creating a cleaning function

# set of stopwords to be removed from text
stop = set(stopwords.words('english'))

# update stopwords to have punctuation too
stop.update(list(string.punctuation))

def clean_text(text):
    
    # Remove unwanted html characters
    re1 = re.compile(r'  +')
    x1 = text.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
    'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
    '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
    ' @-@ ', '-').replace('\\', ' \\ ')
    text = re1.sub(' ', html.unescape(x1))
    
    # remove non-ascii characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    #     # strip html
    #     soup = BeautifulSoup(text, 'html.parser')
    #     text = soup.get_text()
    
    # remove between square brackets
    text = re.sub('\[[^]]*\]', '', text)
    
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # remove twitter tags
    text = text.replace("@", "")
    
    # remove hashtags
    text = text.replace("#", "")
    
    # remove all non-alphabetic characters
    text = re.sub(r'[^a-zA-Z ]', '', text)
    
    # remove stopwords from text
    final_text = []
    for word in text.split():
        if word.strip().lower() not in stop:
            final_text.append(word.strip().lower())
    
    text = " ".join(final_text)
    
    # lemmatize words
    lemmatizer = WordNetLemmatizer()    
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    text = " ".join([lemmatizer.lemmatize(word, pos = 'v') for word in text.split()])
    
    # replace all numbers with "num"
    text = re.sub("\d", "num", text)
    
    return text.lower()

In [43]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13236 entries, 0 to 104
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              13236 non-null  object
 1   userName              13236 non-null  object
 2   userImage             13236 non-null  object
 3   content               13227 non-null  object
 4   score                 13236 non-null  int64 
 5   thumbsUpCount         13236 non-null  int64 
 6   reviewCreatedVersion  10501 non-null  object
 7   at                    13236 non-null  object
 8   replyContent          3396 non-null   object
 9   repliedAt             3396 non-null   object
 10  appName               13236 non-null  object
 11  sentiment             13236 non-null  object
dtypes: int64(2), object(10)
memory usage: 1.3+ MB


In [44]:
final_df = final_df[final_df['content'].notna()]
final_df.shape

(13227, 12)

In [45]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13227 entries, 0 to 104
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              13227 non-null  object
 1   userName              13227 non-null  object
 2   userImage             13227 non-null  object
 3   content               13227 non-null  object
 4   score                 13227 non-null  int64 
 5   thumbsUpCount         13227 non-null  int64 
 6   reviewCreatedVersion  10495 non-null  object
 7   at                    13227 non-null  object
 8   replyContent          3395 non-null   object
 9   repliedAt             3395 non-null   object
 10  appName               13227 non-null  object
 11  sentiment             13227 non-null  object
dtypes: int64(2), object(10)
memory usage: 1.3+ MB


In [46]:
final_df = final_df.drop(['reviewCreatedVersion', 'replyContent', 'repliedAt'], axis=1)

In [47]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13227 entries, 0 to 104
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   reviewId       13227 non-null  object
 1   userName       13227 non-null  object
 2   userImage      13227 non-null  object
 3   content        13227 non-null  object
 4   score          13227 non-null  int64 
 5   thumbsUpCount  13227 non-null  int64 
 6   at             13227 non-null  object
 7   appName        13227 non-null  object
 8   sentiment      13227 non-null  object
dtypes: int64(2), object(7)
memory usage: 1.0+ MB


In [69]:
clean_df = final_df.copy()
clean_df['content'] = final_df['content'].apply(clean_text)

In [49]:
#Getting all the 4 and 5 rated reviews
high_rated_df = pd.concat([final_df[final_df['score'] == 4], final_df[final_df['score'] == 5]], axis=0)
high_rated_df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,at,appName,sentiment
6,098e9eef-cd64-41d4-b9e4-a46fc1356dda,Kadar Harris,https://play-lh.googleusercontent.com/a/AGNmyx...,Great,4,0,2022-03-25 23:27:33,candidate,positive
31,0b3d424c-edd3-4d76-bce1-d3238ed7d35d,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,It ok.,4,0,2020-01-31 22:57:34,candidate,positive
32,a2f921b2-049e-4e4a-afff-d44a0ead6cee,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Great app but i think it needs an update with ...,4,0,2019-11-07 03:53:56,candidate,positive
56,1ca21a55-95af-45b5-a823-9f55fb86426c,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Glitchy. Profile updating is annoying as posit...,4,0,2019-09-17 21:29:28,candidate,positive
59,dccb8488-14bc-4af7-b3df-59efd4df8752,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Excellen app,4,0,2019-08-08 19:40:28,candidate,positive
...,...,...,...,...,...,...,...,...,...
100,45d71edd-c573-4bc2-a219-ac7e31cb90ad,Alan J,https://play-lh.googleusercontent.com/a-/ACB-R...,Rather tasty,5,7,2021-11-14 16:15:49,mobile,positive
101,6d98a697-0814-46f1-b8b8-fbbcb3f9f3b3,Tracy Nguyen,https://play-lh.googleusercontent.com/a-/ACB-R...,I am about to do it myself to make it happen a...,5,9,2021-11-13 03:00:17,mobile,positive
102,8bbd9f45-73b4-42e0-8057-d4fc362c121a,Jeremy Yatchmenoff,https://play-lh.googleusercontent.com/a/AGNmyx...,The best,5,5,2021-11-03 15:02:20,mobile,positive
103,6a62c0a3-c52d-4900-9fd4-32a10c277afe,David Shaw,https://play-lh.googleusercontent.com/a-/ACB-R...,Nice app,5,8,2021-10-22 06:44:09,mobile,positive


##### Vocabulary size

In [50]:
vocabsize = 100000

In [51]:
high_rated_df.content

6                                                  Great
31                                                It ok.
32     Great app but i think it needs an update with ...
56     Glitchy. Profile updating is annoying as posit...
59                                          Excellen app
                             ...                        
100                                         Rather tasty
101    I am about to do it myself to make it happen a...
102                                             The best
103                                             Nice app
104    Been using the website for awhile and it's the...
Name: content, Length: 10430, dtype: object

In [52]:
list(high_rated_df.content)

['Great',
 'It ok.',
 'Great app but i think it needs an update with google cause google cant find places where i work at and with no work experience recognized the app wont let me send applications and with job applications the app is basically useless.',
 "Glitchy. Profile updating is annoying as positions have to be chosen from a list of a thousand. Can't enter end date to some jobs unless I use the website. Also when entering jobs, sometimes just the address is shown, not the name of the business. Great for applying though",
 'Excellen app',
 "I find jobs on here just fine. But you guys need to fix the bugs. I can't set up interviews if your app doesn't work correctly",
 'bugs fixed great one of the best apps for hospitality work just wish more jobs in Queens Ny would start to use app',
 'Nice app',
 'Good',
 'Great for the big city Answer hubdreds of listings first week Schedule \ndozens of interviews first month',
 'Wish you could still view the ad after you have applied.',
 'Gre

In [56]:
from keybert import KeyBERT

In [57]:
keyModel = KeyBERT()

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [75]:
clean_high_rated_df = pd.concat([clean_df[clean_df['score'] == 4], clean_df[clean_df['score'] == 5]], axis=0)
clean_high_rated_df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,at,appName,sentiment
6,098e9eef-cd64-41d4-b9e4-a46fc1356dda,Kadar Harris,https://play-lh.googleusercontent.com/a/AGNmyx...,great,4,0,2022-03-25 23:27:33,candidate,positive
31,0b3d424c-edd3-4d76-bce1-d3238ed7d35d,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,ok,4,0,2020-01-31 22:57:34,candidate,positive
32,a2f921b2-049e-4e4a-afff-d44a0ead6cee,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,great app think need update google cause googl...,4,0,2019-11-07 03:53:56,candidate,positive
56,1ca21a55-95af-45b5-a823-9f55fb86426c,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,glitchy profile update annoy position choose l...,4,0,2019-09-17 21:29:28,candidate,positive
59,dccb8488-14bc-4af7-b3df-59efd4df8752,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,excellen app,4,0,2019-08-08 19:40:28,candidate,positive
...,...,...,...,...,...,...,...,...,...
100,45d71edd-c573-4bc2-a219-ac7e31cb90ad,Alan J,https://play-lh.googleusercontent.com/a-/ACB-R...,rather tasty,5,7,2021-11-14 16:15:49,mobile,positive
101,6d98a697-0814-46f1-b8b8-fbbcb3f9f3b3,Tracy Nguyen,https://play-lh.googleusercontent.com/a-/ACB-R...,make happen sake course work together make sur...,5,9,2021-11-13 03:00:17,mobile,positive
102,8bbd9f45-73b4-42e0-8057-d4fc362c121a,Jeremy Yatchmenoff,https://play-lh.googleusercontent.com/a/AGNmyx...,best,5,5,2021-11-03 15:02:20,mobile,positive
103,6a62c0a3-c52d-4900-9fd4-32a10c277afe,David Shaw,https://play-lh.googleusercontent.com/a-/ACB-R...,nice app,5,8,2021-10-22 06:44:09,mobile,positive


In [81]:
doc = ''
for comnt in clean_high_rated_df['content']:
    doc += comnt+'. '
doc

'great. ok. great app think need update google cause google cant find place work work experience recognize app wont let send application job application app basically useless. glitchy profile update annoy position choose list thousand cant enter end date job unless use website also enter job sometimes address show name business great apply though. excellen app. find job fine guy need fix bug cant set interview app doesnt work correctly. bug fix great one best apps hospitality work wish job queen ny would start use app. nice app. good. great big city answer hubdreds list first week schedule dozen interview first month. wish could still view ad apply. great. great far. beautiful app guess still develop doesnt message capability available full site absolutely love site app. awaand x x thes za costcos dead. think app much better actually earn money. easy use. help find many job would never think area good pay. app really good find good job. far would say one thorough helpful job search app

In [82]:
keyModel.extract_keywords(doc, keyphrase_ngram_range=(1,3), stop_words=None)

[('job apps good', 0.6892),
 ('suggest app jobemployment', 0.679),
 ('job apps great', 0.6783),
 ('better job apps', 0.6751),
 ('job app better', 0.6736)]

In [83]:
keyModel.extract_keywords(doc, keyphrase_ngram_range=(1,5), stop_words=None)

[('find job good great app', 0.7417),
 ('find job need excellent app', 0.7411),
 ('good app opportunity get job', 0.736),
 ('get excellent app find job', 0.7359),
 ('good app find job good', 0.7344)]

In [84]:
keyModel.extract_keywords(doc, keyphrase_ngram_range=(1,1), stop_words='english',
                         use_mmr=True, diversity=0.7)

[('jobsearching', 0.4677),
 ('applay', 0.2156),
 ('opinion', 0.1018),
 ('integrate', 0.0882),
 ('coronavirus', 0.0126)]

In [85]:
keyModel.extract_keywords(doc, keyphrase_ngram_range=(1,1), stop_words='english',
                         use_maxsum=True, nr_candidates=20, top_n=5)

[('jobswip', 0.3889),
 ('appsuper', 0.3985),
 ('jobemployment', 0.4102),
 ('jobsearch', 0.4394),
 ('apps', 0.4427)]

In [54]:
corpus = []
corpus = high_rated_df.content.copy()
corpus

6                                                  Great
31                                                It ok.
32     Great app but i think it needs an update with ...
56     Glitchy. Profile updating is annoying as posit...
59                                          Excellen app
                             ...                        
100                                         Rather tasty
101    I am about to do it myself to make it happen a...
102                                             The best
103                                             Nice app
104    Been using the website for awhile and it's the...
Name: content, Length: 10430, dtype: object