In [1]:
import gensim.downloader as api

# Fetch the pretrained Google News Word2Vec vectors through gensim's downloader API
WORD2VEC_MODEL_NAME = "word2vec-google-news-300"
model = api.load(WORD2VEC_MODEL_NAME)


In [2]:
# Load the NLTK-preprocessed dataset (title, tag and the s1-s5 preprocessing columns).
# TODO: the token lists still have to be adjusted to SUPPORTED_NUMBER_OF_TOKENS from the
#       global file; none of the cells visible in this notebook do that yet.
import pandas as pd

PREPROCESSED_JSON_PATH = '../../output_data/2_tc_nltk_preprocessed.json'
df = pd.read_json(PREPROCESSED_JSON_PATH)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25763 entries, 0 to 25762
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            25763 non-null  object
 1   tag              25763 non-null  object
 2   s1_title_lower   25763 non-null  object
 3   s2_clean_title   25763 non-null  object
 4   s3_tokenized     25763 non-null  object
 5   s4_no_stopwords  25763 non-null  object
 6   s5_lemmatized    25763 non-null  object
dtypes: object(7)
memory usage: 1.4+ MB


In [3]:
# Peek at the first four rows to sanity-check the loaded columns
df.head(n=4)

Unnamed: 0,title,tag,s1_title_lower,s2_clean_title,s3_tokenized,s4_no_stopwords,s5_lemmatized
0,My personal ranking of Community's use of alte...,COMMUNITY,my personal ranking of community's use of alte...,my personal ranking of communitys use of alter...,"[my, personal, ranking, of, communitys, use, o...","[personal, ranking, communitys, use, alternati...","[personal, ranking, community, use, alternativ..."
1,It took 30 years for climate tech investments ...,ENVIRONMENT,it took 30 years for climate tech investments ...,it took 30 years for climate tech investments ...,"[it, took, 30, years, for, climate, tech, inve...","[took, 30, years, climate, tech, investments, ...","[took, 30, year, climate, tech, investment, pa..."
2,Rain when sitting in a car,RELAX,rain when sitting in a car,rain when sitting in a car,"[rain, when, sitting, in, a, car]","[rain, sitting, car]","[rain, sitting, car]"
3,Is Cassis worth staying in?,TRAVEL,is cassis worth staying in?,is cassis worth staying in,"[is, cassis, worth, staying, in]","[cassis, worth, staying]","[cassis, worth, staying]"


In [4]:
# Keep only the label and the lemmatized tokens; the earlier preprocessing stages are dropped

intermediate_columns = ['title', 's1_title_lower', 's2_clean_title', 's3_tokenized', 's4_no_stopwords']
reduced_df = df.drop(columns=intermediate_columns)
reduced_df.head(n=4)

Unnamed: 0,tag,s5_lemmatized
0,COMMUNITY,"[personal, ranking, community, use, alternativ..."
1,ENVIRONMENT,"[took, 30, year, climate, tech, investment, pa..."
2,RELAX,"[rain, sitting, car]"
3,TRAVEL,"[cassis, worth, staying]"


In [5]:
# Give the lemma column a shorter name for the remaining cells

lemma_column_rename = {'s5_lemmatized': 'lemmatized'}
reduced_df.rename(columns=lemma_column_rename, inplace=True)
reduced_df.head(n=4)

Unnamed: 0,tag,lemmatized
0,COMMUNITY,"[personal, ranking, community, use, alternativ..."
1,ENVIRONMENT,"[took, 30, year, climate, tech, investment, pa..."
2,RELAX,"[rain, sitting, car]"
3,TRAVEL,"[cassis, worth, staying]"


In [6]:
# Replace every purely numeric token, and every token that mixes letters with digits, by the word 'number'

import re


def replace_if_number(word: str) -> str:
    """Return the placeholder 'number' for any token that contains a digit.

    Covers both purely numeric tokens such as '30' and mixed tokens such as
    '33m'; every other token is returned unchanged.

    Args:
        word: A single token.

    Returns:
        'number' if ``word`` contains at least one digit, otherwise ``word``.
    """
    # A single search is sufficient: an all-digit token necessarily contains a
    # digit, so a separate "digits only" check can never change the result.
    if re.search(r'\d', word):
        return 'number'

    return word

# Derive a parallel token column in which every digit-bearing lemma is replaced by 'number'
reduced_df['lemmatized_no_numbers'] = reduced_df['lemmatized'].apply(
    lambda lemmas: list(map(replace_if_number, lemmas))
)

# Set unknown words for word2vec to be 'unknown' and apply model

def apply_word2vec_model(word):
    """Look up the embedding of ``word`` in the global Word2Vec ``model``.

    Words that are not in the model's vocabulary fall back to the embedding
    stored under the literal key 'unknown'.
    """
    # Decide which key to read first, then perform a single lookup
    lookup_key = word if word in model else 'unknown'
    return model[lookup_key]

# Turn every lemma list into the list of its Word2Vec vectors (one vector per token)
reduced_df['vectorized'] = reduced_df['lemmatized_no_numbers'].apply(
    lambda lemmas: list(map(apply_word2vec_model, lemmas))
)

reduced_df.head(n=4)

Unnamed: 0,tag,lemmatized,lemmatized_no_numbers,vectorized
0,COMMUNITY,"[personal, ranking, community, use, alternativ...","[personal, ranking, community, use, alternativ...","[[0.1875, 0.01574707, -0.05859375, 0.083496094..."
1,ENVIRONMENT,"[took, 30, year, climate, tech, investment, pa...","[took, number, year, climate, tech, investment...","[[0.11376953, 0.110839844, 0.04321289, -0.1943..."
2,RELAX,"[rain, sitting, car]","[rain, sitting, car]","[[-0.05102539, 0.045898438, -0.2734375, -0.259..."
3,TRAVEL,"[cassis, worth, staying]","[cassis, worth, staying]","[[0.20800781, -0.076660156, -0.00038909912, 0...."


In [7]:
# Flatten each title's token vectors into one wide feature row (one column per number).
# Titles with fewer tokens than the longest one are right-padded with NaN.

import numpy as np

vectorized_token_list = reduced_df['vectorized']
print(len(vectorized_token_list))
print(len(vectorized_token_list[0]))
print(len(vectorized_token_list[0][0]))

# Lay every title's token vectors end to end. np.concatenate rejects an empty
# sequence, so a title without tokens becomes an empty row (all NaN after padding).
flat_rows = [
    np.concatenate(token_vectors) if len(token_vectors) else np.empty(0)
    for token_vectors in vectorized_token_list
]

# The widest row determines the number of feature columns
row_width = max((row.size for row in flat_rows), default=0)

# Preallocate a NaN-filled matrix and copy each row into its leading slots. This avoids
# building one Python dict with thousands of string keys per title. float64 is kept
# deliberately so values (and the CSV written later) match what pandas inferred before.
feature_matrix = np.full((len(flat_rows), row_width), np.nan, dtype=np.float64)
for row_idx, row in enumerate(flat_rows):
    feature_matrix[row_idx, :row.size] = row

final_df = pd.DataFrame(
    feature_matrix,
    columns=[f'input_{itx}' for itx in range(row_width)],
)
final_df.info()

final_df.head(5)



25763
6
300
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25763 entries, 0 to 25762
Columns: 13200 entries, input_0 to input_13199
dtypes: float64(13200)
memory usage: 2.5 GB


Unnamed: 0,input_0,input_1,input_2,input_3,input_4,input_5,input_6,input_7,input_8,input_9,...,input_13190,input_13191,input_13192,input_13193,input_13194,input_13195,input_13196,input_13197,input_13198,input_13199
0,0.1875,0.015747,-0.058594,0.083496,-0.063477,0.122559,0.12793,0.002411,0.263672,-0.108398,...,,,,,,,,,,
1,0.11377,0.11084,0.043213,-0.194336,0.140625,-0.460938,0.022217,-0.084961,-0.047852,0.025757,...,,,,,,,,,,
2,-0.051025,0.045898,-0.273438,-0.259766,0.347656,0.026611,-0.217773,-0.182617,-0.24707,0.398438,...,,,,,,,,,,
3,0.208008,-0.07666,-0.000389,0.123535,0.316406,-0.28125,0.251953,-0.326172,0.026123,0.486328,...,,,,,,,,,,
4,0.011292,0.028931,0.083496,-0.049805,-0.130859,-0.109863,-0.115234,-0.074707,-0.089355,0.21875,...,,,,,,,,,,


In [8]:
# Attach the label column to the flattened feature matrix.
# NOTE: this cell rebinds reduced_df to a label-only frame, so re-running it (or the
# vectorization cell) requires re-running the notebook from the column-reduction cell.

reduced_df = reduced_df.drop(columns=['vectorized', 'lemmatized_no_numbers', 'lemmatized'])

merged_ds = pd.concat([final_df, reduced_df], axis=1, join='inner')

# join='inner' silently drops rows whose index labels do not match; fail loudly instead
assert len(merged_ds) == len(final_df) == len(reduced_df), "feature rows and labels are misaligned"

merged_ds.info()
merged_ds.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25763 entries, 0 to 25762
Columns: 13201 entries, input_0 to tag
dtypes: float64(13200), object(1)
memory usage: 2.5+ GB


Unnamed: 0,input_0,input_1,input_2,input_3,input_4,input_5,input_6,input_7,input_8,input_9,...,input_13191,input_13192,input_13193,input_13194,input_13195,input_13196,input_13197,input_13198,input_13199,tag
0,0.1875,0.015747,-0.058594,0.083496,-0.063477,0.122559,0.12793,0.002411,0.263672,-0.108398,...,,,,,,,,,,COMMUNITY
1,0.11377,0.11084,0.043213,-0.194336,0.140625,-0.460938,0.022217,-0.084961,-0.047852,0.025757,...,,,,,,,,,,ENVIRONMENT
2,-0.051025,0.045898,-0.273438,-0.259766,0.347656,0.026611,-0.217773,-0.182617,-0.24707,0.398438,...,,,,,,,,,,RELAX
3,0.208008,-0.07666,-0.000389,0.123535,0.316406,-0.28125,0.251953,-0.326172,0.026123,0.486328,...,,,,,,,,,,TRAVEL
4,0.011292,0.028931,0.083496,-0.049805,-0.130859,-0.109863,-0.115234,-0.074707,-0.089355,0.21875,...,,,,,,,,,,ASTRONOMY


In [9]:
# Write the merged feature/label table to CSV (the DataFrame index is written as the first column)
VECTORIZED_CSV_PATH = '../../output_data/4_2_vectorized_ds.csv'
merged_ds.to_csv(VECTORIZED_CSV_PATH)