In [25]:
import gensim.downloader as api

# Download Word2Vec
model = api.load("glove-twitter-25")


In [3]:
# Adjust list of tokens to be equal to SUPPORTED_NUMBER_OF_TOKENS from global file
import pandas as pd

df = pd.read_json('../../output_data/2_tc_nltk_preprocessed.json')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25763 entries, 0 to 25762
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            25763 non-null  object
 1   tag              25763 non-null  object
 2   s1_title_lower   25763 non-null  object
 3   s2_clean_title   25763 non-null  object
 4   s3_tokenized     25763 non-null  object
 5   s4_no_stopwords  25763 non-null  object
 6   s5_lemmatized    25763 non-null  object
dtypes: object(7)
memory usage: 1.4+ MB


In [4]:
df.head(4)

Unnamed: 0,title,tag,s1_title_lower,s2_clean_title,s3_tokenized,s4_no_stopwords,s5_lemmatized
0,My personal ranking of Community's use of alte...,COMMUNITY,my personal ranking of community's use of alte...,my personal ranking of communitys use of alter...,"[my, personal, ranking, of, communitys, use, o...","[personal, ranking, communitys, use, alternati...","[personal, ranking, community, use, alternativ..."
1,It took 30 years for climate tech investments ...,ENVIRONMENT,it took 30 years for climate tech investments ...,it took 30 years for climate tech investments ...,"[it, took, 30, years, for, climate, tech, inve...","[took, 30, years, climate, tech, investments, ...","[took, 30, year, climate, tech, investment, pa..."
2,Rain when sitting in a car,RELAX,rain when sitting in a car,rain when sitting in a car,"[rain, when, sitting, in, a, car]","[rain, sitting, car]","[rain, sitting, car]"
3,Is Cassis worth staying in?,TRAVEL,is cassis worth staying in?,is cassis worth staying in,"[is, cassis, worth, staying, in]","[cassis, worth, staying]","[cassis, worth, staying]"


In [5]:
# Remove unnecessary columns

reduced_df = pd.DataFrame(df.drop(labels=['title', 's1_title_lower', 's2_clean_title', 's3_tokenized', 's4_no_stopwords'], axis='columns', inplace=False))
reduced_df.head(4)

Unnamed: 0,tag,s5_lemmatized
0,COMMUNITY,"[personal, ranking, community, use, alternativ..."
1,ENVIRONMENT,"[took, 30, year, climate, tech, investment, pa..."
2,RELAX,"[rain, sitting, car]"
3,TRAVEL,"[cassis, worth, staying]"


In [6]:
# Rename column that contains lems

reduced_df.rename(columns = {'s5_lemmatized':'lemmatized'}, inplace = True)
reduced_df.head(4)

Unnamed: 0,tag,lemmatized
0,COMMUNITY,"[personal, ranking, community, use, alternativ..."
1,ENVIRONMENT,"[took, 30, year, climate, tech, investment, pa..."
2,RELAX,"[rain, sitting, car]"
3,TRAVEL,"[cassis, worth, staying]"


In [7]:
# Replace every number by word 'number' and mixed letters with number to 'number'

import re


def replace_if_number(word):
    if bool(re.match(r'^\d+$', word)):
        return 'number'
    
    if bool(re.search(r'\d', word)): # Mixed such as 33m
        return 'number' # TODO handle it better

    return word

reduced_df['lemmatized_no_numbers'] = reduced_df['lemmatized'].apply(lambda lem_list: [replace_if_number(lem) for lem in lem_list])

# Set unknown words for word2vec to be 'unknown' and apply model

def apply_word2vec_model(word):
    # Check if the word exists in the model's vocabulary
    if word in model:
        return model[word]
    
    return model['unknown']

reduced_df['vectorized'] = reduced_df['lemmatized_no_numbers'].apply(lambda lem_list: [apply_word2vec_model(lem) for lem in lem_list])

reduced_df.head(4)

Unnamed: 0,tag,lemmatized,lemmatized_no_numbers,vectorized
0,COMMUNITY,"[personal, ranking, community, use, alternativ...","[personal, ranking, community, use, alternativ...","[[0.035234, 0.54248, -0.75981, -0.029171, 1.94..."
1,ENVIRONMENT,"[took, 30, year, climate, tech, investment, pa...","[took, number, year, climate, tech, investment...","[[-0.39819, 0.92849, 1.1194, -0.13217, -0.2980..."
2,RELAX,"[rain, sitting, car]","[rain, sitting, car]","[[-0.94214, -0.24345, 0.20744, 0.21493, -1.066..."
3,TRAVEL,"[cassis, worth, staying]","[cassis, worth, staying]","[[-1.0744, -0.88249, 0.2764, 0.64755, -0.35475..."


In [23]:
# Create single column per each number


vectorized_token_list = reduced_df['vectorized']
print(len(vectorized_token_list))
print(len(vectorized_token_list[0]))
print(len(vectorized_token_list[0][0]))

rows = []
for vectorized_token in vectorized_token_list:
    single_row = {}
    itx = 0
    for vector in vectorized_token:
        for number in vector:
            single_row[f'input_{itx}'] = number
            itx += 1
        
    rows.append(single_row)

final_df = pd.DataFrame(rows)
final_df.info()

final_df.head(5)



25763
6
25
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25763 entries, 0 to 25762
Columns: 1100 entries, input_0 to input_1099
dtypes: float64(1100)
memory usage: 216.2 MB


Unnamed: 0,input_0,input_1,input_2,input_3,input_4,input_5,input_6,input_7,input_8,input_9,...,input_1090,input_1091,input_1092,input_1093,input_1094,input_1095,input_1096,input_1097,input_1098,input_1099
0,0.035234,0.54248,-0.75981,-0.029171,1.9414,-0.17883,0.25397,0.49265,-0.00256,-0.43786,...,,,,,,,,,,
1,-0.39819,0.92849,1.1194,-0.13217,-0.29808,-0.52848,1.0614,-0.70803,-0.48539,0.13713,...,,,,,,,,,,
2,-0.94214,-0.24345,0.20744,0.21493,-1.0664,1.1949,1.5234,-0.16527,0.35522,0.4345,...,,,,,,,,,,
3,-1.0744,-0.88249,0.2764,0.64755,-0.35475,0.25591,-0.70326,-1.0045,1.1936,-1.0299,...,,,,,,,,,,
4,-0.177,0.432,0.21391,-0.29515,0.47344,-0.38013,1.6018,0.56238,0.40478,-0.57094,...,,,,,,,,,,


In [24]:
# Concat
# Remove unnecessary columns

reduced_df = pd.DataFrame(df.drop(labels=['vectorized', 'lemmatized_no_numbers', 'lemmatized'], axis='columns', inplace=False))
reduced_df.head(4)

merged_ds = pd.concat([final_df, reduced_df], axis=1, join='inner')
merged_ds.info()
merged_ds.head(5)

KeyError: "['vectorized', 'lemmatized_no_numbers', 'lemmatized'] not found in axis"

In [10]:
merged_ds.to_csv('../../output_data/4_2_vectorized_ds_glove_25.csv')