In [10]:
import os
import re

import numpy as np
import pandas as pd
from scipy.sparse import save_npz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [11]:
def load_fasttext_file(path, n_rows=None):
    """Read a fastText-style text file into a two-column DataFrame.

    Every line is split at its first space: the part before it becomes the
    label, the remainder becomes the text. Lines that contain no space once
    surrounding whitespace is stripped (blank lines, a label with no text)
    are skipped.

    Parameters
    ----------
    path : str or path-like
        File to read. It is decoded as UTF-8 and undecodable bytes are
        dropped silently (``errors='ignore'``).
    n_rows : int, optional
        Maximum number of parsed rows to return. ``None`` (the default)
        reads the whole file; ``0`` returns an empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` and ``text``, one row per parsed line, in file
        order.
    """
    texts, labels = [], []
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Compare against None explicitly so that n_rows=0 is honoured,
            # and count parsed rows (not raw lines) so skipped lines do not
            # shrink the result below the requested size.
            if n_rows is not None and len(labels) >= n_rows:
                break
            label, separator, text = line.strip().partition(' ')
            if not separator:
                # No space on the line: there is no text to pair with a label.
                continue
            labels.append(label)
            texts.append(text)
    return pd.DataFrame({'label': labels, 'text': texts})

# Load both raw splits in full (no row limit).
train = load_fasttext_file(path='data/raw/train.ft.txt')
test = load_fasttext_file(path='data/raw/test.ft.txt')

# Quick sanity check: (rows, columns) of each split, then a preview.
print(*(frame.shape for frame in (train, test)))
train.head()


(41611, 2) (37087, 2)


Unnamed: 0,label,text
0,__label__2,Stuning even for the non-gamer: This sound tra...
1,__label__2,The best soundtrack ever to anything.: I'm rea...
2,__label__2,Amazing!: This soundtrack is my favorite music...
3,__label__2,Excellent Soundtrack: I truly like this soundt...
4,__label__2,"Remember, Pull Your Jaw Off The Floor After He..."


In [12]:
# Raw fastText label strings -> integer class ids.
label_map = {
    '__label__1': 0,
    '__label__2': 1
}


def encode_labels(labels, mapping=label_map):
    """Convert raw label strings to integer class ids.

    Parameters
    ----------
    labels : pandas.Series
        Either raw label strings (keys of ``mapping``) or labels that were
        already encoded (values of ``mapping``).
    mapping : dict
        Raw label -> integer id.

    Returns
    -------
    pandas.Series
        Integer-encoded labels with dtype ``int64``.

    Raises
    ------
    ValueError
        If any label is neither a key nor a value of ``mapping``. A plain
        ``Series.map`` would turn such labels into NaN without any error.
    """
    # Already encoded: re-running the cell must be a no-op, not a wipe to NaN.
    if labels.isin(set(mapping.values())).all():
        return labels.astype('int64')

    encoded = labels.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"Unknown label value(s): {unknown}")
    return encoded.astype('int64')


train['label'] = encode_labels(train['label'])
test['label']  = encode_labels(test['label'])

# Class balance of the training split.
train['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,21162
0,20449


In [13]:
def clean_text(text):
    """Normalise a string to lowercase ASCII letters separated by single spaces.

    Steps, applied in order: lowercase the input; delete every run that
    starts with ``http`` and continues to the next whitespace; delete every
    character that is not ``a``-``z`` or whitespace (the characters are
    removed, not replaced by a space); collapse whitespace runs to one space
    and trim both ends.
    """
    substitutions = (
        (r"http\S+", ""),     # URL-like tokens
        (r"[^a-z\s]", ""),    # digits, punctuation, non-ASCII letters
        (r"\s+", " "),        # whitespace runs -> single space
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


In [14]:
# Add the normalised text alongside the raw text in both splits.
train['clean_text'] = train['text'].map(clean_text)
test['clean_text'] = test['text'].map(clean_text)

# Side-by-side preview of raw vs. cleaned text.
train.head()[['text', 'clean_text']]


Unnamed: 0,text,clean_text
0,Stuning even for the non-gamer: This sound tra...,stuning even for the nongamer this sound track...
1,The best soundtrack ever to anything.: I'm rea...,the best soundtrack ever to anything im readin...
2,Amazing!: This soundtrack is my favorite music...,amazing this soundtrack is my favorite music o...
3,Excellent Soundtrack: I truly like this soundt...,excellent soundtrack i truly like this soundtr...
4,"Remember, Pull Your Jaw Off The Floor After He...",remember pull your jaw off the floor after hea...


In [15]:
# Make sure the output directory exists; a no-op when it is already there.
os.makedirs(name='data/processed', exist_ok=True)

# Write cleaned text + label for each split, without the index column.
train.loc[:, ['clean_text', 'label']].to_csv(path_or_buf='data/processed/train_clean.csv', index=False)
test.loc[:, ['clean_text', 'label']].to_csv(path_or_buf='data/processed/test_clean.csv', index=False)


In [16]:
# Unigram + bigram TF-IDF, capped at 20,000 features, with scikit-learn's
# built-in English stop-word list.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=20000,
)

# Fit on the training split only; the test split is transformed with the
# fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(train['clean_text'])
X_test_tfidf = tfidf.transform(test['clean_text'])

# Label arrays aligned row-for-row with the matrices above.
y_train, y_test = train['label'].values, test['label'].values

tuple(matrix.shape for matrix in (X_train_tfidf, X_test_tfidf))


((41611, 20000), (37087, 20000))

In [17]:
# Persist the TF-IDF matrices (.npz) and the label arrays (.npy).
# `save_npz` is imported in the notebook's top import cell with the other
# dependencies.
# NOTE(review): the fitted `tfidf` vectorizer is not saved in this cell;
# confirm it is persisted elsewhere if new text must be transformed later.
save_npz('data/processed/X_train_tfidf.npz', X_train_tfidf)
save_npz('data/processed/X_test_tfidf.npz', X_test_tfidf)

np.save('data/processed/y_train.npy', y_train)
np.save('data/processed/y_test.npy', y_test)


In [18]:
# Two-column (cleaned text, integer label) frames for the transformer stage.
train_transformer = train.loc[:, ['clean_text', 'label']]
test_transformer = test.loc[:, ['clean_text', 'label']]

train_transformer.head()


Unnamed: 0,clean_text,label
0,stuning even for the nongamer this sound track...,1
1,the best soundtrack ever to anything im readin...,1
2,amazing this soundtrack is my favorite music o...,1
3,excellent soundtrack i truly like this soundtr...,1
4,remember pull your jaw off the floor after hea...,1


In [19]:
# Write the transformer-stage frames to CSV without the index column.
train_transformer.to_csv(path_or_buf='data/processed/train_transformer.csv', index=False)
test_transformer.to_csv(path_or_buf='data/processed/test_transformer.csv', index=False)
