# 3. Feature Engineering

In [2]:
import pickle
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np

In [3]:
# Location of the pickled news DataFrame, relative to the working directory.
path_df = "Data/News_dataset.pickle"

# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project's own pipeline.
with open(path_df, 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [4]:
# Preview the first rows of the freshly loaded DataFrame.
df.head()

Unnamed: 0,File_Name,Category,Content,id,News_length
0,Data/bbc\business\001.txt,business,Ad sales boost Time Warner profit\n\nQuarterly...,1,2559
1,Data/bbc\business\002.txt,business,Dollar gains on Greenspan speech\n\nThe dollar...,1,2252
2,Data/bbc\business\003.txt,business,Yukos unit buyer faces loan claim\n\nThe owner...,1,1551
3,Data/bbc\business\004.txt,business,High fuel prices hit BA's profits\n\nBritish A...,1,2401
4,Data/bbc\business\005.txt,business,Pernod takeover talk lifts Domecq\n\nShares in...,1,1569


In [5]:
# Show the raw text of the article with index label 1.
df.loc[1, 'Content']

'Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\n\nAnd Alan Greenspan highlighted the US government\'s willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan\'s speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data. "I think the chairman\'s taking a much more sanguine view on the current account deficit than he\'s taken for some time," said Robert Sinche, head of currency strategy at Bank of America in New York. "He\'s taking a longer-term view, laying out a set of conditions u

## 1. Text Cleaning and Preparation

Berikut adalah langkah-langkah text cleaning dan preprocessing:
1. Hapus karakter khusus '\r', '\n', dan '"'
2. Ubah ke huruf kecil (lowercase)
3. Hapus tanda baca: ? : ! . , ;
4. Hapus akhiran 's, misalnya pada "student's name"
5. Lematisasi: mengubah kata ke bentuk dasarnya dengan bantuan WordNet
6. Hapus stopwords

In [6]:
# Step 1a: turn carriage returns and newlines into spaces, then replace each
# run of four spaces with a single space.
df['Content_Parsed_1'] = (
    df['Content']
    .str.replace("\r", " ")
    .str.replace("\n", " ")
    .str.replace("    ", " ")
)

In [7]:
# Step 1b: drop double-quote characters from the text.
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace(pat='"', repl='')

In [8]:
# Step 2: lower-case the whole text so later matching is case-insensitive.
df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()

In [9]:

# Step 3: strip punctuation marks from the lower-cased text.
punctuation_signs = list("?:!.,;")
df['Content_Parsed_3'] = df['Content_Parsed_2']

# regex=False is passed explicitly: "?" and "." are regex metacharacters, and
# the default value of `regex` differs between pandas versions. Treating each
# sign as a literal keeps the result identical everywhere (and avoids the
# FutureWarning older pandas emits for these patterns).
# Note that "." is removed everywhere, including inside numbers.
for punct_sign in punctuation_signs:
    df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(
        punct_sign, '', regex=False
    )

In [10]:
# Step 4: remove the possessive suffix 's. Every occurrence of "'s" is
# dropped, so contractions such as "it's" lose their suffix as well.
df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace(pat="'s", repl="")

In [11]:
# Download the NLTK resources 'punkt' and 'wordnet' (no-op when they are
# already up to date). A separator line is printed between the two downloads.
# The cell's displayed value is the return value of the last download call.
nltk.download('punkt')
print("------------------------------------------------------------")
nltk.download('wordnet')

------------------------------------------------------------


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rachm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rachm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Create one WordNet lemmatizer instance to reuse for every word below.
wordnet_lemmatizer = WordNetLemmatizer()

In [13]:
# Step 5: lemmatize every word of every article, treating each word as a verb.
nrows = len(df)

# Iterate over the column values directly instead of looking rows up with
# df.loc[row]: label-based lookup only works when the index is exactly
# 0..n-1, and it builds a full row Series on every iteration. Positional
# iteration yields one output text per row, in row order, whatever the index.
lemmatized_text_list = [
    " ".join(
        wordnet_lemmatizer.lemmatize(word, pos="v")
        # Split on a single space (not on any whitespace) so the spacing of
        # the original text is preserved when the words are joined again.
        for word in text.split(" ")
    )
    for text in df['Content_Parsed_4']
]

# One lemmatized text per input row.
assert len(lemmatized_text_list) == nrows

In [14]:
# Store the lemmatized texts as a new column; the list is assigned
# positionally, one entry per row.
df['Content_Parsed_5'] = lemmatized_text_list

In [15]:
# Download the NLTK stop-word lists (no-op when already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rachm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# English stop words from NLTK, copied into a plain list.
stop_words = [*stopwords.words('english')]

In [17]:
# Peek at the first ten stop words.
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [None]:
# Step 6: remove stop words from the lemmatized text.
df['Content_Parsed_6'] = df['Content_Parsed_5']

for stop_word in stop_words:
    # \b anchors restrict the match to whole words. re.escape guards against
    # any regex metacharacter that may appear in a stop word.
    regex_stopword = r"\b" + re.escape(stop_word) + r"\b"
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, in which case "\bword\b" never matches
    # and no stop word is removed.
    df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(
        regex_stopword, '', regex=True
    )

In [None]:
# Raw text of the article with index label 5, for comparison with each stage.
df.loc[5, 'Content']

In [None]:
# Same article after line breaks and double quotes were removed.
df.loc[5, 'Content_Parsed_1']

In [None]:
# Same article after lower-casing.
df.loc[5, 'Content_Parsed_2']

In [None]:
# Same article after punctuation removal.
df.loc[5, 'Content_Parsed_3']

In [None]:
# Same article after the 's suffix was removed.
df.loc[5, 'Content_Parsed_4']

In [None]:
# Same article after lemmatization.
df.loc[5, 'Content_Parsed_5']

In [None]:
# Same article after stop-word removal (final cleaned text).
df.loc[5, 'Content_Parsed_6']

In [None]:
# Show the first row with all intermediate Content_Parsed_* columns.
df.head(1)

In [None]:
# Keep only the identifying columns plus the final cleaned text, and give the
# cleaned text its definitive name.
list_columns = ["File_Name", "Category", "Content", "Content_Parsed_6"]
df = df[list_columns].rename(columns={'Content_Parsed_6': 'Content_Parsed'})

In [None]:
# Preview the trimmed DataFrame.
df.head()

## 2. Label Coding

Mengubah nama kategori ke bentuk angka

In [None]:
# Integer label assigned to each news category.
category_codes = dict(
    business=0,
    entertainment=1,
    politics=2,
    sport=3,
    tech=4,
)

In [None]:
# Category mapping: translate each category name into its integer code.
# Series.map is used instead of DataFrame.replace because replace silently
# leaves unknown category strings in place (mixing str and int labels) and
# relies on implicit dtype downcasting that newer pandas versions deprecate.
df = df.assign(Category_Code=df['Category'].map(category_codes))

# Fail loudly if any row carries a category that has no code.
unmapped_categories = df.loc[df['Category_Code'].isna(), 'Category'].unique()
if len(unmapped_categories) > 0:
    raise ValueError(
        "Categories without a code in category_codes: {}".format(
            list(unmapped_categories)
        )
    )

In [None]:
# Preview the DataFrame with the new Category_Code column.
df.head()

## 3. Train-Test Split

Dari total data yang ada, kita ingin pisahkan ke dalam training data dan testing data.

In [None]:
# Hold out 15% of the articles for testing; the fixed random_state makes the
# split repeatable.
split_options = {'test_size': 0.15, 'random_state': 8}
X_train, X_test, y_train, y_test = train_test_split(
    df['Content_Parsed'],
    df['Category_Code'],
    **split_options
)

In [None]:
# Row counts: full dataset, training set, test set (one per line).
print(len(df), len(X_train), len(X_test), sep="\n")

## 4. Text Representation

In [None]:
# Parameter selection for the TF-IDF vectorizer
ngram_range = (1, 2)   # use unigrams and bigrams
min_df = 10            # int: minimum number of documents a term must appear in
max_df = 1.0           # float: maximum proportion of documents (1.0 = no upper cut)
max_features = 300     # keep only the 300 top terms

In [None]:
# Vectorizer configuration. The text is already lower-cased and stripped of
# stop words, so both options are disabled here.
vectorizer_options = dict(
    encoding='utf-8',
    ngram_range=ngram_range,
    stop_words=None,
    lowercase=False,
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
    norm='l2',
    sublinear_tf=True,
)
tfidf = TfidfVectorizer(**vectorizer_options)

# Labels are the split targets under the names used by the modelling notebooks.
labels_train, labels_test = y_train, y_test

# Fit the vocabulary and IDF weights on the training texts only.
features_train = tfidf.fit_transform(X_train).toarray()
print(features_train.shape)

# Reuse the fitted vectorizer for the test texts (transform only, no refit).
features_test = tfidf.transform(X_test).toarray()
print(features_test.shape)

In [None]:
# Print, for every category, the unigrams and bigrams most associated with it
# according to a chi-squared test (category vs. rest).
# `chi2` and `np` are already imported in the notebook's import cell.

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when available and fall back for older installations.
# The vocabulary does not change per category, so look it up once.
if hasattr(tfidf, "get_feature_names_out"):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for category_name, category_id in sorted(category_codes.items()):
    # Binary target: does the document belong to this category?
    features_chi2 = chi2(features_train, labels_train == category_id)
    # Ascending sort by chi2 statistic: the strongest terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(category_name))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

In [None]:
# Inspection only: show the (name, code) pairs of the category mapping.
category_codes.items()

In [None]:
# Inspection only: `bigrams` still holds the value from the last iteration of
# the chi2 loop above (the last category in sorted order).
bigrams

In [None]:
# Persist every artefact of this notebook as a pickle under Data/.
# Each entry maps the output path to the object written there; the files are
# written in the order listed.
pickle_targets = {
    'Data/X_train.pickle': X_train,
    'Data/X_test.pickle': X_test,
    'Data/y_train.pickle': y_train,
    'Data/y_test.pickle': y_test,
    'Data/df.pickle': df,
    'Data/features_train.pickle': features_train,
    'Data/labels_train.pickle': labels_train,
    'Data/features_test.pickle': features_test,
    'Data/labels_test.pickle': labels_test,
    # Fitted TF-IDF object
    'Data/tfidf.pickle': tfidf,
}

for pickle_path, payload in pickle_targets.items():
    with open(pickle_path, 'wb') as output:
        pickle.dump(payload, output)

In [None]:
# Export the same artefacts as CSV files under Data/.

# X_train
X_train.to_csv('Data/X_train.csv', index=False)

# X_test
X_test.to_csv('Data/X_test.csv', index=False)

# y_train
y_train.to_csv('Data/y_train.csv', index=False)

# y_test
y_test.to_csv('Data/y_test.csv', index=False)

# df
df.to_csv('Data/df.csv', index=False)

# features_train
np.savetxt("Data/features_train.csv", features_train, delimiter=",")

# labels_train (the index is written too, unlike the y_* files above)
labels_train.to_csv('Data/labels_train.csv')

# features_test
np.savetxt("Data/features_test.csv", features_test, delimiter=",")

# labels_test (the index is written too, unlike the y_* files above)
labels_test.to_csv('Data/labels_test.csv')

# TF-IDF object: a fitted TfidfVectorizer has no to_csv() method, so calling
# it raises AttributeError. Export what the vectorizer learned instead: one
# row per term with its IDF weight (the full object is saved by the pickle
# export).
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_terms = tfidf.get_feature_names_out()
else:
    # scikit-learn < 1.0 only offers the older accessor.
    tfidf_terms = tfidf.get_feature_names()
pd.DataFrame({'feature': tfidf_terms, 'idf': tfidf.idf_}).to_csv(
    'Data/tfidf.csv', index=False
)