# Feature Extraction


## Import Packages and Data

In [1]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings("ignore")

In [24]:
# Load the preprocessed tweet table; each row carries the raw tweet, its
# label columns and the cleaned text in `Tweet_Parsed`.
data = pd.read_csv(filepath_or_buffer="Data/Preprocessed_Data.csv")
data.head(n=5)

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong,Tweet_Parsed
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0,cowok usaha lacak perhati gue lantas remeh per...
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0,rt telat tau edan sarap gue gaul cigax jifla c...
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,0,0,0,0,0,0,0,0,0,0,kadang pikir percaya tuhan jatuh kali kali kad...
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,0,0,0,0,0,0,0,0,0,0,ku tau mata sipit lihat
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,0,1,1,0,0,0,0,0,1,0,kaum cebong kafir lihat dongok dungu hahahah


In [25]:
# Count missing values per column before vectorizing.
data.isna().sum()

Tweet            0
HS               0
Abusive          0
HS_Individual    0
HS_Group         0
HS_Religion      0
HS_Race          0
HS_Physical      0
HS_Gender        0
HS_Other         0
HS_Weak          0
HS_Moderate      0
HS_Strong        0
Tweet_Parsed     0
dtype: int64

## Train Test Split

In [26]:
# Every column except the raw and the parsed tweet text is treated as a label.
label_frame = data.drop(columns=['Tweet', 'Tweet_Parsed'])

X = data['Tweet_Parsed']   # cleaned text fed to the vectorizer
y = label_frame.values     # 2-D label matrix, one column per label

In [27]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=8,
)

## TF-IDF

In [34]:
# Parameter selection for the TF-IDF vectorizer
ngram_range = (1, 1)   # unigrams only
min_df = 1             # int: a term must appear in at least this many documents
max_df = 1.            # float: maximum share of documents a term may appear in
max_features = 300     # size cap on the vocabulary

In [35]:
# Collect the vectorizer settings in one place, then build the vectorizer.
tfidf_settings = dict(
    encoding='utf-8',
    ngram_range=ngram_range,
    stop_words=None,       # no stop-word filtering at this stage
    lowercase=False,       # NOTE(review): presumably Tweet_Parsed is already lower-cased upstream — confirm
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
    norm='l2',
    sublinear_tf=True,     # sublinear term-frequency scaling
)
tfidf = TfidfVectorizer(**tfidf_settings)

In [36]:
# Fit the vectorizer on the training split only, then densify the result.
train_matrix = tfidf.fit_transform(X_train)
features_train = train_matrix.toarray()
labels_train = y_train
print(features_train.shape)

(11193, 300)


In [37]:
# Transform (no re-fit) the test split with the already-fitted vectorizer.
test_matrix = tfidf.transform(X_test)
features_test = test_matrix.toarray()
labels_test = y_test
print(features_test.shape)

(1976, 300)


array([[1, 0, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 0, 0]], dtype=int64)

In [42]:
# Column names for the TF-IDF features.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# the blanket `warnings.filterwarnings("ignore")` in the import cell hides the
# deprecation notice, so prefer the replacement API and fall back only on
# old scikit-learn versions.
if hasattr(tfidf, "get_feature_names_out"):
    fitur = tfidf.get_feature_names_out().tolist()
else:
    fitur = tfidf.get_feature_names()

# Label column names, in the same order as the columns of `y`.
target = data.drop(['Tweet','Tweet_Parsed'],axis = 1).columns

In [44]:
# Assemble the training table: TF-IDF feature columns followed by the label columns.
# Both frames get a fresh default index, so the join lines them up row by row.
train_features_df = pd.DataFrame(features_train, columns=fitur)
train_labels_df = pd.DataFrame(y_train, columns=target)
data_train = train_features_df.join(train_labels_df)
data_train.head()

Unnamed: 0,adil,aduh,agama,ah,ahok,ajak,ajar,akun,al,allah,...,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.525837,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,1,0,0


In [45]:
# Assemble the test table: TF-IDF feature columns followed by the label columns.
# Both frames get a fresh default index, so the join lines them up row by row.
test_features_df = pd.DataFrame(features_test, columns=fitur)
test_labels_df = pd.DataFrame(y_test, columns=target)
data_test = test_features_df.join(test_labels_df)
data_test.head()

Unnamed: 0,adil,aduh,agama,ah,ahok,ajak,ajar,akun,al,allah,...,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Save File

In [46]:
# Write both splits to disk; index=False keeps the row index out of the CSV files.
for split_frame, csv_path in (
    (data_train, 'Data/Data_train.csv'),
    (data_test, 'Data/Data_test.csv'),
):
    split_frame.to_csv(csv_path, index = False)