In [1]:
import numpy as np
# import pandas as pd

In [2]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report

In [4]:
import pickle

In [5]:
import pprint
import os
import sys
sys.path.append('../')
from src.load_data import load_data

## Load Data

In [6]:
train_data, valid_data, test_data, metadata = load_data()

In [7]:
number_stopwords = [str(i) for i in range(10001)] + ['0'+str(i) for i in range(100)] + ['000']

In [8]:
scoring = 'f1_macro'
n_jobs=10


In [9]:
tra_sents = np.array([sentence['sentence']
     for article in train_data
     for sentence in article['sentences']])
y_tra = np.array([sentence['label'] for article in train_data for sentence in article['sentences']])

In [10]:
opt_sents = np.array([sentence['sentence']
     for article in (train_data + valid_data)
     for sentence in article['sentences']])

In [11]:
y_opt = np.array([sentence['label'] for article in (train_data + valid_data) for sentence in article['sentences']])

In [12]:
test_sents =  np.array([sentence['sentence']
     for article in test_data
     for sentence in article['sentences']])

In [13]:
y_test = np.array([sentence['label'] for article in test_data for sentence in article['sentences']])

In [14]:
len(tra_sents), len(y_tra), len(test_sents), len(y_test)

(3582, 3582, 441, 441)

# Feature Extraction

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

We can see that best TFIDFVectorizer features are as listed below;
    min_df: 0.001
    max_df: 0.6
    stop_words: num_stopwords

In [102]:
0.001*10818

10.818

In [96]:
vectorizer = TfidfVectorizer(min_df=0.0001 ,max_df=0.12, stop_words=number_stopwords)

In [97]:
tfidf_vectors = vectorizer.fit_transform(opt_sents)

In [98]:
vectorizer.vocabulary_

{'arrest': 660,
 'isi': 4763,
 'spies': 9134,
 'have': 4140,
 'created': 2245,
 'panic': 6808,
 'among': 471,
 'people': 6975,
 'dmk': 2835,
 'president': 7326,
 '14th': 20,
 'september': 8680,
 'pm': 7140,
 'chennai': 1713,
 'chief': 1725,
 'karunanidhi': 5078,
 'today': 9841,
 'their': 9728,
 'activities': 177,
 'tamil': 9587,
 'nadu': 6224,
 'has': 4127,
 'asked': 704,
 'why': 10627,
 'no': 6447,
 'arrests': 663,
 'been': 1050,
 'made': 5590,
 'connection': 2054,
 'with': 10663,
 'may': 5816,
 'twin': 10031,
 'bomb': 1279,
 'blasts': 1233,
 'train': 9914,
 'central': 1610,
 'railway': 7649,
 'station': 9237,
 'which': 10612,
 'woman': 10677,
 'killed': 5159,
 'referring': 7896,
 'alleged': 393,
 'spy': 9168,
 'arun': 677,
 'selvarjan': 8648,
 'last': 5339,
 'week': 10578,
 'national': 6293,
 'investigation': 4721,
 'agency': 281,
 'it': 4787,
 'not': 6478,
 'first': 3530,
 'time': 9812,
 'such': 9389,
 'terrorists': 9696,
 'had': 4047,
 'apprehended': 596,
 'during': 2937,
 'aiadmk'

In [99]:
tra_vectors = vectorizer.transform(tra_sents)
test_vectors = vectorizer.transform(test_sents)

In [100]:
len(vectorizer.vocabulary_)

10804

#### Saving Feature Vectors

In [19]:
import pickle

feature_path = 'Data/features_mindf_001_maxdf_0_12_number_stopwords.pickle'

with open(feature_path, 'wb') as file_:
    pickle.dump(tfidf_vectors, file_, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
tfidf_vectors.shape

(3981, 3010)