In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [13]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:

_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
    return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]


def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric

    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


def remove_stopwords(l):
    # Removes stopwords from a list of tokens
    return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]

def preprocess(headlines,bodies):
  n_headlines, n_bodies =[],[]
  for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
    clean_headline = clean(headline)
    clean_body = clean(body)
    clean_headline = get_tokenized_lemmas(clean_headline)
    clean_body = get_tokenized_lemmas(clean_body)
    clean_headline = remove_stopwords(clean_headline)
    clean_body = remove_stopwords(clean_body)
    n_headlines.append(headline)
    n_bodies.append(body)
  n_headlines_df=pd.DataFrame(n_headlines,columns=['Headline'])
  n_bodies_df=pd.DataFrame(n_bodies,columns=['Body'])
  return n_headlines_df['Headline'], n_bodies_df['Body']


In [24]:
def statistical_features(dataset_loc):
  dataset = pd.read_csv(dataset_loc)
  stop_words_l=stopwords.words('english')
  headlines = dataset['Headline']
  bodies = dataset['Body']
  print(type(headlines))
  headlines,bodies = preprocess(headlines,bodies)
  print(type(headlines))
  headline_vectorizer = TfidfVectorizer()
  h = headline_vectorizer.fit_transform(headlines)
  body_vectorizer = TfidfVectorizer(max_features=10000-h.shape[1])
  b = body_vectorizer.fit_transform(bodies)
  statistical_features = np.concatenate((np.array(h.toarray()),np.array(b.toarray())),axis = 1)
  return statistical_features

In [25]:
statistical_features_train = statistical_features('gdrive/MyDrive/CS626/Project/Data/train_Set.csv')

<class 'pandas.core.series.Series'>


49972it [02:46, 299.29it/s]


<class 'pandas.core.series.Series'>


In [26]:
np.count_nonzero(statistical_features_train[500])

314

In [27]:
statistical_features_test = statistical_features('gdrive/MyDrive/CS626/Project/Data/test_Set.csv')

<class 'pandas.core.series.Series'>


25413it [01:21, 312.24it/s]


<class 'pandas.core.series.Series'>


In [None]:
statistical_features_test.shape

(25413, 10000)

In [29]:
np.save(arr=statistical_features_test,file='gdrive/MyDrive/CS626/Project/Data/test_statistical_features.npy')

In [30]:
np.save(arr=statistical_features_train,file='gdrive/MyDrive/CS626/Project/Data/train_statistical_features.npy')

In [28]:
np.count_nonzero(statistical_features_test[0])

189