In [3]:
import numpy as np
import pandas as pd
import warnings
# Install a blanket 'ignore' filter: warnings of every category are suppressed
# from this point on.
warnings.simplefilter(action='ignore')
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Read the cleaned WikiHow CSV into a DataFrame
wikihow = pd.read_csv(
    filepath_or_buffer='./datasets/clean_wikihow_sep.csv',
)

In [5]:
# Restrict the frame to the six columns of interest, in this order
wikihow = wikihow.loc[
    :,
    ['text_id', 'sentence', 'is_summary', 'words', 'title', 'sentence_len'],
]

In [7]:
# Preview the first five rows
wikihow.head(n=5)

Unnamed: 0,text_id,sentence,is_summary,words,title,sentence_len
0,0,Sell yourself first,yes,"['Sell', 'yourself', 'first']",How to Sell Fine Art Online,3
1,0,"Before doing anything else, stop and sum up yo...",no,"['Before', 'doing', 'anything', 'else,', 'stop...",How to Sell Fine Art Online,12
2,0,"Now, think about how to translate that to an o...",no,"['Now,', 'think', 'about', 'how', 'to', 'trans...",How to Sell Fine Art Online,11
3,0,"Be it the few words, Twitter allows you or an ...",no,"['Be', 'it', 'the', 'few', 'words,', 'Twitter'...",How to Sell Fine Art Online,21
4,0,Bring out the most salient features of your cr...,no,"['Bring', 'out', 'the', 'most', 'salient', 'fe...",How to Sell Fine Art Online,18


## Surface Features

Surface features are usually based on the structure of documents or sentences, such as the position in the document or paragraph, the length (number of words in the sentence), or the number of quoted words in a sentence. The intuition is that a sentence is more important if its number of words, excluding stopwords, falls within a certain range x, and that sentences containing too many quoted words are less important.

In [8]:
# Precaution: discard every row that has a missing value in any column
wikihow = wikihow.dropna(axis=0, how='any')

## Content Features
### Computing TF-IDF sentence scores


In [9]:
# Group the sentences by text_id: one Python list of sentences per text,
# in the row order they have in `wikihow`
docs = (
    wikihow
    .groupby(by='text_id')['sentence']
    .apply(list)
)

In [11]:
# Preview the sentence lists of the first five texts
docs.head(n=5)

text_id
0    [Sell yourself first, Before doing anything el...
1    [Read the classics before 1600, Reading the cl...
2    [Join online artist communities, Depending on ...
3    [Make yourself public, Get yourself out there ...
4    [Blog about your artwork, Given the hundreds o...
Name: sentence, dtype: object

In [16]:
# Score every sentence by the sum of its TF-IDF term weights.
# The vectorizer is re-fitted on each text, so the vocabulary and IDF weights
# come from that text's sentences only (each sentence is one "document" for
# TfidfVectorizer). Scores are appended in the iteration order of `docs`.
tf_idf_list = []
vectorizer = TfidfVectorizer()
for doc in docs:
    # fit_transform returns a sparse (n_sentences x vocabulary) matrix. Summing it
    # along axis=1 gives one score per sentence without building a dense copy of
    # the matrix or looping over its rows in Python.
    sentence_term_matrix = vectorizer.fit_transform(doc)
    sentence_scores = np.asarray(sentence_term_matrix.sum(axis=1)).ravel()
    tf_idf_list.extend(sentence_scores)
# Sanity check: expected to equal the number of rows in `wikihow`
len(tf_idf_list)

7284000

In [18]:
# tf_idf_list is ordered by ascending text_id (groupby sorts its keys) and, within
# each text, by the original row order. A stable sort of the frame on text_id
# reproduces exactly that ordering, so its index labels each score with the row it
# belongs to. Assigning a Series aligns by label, which keeps scores on the right
# rows even when the rows of a text are not contiguous or ascending in `wikihow`.
score_index = wikihow.sort_values('text_id', kind='mergesort').index
wikihow['tfidf_score'] = pd.Series(tf_idf_list, index=score_index)

In [21]:
# Preview the first five rows, now including the tfidf_score column
wikihow.head(n=5)

Unnamed: 0,text_id,sentence,is_summary,words,title,sentence_len,tfidf_score
0,0,Sell yourself first,yes,"['Sell', 'yourself', 'first']",How to Sell Fine Art Online,3,1.725841
1,0,"Before doing anything else, stop and sum up yo...",no,"['Before', 'doing', 'anything', 'else,', 'stop...",How to Sell Fine Art Online,12,3.421057
2,0,"Now, think about how to translate that to an o...",no,"['Now,', 'think', 'about', 'how', 'to', 'trans...",How to Sell Fine Art Online,11,3.064425
3,0,"Be it the few words, Twitter allows you or an ...",no,"['Be', 'it', 'the', 'few', 'words,', 'Twitter'...",How to Sell Fine Art Online,21,4.393483
4,0,Bring out the most salient features of your cr...,no,"['Bring', 'out', 'the', 'most', 'salient', 'fe...",How to Sell Fine Art Online,18,3.500784


## Semantic Similarity Features 