In [1]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Load spaCy's 'en_core_web_sm' pipeline once; tokenize() and the summary cell both call it.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Read the train / test / validation splits, discarding the "id" and
# "highlights" columns from each one as it is loaded.
unused_cols = ["id", "highlights"]
df_tr, df_te, df_va = (
    pd.read_csv(csv_path).drop(columns=unused_cols)
    for csv_path in ("train.csv", "test.csv", "validation.csv")
)

In [4]:
# Stack the three splits (train, test, validation order) into one frame
# with a fresh 0..n-1 index.
splits = [df_tr, df_te, df_va]
df = pd.concat(splits, ignore_index=True)

In [5]:
def preprocess(text):
    """Lower-case ``text`` and collapse each whitespace run to one space.

    Leading and trailing whitespace is dropped as part of the same step,
    because ``str.split()`` with no arguments ignores it.
    """
    words = text.lower().split()
    return ' '.join(words)

In [6]:
def tokenize(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and return the
    text of every token that is not flagged as a stop word, in document order.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept.append(tok.text)
    return kept

In [7]:
# Keep only the first 10 rows (presumably to keep the spaCy passes quick).
# .copy() detaches the sample from the full frame: without it the column
# assignments in the following cells write into a slice returned by head(),
# which makes pandas emit SettingWithCopyWarning.
df = df.head(10).copy()

In [8]:
# Inspect the 10-row sample; at this point it holds only the 'article' column.
print(df)

                                             article
0  By . Associated Press . PUBLISHED: . 14:11 EST...
1  (CNN) -- Ralph Mata was an internal affairs li...
2  A drunk driver who killed a young woman in a h...
3  (CNN) -- With a breezy sweep of his pen Presid...
4  Fleetwood are the only team still to have a 10...
5  He's been accused of making many a fashion fau...
6  By . Daily Mail Reporter . PUBLISHED: . 01:15 ...
7  By . Daily Mail Reporter . This is the moment ...
8  There are a number of job descriptions waiting...
9  Canberra, Australia (CNN) -- At first glance, ...


In [9]:
# Normalise every article with preprocess(), then tokenize the normalised
# text; both results are stored as new columns next to 'article'.
cleaned_articles = df['article'].apply(preprocess)
df['processed_text'] = cleaned_articles
df['tokens'] = cleaned_articles.apply(tokenize)
print(df)

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   
2  A drunk driver who killed a young woman in a h...   
3  (CNN) -- With a breezy sweep of his pen Presid...   
4  Fleetwood are the only team still to have a 10...   
5  He's been accused of making many a fashion fau...   
6  By . Daily Mail Reporter . PUBLISHED: . 01:15 ...   
7  By . Daily Mail Reporter . This is the moment ...   
8  There are a number of job descriptions waiting...   
9  Canberra, Australia (CNN) -- At first glance, ...   

                                      processed_text  \
0  by . associated press . published: . 14:11 est...   
1  (cnn) -- ralph mata was an internal affairs li...   
2  a drunk driver who killed a young woman in a h...   
3  (cnn) -- with a breezy sweep of his pen presid...   
4  fleetwood are the only team still to have a 10...   
5  he's been accused of making many a fashion f

In [10]:
# Overwrite 'processed_text' with the stop-word-filtered tokens glued back
# together, one space between consecutive tokens.
df['processed_text'] = df['tokens'].apply(' '.join)
print(df)

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   
2  A drunk driver who killed a young woman in a h...   
3  (CNN) -- With a breezy sweep of his pen Presid...   
4  Fleetwood are the only team still to have a 10...   
5  He's been accused of making many a fashion fau...   
6  By . Daily Mail Reporter . PUBLISHED: . 01:15 ...   
7  By . Daily Mail Reporter . This is the moment ...   
8  There are a number of job descriptions waiting...   
9  Canberra, Australia (CNN) -- At first glance, ...   

                                      processed_text  \
0  . associated press . published : . 14:11 est ,...   
1  ( cnn ) -- ralph mata internal affairs lieuten...   
2  drunk driver killed young woman head - crash c...   
3  ( cnn ) -- breezy sweep pen president vladimir...   
4  fleetwood team 100 % record sky bet league 2 -...   
5  accused making fashion faux pas holiday . pr

In [11]:
# Lead-2 "summary": keep the first two sentences spaCy finds in each
# processed text.
# A sentence span's text carries no trailing whitespace, so the sentences are
# joined with a single space (joining with '' glued the last token of the
# first sentence onto the first token of the second). Building the string in
# one pass also avoids storing spaCy Span objects in the DataFrame.
df['summary'] = df['processed_text'].apply(
    lambda text: ' '.join(sent.text for sent in list(nlp(text).sents)[:2])
)
print(df)

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   
2  A drunk driver who killed a young woman in a h...   
3  (CNN) -- With a breezy sweep of his pen Presid...   
4  Fleetwood are the only team still to have a 10...   
5  He's been accused of making many a fashion fau...   
6  By . Daily Mail Reporter . PUBLISHED: . 01:15 ...   
7  By . Daily Mail Reporter . This is the moment ...   
8  There are a number of job descriptions waiting...   
9  Canberra, Australia (CNN) -- At first glance, ...   

                                      processed_text  \
0  . associated press . published : . 14:11 est ,...   
1  ( cnn ) -- ralph mata internal affairs lieuten...   
2  drunk driver killed young woman head - crash c...   
3  ( cnn ) -- breezy sweep pen president vladimir...   
4  fleetwood team 100 % record sky bet league 2 -...   
5  accused making fashion faux pas holiday . pr

In [12]:
# Persist only the 'summary' column (header row, no index) to cnn_summary.csv.
df.to_csv('cnn_summary.csv', columns=['summary'], index=False)