In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk.stem import LancasterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv('news.csv')

In [3]:
df.head()

Unnamed: 0,id,headline,text
0,uid-1,Market Advances 5.12 More,NEW YORK (AP) - A prime rate reduction by New ...
1,uid-2,District Boosts Request For Anti-Terrorism Aid...,Mayor Anthony A. Williams petitioned the White...
2,uid-3,"Election? Here's How You Do It, Mate.",From our downunder perspective here in Austral...
3,uid-4,The Biggest Boom Ever,We are about to rewrite history. Unless a rece...
4,uid-5,Economic Aide Sees Uptrend,"Sedate and scholarly Dr. Arthur Burns, the ex-..."


In [4]:
ids = df.id.tolist()

In [5]:
df['data'] = df['headline'] + ' ' + df['text']

In [6]:
stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
stemmer = LancasterStemmer()

def clean(text):
    text = text.lower()
    text = text.replace("<br>", '')
    text = text.replace("</br>", '')
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)
    # keep alphabets only
    text = re.sub('[^a-zA-Z ?!]+', '', text)
    # tokenize
    text_token = word_tokenize(text)
    # stopwords removal
    words = [word for word in text_token if word not in stop_words]
    # converting words to basic form 
    stemmed_words = [lemma.lemmatize(word) for word in words]
#     stemmed = [stemmer.stem(word) for word in stemmed_words]
    stemmed = [x for x in stemmed_words if len(x) > 2]
    # join list elements to string
    clean_text = " ".join(stemmed)
    return clean_text

In [7]:
df['data'] = df['data'].apply(lambda x: clean(x))

In [16]:
df['data'][0]

'market advance new york prime rate reduction new york first national city bank helped stock market close busiest week history modest advance friday dow jones average industrial stock gained finishing week withanetrise point big board volume million share fell good deal short matching thursday time high million day figure enough push total week record million share previous peak million last week january dow advanced two first session new year friday close gained point since new year'

In [9]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english', max_features=3, max_df=0.001)

data = vectorizer.fit_transform(df.data)              
data1 = data

In [10]:
data.shape

(3000, 3)

In [11]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2, random_state=0)
new_data = svd.fit_transform(data)

In [12]:
kmeans = KMeans(n_clusters=2, random_state=42, max_iter=500)
kmeans.fit(new_data)
label = kmeans.labels_
print(kmeans.labels_, kmeans.inertia_, kmeans )

[0 0 0 ... 0 0 0] 2.9969969969969976 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=500,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)


In [13]:
new_df = pd.DataFrame(columns=['id', 'cluster'])
new_df['id'] = ids
new_df['cluster'] = label
new_df.to_csv('submission.csv', index=False)

In [14]:
new_data.shape

(3000, 2)

In [15]:
np.savetxt('submission.txt', new_data)