In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model, svm
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
import re

In [2]:
# Load the training set and keep only the first 100 descriptions as a sample.
# .copy() detaches the sample from `data`, so the cleaning cells below work on
# their own Series instead of writing through to (a slice of) the DataFrame.
data = pd.read_csv("data/train.csv")
description = data['Description'].iloc[:100].copy()

In [3]:
# Convert every description to lower case.
# The vectorized .str accessor leaves missing descriptions (NaN) untouched and
# replaces the per-element loop over Series.iteritems(), which was removed in
# pandas 2.0. Rebinding the name keeps the cell free of element-wise writes.
description = description.str.lower()

In [4]:
# Remove every run of digits from each description (numbers are dropped, not
# spelled out as words). NaN entries pass through the .str accessor unchanged.
# Series.iteritems() is gone in pandas 2.0, hence the vectorized replace.
description = description.str.replace(r'\d+', '', regex=True)

In [5]:
# Remove punctuation and symbols: every character that is neither a word
# character (\w) nor whitespace (\s). Underscores and precomposed accented
# letters such as 'é' are word characters, so they are kept.
# The pattern is a raw string because '\w' and '\s' inside a plain literal are
# invalid escape sequences and produce a warning on recent Python versions.
# NaN entries pass through the .str accessor unchanged.
description = description.str.replace(r'[^\w\s]', '', regex=True)

In [6]:
# Trim leading and trailing whitespace from each description; whitespace
# inside the text is kept. NaN entries pass through unchanged.
description = description.str.strip()

In [7]:
# Tokenize each description and drop English stop words.
# (Removal of sparse terms or other particular words is not implemented here.)
# Requires the NLTK 'stopwords' corpus and the tokenizer data used by
# word_tokenize to be available locally (see nltk.download).
import nltk #nltk represents natural language toolkit
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize


def _tokenize_without_stop_words(text):
    """Return the tokens of `text` that are not English stop words.

    Anything that is not a string (missing descriptions are float NaN, and
    rows that were already tokenized are lists) is returned unchanged, which
    also makes the cell safe to re-run.
    """
    if not isinstance(text, str):
        return text
    return [token for token in word_tokenize(text) if token not in stop_words]


# Build a new Series rather than assigning lists element by element through
# Series.iteritems(), which no longer exists in pandas 2.0.
description = description.map(_tokenize_without_stop_words)

In [8]:
# Text canonicalization: stemming (reduce each token to its Porter stem).
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()

# Produce a fresh token list per row instead of mutating the stored lists in
# place via description[i][index]; this also avoids Series.iteritems(), which
# was removed in pandas 2.0. Rows that are not token lists (e.g. NaN for a
# missing description) are passed through unchanged.
description = description.map(
    lambda tokens: [stemmer.stem(token) for token in tokens]
    if isinstance(tokens, list) else tokens
)

In [9]:
# Text canonicalization: lemmatization (map each token to its base form).
# NOTE(review): the tokens reaching this cell were already Porter-stemmed by
# the previous cell, so the lemmatizer sees stems such as 'nibbl'; presumably
# most of them are not dictionary words and come back unchanged -- confirm
# whether stemming OR lemmatization (not both) was intended.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() # uses lexical knowledge bases to get the correct base forms of words.

# Produce a fresh token list per row instead of mutating the stored lists in
# place; this also avoids Series.iteritems(), removed in pandas 2.0. Rows that
# are not token lists (e.g. NaN for a missing description) pass through.
description = description.map(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
    if isinstance(tokens, list) else tokens
)

In [10]:
# Display the cleaned, tokenized descriptions produced by the cells above.
description

0     [nibbl, month, old, ball, cute, energet, play,...
1     [found, alon, yesterday, near, apart, shake, b...
2     [pregnant, mother, dump, irrespons, owner, roa...
3     [good, guard, dog, alert, activ, obedi, wait, ...
4     [handsom, yet, cute, boy, adopt, play, pal, we...
5               [stray, kitten, came, hous, feed, keep]
6     [anyon, within, area, ipoh, taip, interest, ad...
7     [siu, pak, give, birth, puppi, interest, pl, c...
8     [healthi, activ, feisti, kitten, found, neighb...
9     [manja, gentl, stray, cat, found, would, reall...
10       [seriou, adopt, plea, sm, call, detail, thank]
11    [kali, super, play, kitten, go, minut, wake, q...
12    [peanut, abus, puppi, rescu, scare, peopl, hap...
13    [hi, pet, lover, first, post, need, help, mont...
14    [lost, dog, found, bandar, menjalara, kepongta...
15    [move, apart, land, home, mani, friendli, stra...
16                               [spay, ador, friendli]
17    [she, activ, obey, wht, command, u, told, 

# Alternatively: bag-of-words counts with scikit-learn's CountVectorizer

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# Reload the raw data so this approach starts from the untouched text, and
# keep the first 100 descriptions as the corpus.
# .copy() detaches the sample from `data`, so filling in missing values later
# does not write through to (a slice of) the DataFrame.
data = pd.read_csv("data/train.csv")
corpus = data['Description'].iloc[:100].copy()

In [21]:
# Bag-of-words vectorizer configured with scikit-learn's built-in English
# stop-word list; all other options are left at their defaults.
vectorizer = CountVectorizer(stop_words='english')

In [22]:
# Replace missing descriptions (NaN) with empty strings so that every corpus
# entry is a str before vectorizing. fillna works regardless of the index
# labels, unlike corpus[i] with i in range(len(corpus)), which only happens to
# work while the labels are exactly 0..n-1.
corpus = corpus.fillna("")

In [23]:
# Learn the vocabulary from the corpus and build the document-term count matrix.
X = vectorizer.fit_transform(corpus)

In [24]:
# List the learned vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# Convert to plain Python strings so the cell still displays a list of str.
[str(name) for name in _get_names()]

['10',
 '11',
 '11a',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '19',
 '1g',
 '1st',
 '23',
 '24th',
 '25',
 '2female',
 '2month',
 '2months',
 '2nd',
 '2years',
 '30mins',
 '30th',
 '45',
 '6puppies',
 '6th',
 '9th',
 '________________________________________________',
 '___________________________________________________',
 'abandoned',
 'abang',
 'able',
 'abroad',
 'absolutely',
 'abused',
 'acceptable',
 'accident',
 'act',
 'action',
 'active',
 'actually',
 'ada',
 'addopt',
 'address',
 'adjustable',
 'adopt',
 'adopted',
 'adopter',
 'adopters',
 'adopting',
 'adoption',
 'adopts',
 'adorable',
 'adult',
 'adventurous',
 'affection',
 'affectionate',
 'afraid',
 'age',
 'ago',
 'agree',
 'alert',
 'alertness',
 'alike',
 'allow',
 'allowed',
 'almond',
 'amat',
 'amer',
 'american',
 'angel',
 'ank',
 'anybody',
 'anyybody',
 'apartment',
 'apologise',
 'app',
 'appear',
 'apply',
 'appreciate',
 'apr',
 'ard',
 'area',
 'aren',
 'arranged',
 'aside',
 'ask',
 'asked',
 'asyi

In [25]:
# Display the vectorizer to inspect its full configuration.
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [26]:
# Encode the corpus with the already-fitted vocabulary.
# NOTE(review): this looks equivalent to `X` from the fit_transform cell on
# the same corpus -- confirm, and reuse `X` if so.
bag_of_words = vectorizer.transform(corpus)

In [27]:
# Sum the counts over all documents: one total per vocabulary term.
sum_words = bag_of_words.sum(axis=0) 

In [28]:
# Sanity check: a single row with one column per vocabulary term.
sum_words.shape

(1, 1224)

In [29]:
# Pair each vocabulary term with its total count across the corpus;
# vocabulary_ maps a term to its column index in the count matrix.
words_freq = [
    (term, sum_words[0, column])
    for term, column in vectorizer.vocabulary_.items()
]

In [30]:
# Order the (term, count) pairs from most to least frequent. The sort runs on
# a fresh list and is stable, so ties keep their previous relative order.
words_freq = list(words_freq)
words_freq.sort(key=lambda entry: entry[1], reverse=True)

In [31]:
# Display the (term, count) pairs, most frequent first.
words_freq

[('home', 40),
 ('adoption', 33),
 ('dog', 25),
 ('good', 22),
 ('adopt', 21),
 ('cat', 21),
 ('old', 20),
 ('care', 18),
 ('puppies', 18),
 ('active', 18),
 ('interested', 16),
 ('contact', 16),
 ('puppy', 16),
 ('playful', 15),
 ('looking', 15),
 ('cute', 15),
 ('friendly', 15),
 ('just', 14),
 ('mother', 14),
 ('healthy', 14),
 ('loving', 14),
 ('male', 14),
 ('rescued', 13),
 ('adopter', 13),
 ('months', 12),
 ('pls', 12),
 ('love', 12),
 ('needs', 12),
 ('need', 12),
 ('kittens', 12),
 ('adopted', 12),
 ('cats', 11),
 ('kitten', 11),
 ('vaccination', 11),
 ('days', 11),
 ('female', 11),
 ('color', 11),
 ('new', 11),
 ('kitty', 10),
 ('dogs', 10),
 ('like', 10),
 ('house', 10),
 ('trained', 10),
 ('bull', 10),
 ('people', 9),
 ('area', 9),
 ('email', 9),
 ('family', 9),
 ('started', 9),
 ('time', 8),
 ('adorable', 8),
 ('long', 8),
 ('adopters', 8),
 ('hair', 8),
 ('food', 8),
 ('toilet', 8),
 ('breed', 8),
 ('month', 7),
 ('little', 7),
 ('owner', 7),
 ('really', 7),
 ('think', 7)