In [1]:
!pip install texthero

Collecting texthero
  Downloading https://files.pythonhosted.org/packages/1f/5a/a9d33b799fe53011de79d140ad6d86c440a2da1ae8a7b24e851ee2f8bde8/texthero-1.0.9-py3-none-any.whl
Collecting nltk>=3.3
[?25l  Downloading https://files.pythonhosted.org/packages/5e/37/9532ddd4b1bbb619333d5708aaad9bf1742f051a664c3c6fa6632a105fd8/nltk-3.6.2-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 11.6MB/s 
Collecting unidecode>=1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 40.8MB/s 
Installing collected packages: nltk, unidecode, texthero
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.6.2 texthero-1.0.9 unidecode-1.2.0


In [6]:
import texthero as hero
import pandas as pd

In [16]:
# Sample sentence with a run of spaces, punctuation and digits, so that each
# preprocessing call below has something visible to act on.
text = "Hello World!      It's a lovely day. It's 2:30"

# Wrap the sentence in a one-element Series, which is what the texthero calls below receive.
series = pd.Series([text])


In [12]:
# Display the one-element Series holding the sample sentence.
# NOTE(review): the saved output below appears to come from an earlier execution
# (In [12]) than the current definition of `series` (In [16]) - its spacing differs
# from `text`; re-run the notebook top to bottom to refresh it.
series

0    Hello World! It's a lovely day. It's 2:30
dtype: object

In [17]:
# Remove digits: in the sample, the numbers in "2:30" are stripped and only " : " remains.
# The call is the cell's last expression, so the notebook displays the result
# directly (consistent with the other preprocessing cells) without print().
hero.remove_digits(series)

0    Hello World!      It's a lovely day. It's  : 
dtype: object


In [18]:
# Remove punctuation: as the output shows, each punctuation mark ("!", "'", ".", ":")
# is replaced by a space rather than deleted, so word boundaries are preserved.
hero.remove_punctuation(series)

0    Hello World       It s a lovely day  It s 2 30
dtype: object

In [19]:
# Remove brackets: the sample sentence contains no brackets, so the output
# is identical to the input here.
hero.remove_brackets(series)

0    Hello World!      It's a lovely day. It's 2:30
dtype: object

In [20]:
# Remove extra whitespace: the run of spaces after "Hello World!" is collapsed
# to a single space in the output.
hero.remove_whitespace(series)

0    Hello World! It's a lovely day. It's 2:30
dtype: object

In [21]:
# Remove stopwords: in the output the "s" of "It's" and the article "a" are dropped,
# while punctuation, digits and spacing are left as they were.
hero.remove_stopwords(series)

0    Hello World!      It'  lovely day. It' 2:30
dtype: object

In [22]:
# All preprocessing in one call: the output is lower-cased and has digits,
# punctuation, stopwords and surplus whitespace removed ("hello world lovely day").
hero.clean(series)

0    hello world lovely day
dtype: object

**EXAMPLE**

In [23]:
# Load the BBC Sport dataset from the CSV hosted in the texthero GitHub repository
# (requires network access) and preview the first rows; the preview shows a
# `text` column with the article body and a `topic` column with its label.
df = pd.read_csv("https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv")
df.head()

Unnamed: 0,text,topic
0,Claxton hunting first major medal\n\nBritish h...,athletics
1,O'Sullivan could run in Worlds\n\nSonia O'Sull...,athletics
2,Greene sets sights on world title\n\nMaurice G...,athletics
3,IAAF launches fight against drugs\n\nThe IAAF ...,athletics
4,"Dibaba breaks 5,000m world record\n\nEthiopia'...",athletics


In [24]:
# Pipelining: chain the texthero steps with Series.pipe, one stage per line,
# and store the resulting vectors in a new `pca` column.
df['pca'] = (
    df['text']
    .pipe(hero.clean)   # normalise the raw article text
    .pipe(hero.tfidf)   # vectorise the cleaned text with TF-IDF
    .pipe(hero.pca)     # project the TF-IDF vectors with PCA
)

In [26]:
# Preview the frame again: it now carries the `pca` column added by the pipeline
# cell, holding a two-element vector per article (see output).
df.head()

Unnamed: 0,text,topic,pca
0,Claxton hunting first major medal\n\nBritish h...,athletics,"[-0.0910334074574728, 0.10350070748170347]"
1,O'Sullivan could run in Worlds\n\nSonia O'Sull...,athletics,"[-0.00042734981960925383, 0.024815010736742884]"
2,Greene sets sights on world title\n\nMaurice G...,athletics,"[-0.1176181352852282, 0.12872105684967863]"
3,IAAF launches fight against drugs\n\nThe IAAF ...,athletics,"[-0.09137951393185792, 0.1540154027612948]"
4,"Dibaba breaks 5,000m world record\n\nEthiopia'...",athletics,"[-0.0912887362740055, 0.13499639793867965]"


In [25]:
# Visualizing: scatter plot of the per-article `pca` vectors, with points
# colored by the `topic` label of each article.
hero.scatterplot(df, 'pca', color='topic', title="PCA BBC Sport news")

In [27]:
# Reload the dataset so this cell starts from the raw CSV and does not depend
# on columns added to `df` by earlier cells.
df = pd.read_csv("https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv")

# Clean the article text and turn each article into a TF-IDF vector.
df['tfidf'] = (df['text'].pipe(hero.clean)
                         .pipe(hero.tfidf)
              )

# Kmeans: cluster the TF-IDF vectors into 5 groups.
# random_state pins the otherwise random centroid initialisation, so the
# cluster labels (and the plot colors derived from them) are reproducible.
# The labels are cast to str - presumably so the plot treats them as discrete
# categories rather than a numeric scale; confirm against hero.scatterplot.
df['kmeans_labels'] = (df['tfidf'].pipe(hero.kmeans, n_clusters=5, random_state=42)
                                  .astype(str)
                      )

# Project the same TF-IDF vectors with PCA for plotting.
df['pca'] = df['tfidf'].pipe(hero.pca)

df.head()

Unnamed: 0,text,topic,tfidf,kmeans_labels,pca
0,Claxton hunting first major medal\n\nBritish h...,athletics,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,"[-0.0911593774447469, 0.10345322152487348]"
1,O'Sullivan could run in Worlds\n\nSonia O'Sull...,athletics,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,"[-0.0003157403280008015, 0.02494102027850467]"
2,Greene sets sights on world title\n\nMaurice G...,athletics,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0533678197008...",2,"[-0.11768140287999865, 0.12864372053175632]"
3,IAAF launches fight against drugs\n\nThe IAAF ...,athletics,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4,"[-0.09130415758034965, 0.15400996830449803]"
4,"Dibaba breaks 5,000m world record\n\nEthiopia'...",athletics,"[0.24734311047947527, 0.0, 0.0, 0.0, 0.0, 0.0,...",2,"[-0.09132734350751462, 0.13501836583864646]"


In [28]:
# Scatter plot of the per-article `pca` vectors, this time colored by the
# k-means cluster label instead of the true topic.
hero.scatterplot(df, 'pca', color='kmeans_labels', title="K-means BBC Sport news")