# Text manipulation

Hello everyone! In this section, we will learn how to manipulate text data using `TextBlob` and `scikit-learn`. In particular, we will use these packages to clean, format, and transform our text data into simpler text and vector representations.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from textblob import TextBlob as tb
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Rebind `stopwords` from the NLTK corpus reader imported above to the plain
# list of English stop words; clean_tweets() reads this module-level name.
# NOTE: this needs the NLTK "stopwords" corpus to be available locally
# (e.g. via nltk.download('stopwords')).
stopwords = stopwords.words('english')

In [None]:
# Read our tweets from the previously created CSV.
# The first row is used as the column names (header=0) and no column is used
# as the index (index_col=None). clean_tweets() below expects a `text` column.
tweets = pd.read_csv('tweets.csv', index_col=None, header=0)
# Preview the first rows.
tweets.head()

In [None]:
def clean_tweets(tweets, stop_words=None):
    """
    Add a normalised ``cleaned_text`` column derived from ``tweets['text']``.

    Processing steps, in order:

    1. missing tweets are replaced with empty strings;
    2. text is lower-cased;
    3. URLs, @mentions, the stand-alone retweet marker ``rt`` and every
       remaining character that is not an ASCII letter, digit, space or tab
       are removed;
    4. the text is split on whitespace (which also discards leading,
       trailing and repeated whitespace), stop words are dropped, and the
       remaining words are re-joined with single spaces.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with a ``text`` column. It is modified in place: the
        ``cleaned_text`` column is added or overwritten.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the module-level ``stopwords``
        collection is used.

    Returns
    -------
    pandas.DataFrame
        The same ``tweets`` object that was passed in.
    """
    # A set gives O(1) membership tests instead of scanning a list per word.
    stop_set = set(stopwords if stop_words is None else stop_words)

    # Alternatives are tried left to right at each position:
    #   \w+://\S+        URLs
    #   @\w+             mentions (handles may contain underscores)
    #   \brt\b           the retweet marker as a whole word only, so that
    #                    words such as "party" or "start" are left intact
    #   [^0-9A-Za-z \t]  any other special character
    noise_pattern = r'\w+://\S+|@\w+|\brt\b|[^0-9A-Za-z \t]'

    cleaned = tweets['text'].fillna('').str.lower()
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # (non-regex) replacement, which would silently leave the text untouched.
    cleaned = cleaned.str.replace(noise_pattern, '', regex=True)

    tweets['cleaned_text'] = cleaned.apply(
        lambda text: ' '.join(
            word for word in text.split() if word not in stop_set
        )
    )
    return tweets

In [None]:
# Clean tweets.
# clean_tweets() adds the `cleaned_text` column to the frame it is given and
# returns that same frame, so `cleaned_tweets` and `tweets` refer to one object.
cleaned_tweets = clean_tweets(tweets)
# Preview the first rows, now including `cleaned_text`.
cleaned_tweets.head()

In [None]:
# Export the cleaned tweets to CSV, without writing the row index (index=False).
# NOTE: tweets whose cleaned text is empty are written as empty fields, which
# pandas.read_csv loads back as NaN by default.
cleaned_tweets.to_csv('cleaned_tweets.csv', index=False)

In [None]:
def tweets_to_dtm(tweets):
    """
    Build a document-term count matrix from the cleaned tweets.

    A ``CountVectorizer`` with default settings is fitted on the
    ``cleaned_text`` column of ``tweets``.

    Returns a ``(matrix, vectorizer)`` tuple: the transformed counts and
    the fitted vectorizer that produced them.
    """
    count_vectorizer = CountVectorizer()
    term_counts = count_vectorizer.fit_transform(tweets['cleaned_text'])
    return term_counts, count_vectorizer

def tweets_to_ngram(tweets, n=2):
    """
    Build an n-gram count matrix from the cleaned tweets.

    A ``CountVectorizer`` restricted to n-grams of exactly ``n`` tokens
    (``ngram_range=(n, n)``) is fitted on the ``cleaned_text`` column of
    ``tweets``. ``n`` defaults to 2, i.e. bigrams.

    Returns a ``(matrix, vectorizer)`` tuple: the transformed counts and
    the fitted vectorizer that produced them.
    """
    ngram_vectorizer = CountVectorizer(
        min_df=1,
        # Accept tokens of any length, including single characters.
        token_pattern=r'\b\w+\b',
        ngram_range=(n, n),
    )
    ngram_counts = ngram_vectorizer.fit_transform(tweets['cleaned_text'])
    return ngram_counts, ngram_vectorizer

def tweets_to_tfidf(tweets):
    """
    Build a TF-IDF matrix from the cleaned tweets.

    A ``TfidfVectorizer`` with default settings is fitted on the
    ``cleaned_text`` column of ``tweets``.

    Returns a ``(matrix, vectorizer)`` tuple: the transformed weights and
    the fitted vectorizer that produced them.
    """
    tfidf_vectorizer = TfidfVectorizer()
    weights = tfidf_vectorizer.fit_transform(tweets['cleaned_text'])
    return weights, tfidf_vectorizer

In [None]:
# Get document-term matrix and show its (documents, terms) shape.
# The shape is read straight from the sparse matrix; converting it to a dense
# array first would allocate the whole matrix only to discard it.
dtm, dtm_v = tweets_to_dtm(cleaned_tweets)
dtm.shape

In [None]:
# Peek at five (term, column index) entries of the fitted vocabulary,
# in the dictionary's own iteration order.
list(dtm_v.vocabulary_.items())[0:5]

In [None]:
# Get bigram matrix and show its (documents, bigrams) shape.
# The shape is read straight from the sparse matrix; converting it to a dense
# array first would allocate the whole matrix only to discard it.
bigram, ngram_v = tweets_to_ngram(cleaned_tweets, n=2)
bigram.shape

In [None]:
# Peek at five (bigram, column index) entries of the fitted vocabulary,
# in the dictionary's own iteration order.
list(ngram_v.vocabulary_.items())[0:5]

In [None]:
# Get TF-IDF matrix and show its (documents, terms) shape.
# The shape is read straight from the sparse matrix; converting it to a dense
# array first would allocate the whole matrix only to discard it.
tfidf, tfidf_v = tweets_to_tfidf(cleaned_tweets)
tfidf.shape

In [None]:
# Peek at five (term, column index) entries of the fitted vocabulary,
# in the dictionary's own iteration order.
list(tfidf_v.vocabulary_.items())[0:5]