#### Text Representation 

In [1]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the reviews dataset; later cells read and clean its 'review' text column.
df = pd.read_csv('reviews.csv')

##### Basic Text Preprocessing

In [3]:
## text-cleaning helpers: HTML tags, URLs, punctuation, spelling, emoji, tokenising, stemming, lemmatising
import re 
import spacy
from nltk.stem.porter import PorterStemmer
nlp = spacy.load('en_core_web_sm')
from nltk.stem.porter import PorterStemmer

import string
def remove_html_tag(text):
    """Return ``text`` with every ``<...>`` span deleted.

    A span is a ``<``, at least one further character, and then the shortest
    run of characters up to the next ``>``. ``.`` does not match a newline
    here, so a span never crosses a line break.
    """
    return re.sub('<..*?>', '', text)

def remove_url(text):
    """Return ``text`` with URLs removed.

    Two forms are recognised: anything starting with ``http://`` or
    ``https://``, and anything starting with ``www.``. In both cases the match
    runs up to the next whitespace character.

    The previous pattern used ``\\s+`` (whitespace) where ``\\S+``
    (non-whitespace) was intended and had no alternation between the two
    forms, so it never matched a real URL.
    """
    # Raw string so the regex escapes are not interpreted by Python first.
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', text)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)
    
def correct_spelling(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain ``str``.

    ``TextBlob`` was not imported anywhere in the notebook, so calling this
    function raised ``NameError``. It is imported here, at call time, so the
    dependency is only needed when spelling correction is actually used.
    """
    from textblob import TextBlob

    return str(TextBlob(text).correct())

    
def remove_emoji(text):
    """Return ``text`` with every character in the listed code-point ranges removed.

    Note that the final range (U+24C2 through U+1F251) is very wide: besides
    emoji it also covers, for example, the CJK ideograph block, so such
    characters are stripped as well.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    # One character class, repeated, so a run of matching characters is removed in one go.
    emoji_pattern = re.compile("[" + code_point_ranges + "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', text)

def tokenise(text):
    """Return the text of each token the module-level spaCy pipeline ``nlp`` yields for ``text``."""
    doc = nlp(text)
    return [tok.text for tok in doc]

# Single shared stemmer instance, reused by every stem_words call.
ps = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated word of ``text`` and rejoin them with single spaces."""
    stems = map(ps.stem, text.split())
    return " ".join(stems)
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def lemmatize_sentence(sentence):
    """Print a two-column table of each token in ``sentence`` and its verb lemma.

    The sentence is split with ``word_tokenize``; tokens found inside the
    string ``"?:!.,;"`` are skipped; the rest are lemmatized with
    ``pos='v'``. Nothing is returned - the table is written to stdout.
    """
    lemmatizer = WordNetLemmatizer()
    skipped = "?:!.,;"

    # Collect all (token, lemma) pairs first so the table is only printed
    # once every token has been processed.
    pairs = []
    for token in word_tokenize(sentence):
        if token in skipped:
            continue
        pairs.append((token, lemmatizer.lemmatize(token, pos='v')))

    row = "{0:20}{1:20}"
    print(row.format("Word", "Lemma"))
    for token, lemma in pairs:
        print(row.format(token, lemma))

In [4]:
# Clean the review text in one chain: strip HTML tags, then punctuation,
# then emoji, and finally lowercase everything.
df['review'] = (
    df['review']
    .apply(remove_html_tag)
    .apply(remove_punctuation)
    .apply(remove_emoji)
    .str.lower()
)

##### One Hot Encoding ######


In [5]:
# One-hot encode the 'review' column as a whole: each distinct cleaned review
# string is treated as one category. drop='first' omits the first category's
# column and sparse_output=False returns a dense array.
encoder = OneHotEncoder(sparse_output = False, drop='first')

new_array = encoder.fit_transform(df[['review']])

In [6]:
# Wrap the one-hot encoded array in a DataFrame for inspection.
new_df = pd.DataFrame(new_array)

In [7]:
# (rows, columns) of the one-hot encoded frame.
new_df.shape

(50000, 49579)

##### Bag of Words

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Bag-of-words vectoriser with default settings.
vector_bag = CountVectorizer()

In [10]:
# Learn the vocabulary from the cleaned reviews and build the document-term count matrix.
new_array_2 = vector_bag.fit_transform(df['review'])

In [11]:
from sklearn.decomposition import TruncatedSVD
# Reduce the bag-of-words matrix to 100 components.
# random_state is fixed because TruncatedSVD's default solver is randomized;
# without a seed the reduced values change from run to run.
svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(new_array_2)

In [12]:
# Wrap the SVD-reduced bag-of-words features in a DataFrame for inspection.
new_df_2 = pd.DataFrame(X_reduced)

In [13]:
# Preview the first rows of the reduced features.
new_df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,23.755996,-2.233298,-1.06215,3.155375,-1.225857,-0.229157,-0.91114,-2.189572,-1.14783,-1.815437,...,-0.01107,0.315686,-0.826967,0.402342,-0.576245,0.110998,0.766842,0.785078,0.007232,-0.407543
1,17.903649,4.507064,0.43998,-1.877099,-3.492106,-1.463537,-2.345386,-1.237719,-2.866922,-0.517796,...,-0.14264,-0.384211,-0.520074,0.309011,0.289558,0.200315,-0.600555,0.108689,0.342573,0.407225
2,13.329773,-1.492844,-0.47198,0.141239,0.169426,0.059844,1.79468,0.045424,0.895415,-0.183727,...,-0.037936,-0.176532,-0.393913,0.50153,0.170796,-0.154089,-0.840033,0.688515,-0.722161,0.442541
3,9.683612,-0.583862,1.407908,-0.472852,-0.008746,-0.815297,-0.33194,1.546669,-0.26611,1.32853,...,0.470967,0.594937,0.150265,0.062508,0.743831,0.654234,-0.194971,0.074629,0.323258,0.44614
4,22.833998,3.932505,0.758755,3.244718,-0.175937,-1.858452,-1.273997,1.320844,1.756249,1.234114,...,-0.194423,-0.301579,-0.346838,-0.151163,-0.216361,0.101837,-0.449467,0.108685,0.144174,-0.768871


##### N Gram

In [14]:
# Count vectoriser over unigrams, bigrams and trigrams (ngram_range 1 to 3).
vector_bag2 = CountVectorizer(ngram_range = (1, 3))

In [15]:
# Build the n-gram count matrix from the cleaned reviews.
new_array_3 = vector_bag2.fit_transform(df['review'])

In [16]:
# (documents, distinct n-grams) of the n-gram count matrix.
new_array_3.shape

(50000, 9523452)

In [17]:
from sklearn.decomposition import TruncatedSVD
# Reduce the n-gram matrix to 100 components.
# This cell previously reduced new_array_2 (the unigram bag-of-words matrix)
# a second time instead of new_array_3, the n-gram matrix built just above.
# random_state is fixed because the default solver is randomized.
# NOTE: new_array_3 has far more columns than new_array_2, so this fit costs
# considerably more time and memory than the bag-of-words reduction.
svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(new_array_3)

In [19]:
# (rows, components) of the SVD-reduced matrix.
X_reduced.shape

(50000, 100)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# TF-IDF vectoriser with default settings.
idf = TfidfVectorizer()

In [21]:
# Learn the vocabulary and IDF weights from the cleaned reviews and build the TF-IDF matrix.
new_array_4 = idf.fit_transform(df['review'])

In [22]:
# (documents, vocabulary size) of the TF-IDF matrix.
new_array_4.shape

(50000, 221441)