In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from nltk.corpus import words, stopwords

import pickle

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
DATA_PATH = '../data/2_preprocessed_data/'

## Chargement des données

In [4]:
bad_reviews = pd.read_csv(DATA_PATH + 'bad_reviews',
                          dtype={
                                 "review_id": str,
                                 "user_id": str,
                                 "business_id": str,
                                 "stars": 'uint8',
                                 "useful": 'int16',
                                 "funny": 'int16',
                                 "cool": 'int16',
                                 "text": str,
                                } 
                         )

In [5]:
good_reviews = pd.read_csv(DATA_PATH + 'good_reviews',
                          dtype={
                                 "review_id": str,
                                 "user_id": str,
                                 "business_id": str,
                                 "stars": 'uint8',
                                 "useful": 'int16',
                                 "funny": 'int16',
                                 "cool": 'int16',
                                 "text": str,
                                } 
                         )

In [6]:
sample_reviews = pd.read_csv(DATA_PATH + 'sample_reviews',
                          dtype={
                                 "review_id": str,
                                 "user_id": str,
                                 "business_id": str,
                                 "stars": 'uint8',
                                 "useful": 'int16',
                                 "funny": 'int16',
                                 "cool": 'int16',
                                 "text": str,
                                } 
                         )

## Corpus

In [7]:
corpus_bad    = ". ".join(list(bad_reviews.text))
corpus_good   = ". ".join(list(good_reviews.text))
corpus_sample = ". ".join(list(sample_reviews.text))

## Cleaning function

In [8]:
def preprocess_text(doc,
                 stop_words,
                 rejoin=False,
                 lem_or_stem="stem",
                 rare_words=None,
                 min_len_word=3,
                 force_is_alpha=True,
                 context=None,
                 english_words=None):
    """Process a text with selection of english words
    positionnal args :
    -------------------
    doc: str: the document to process
    
    optional args :
    ----------------
    rejoin: bool: if False return a list of words, else return the string of joined element of the list
    lem_or_stem : choice between lemmatize and stemmatize
    rare_words: list : list of rare words to exclude
    force_is_alpha: int: if 1 exclude all tokens with a numeric character
    english_words: list: list of english words accepted
    
    return:
    --------
    List of tokens if rejoin=False, the joined list if True.
    """
    if not rare_words:
        rare_words=[]
    
    doc = doc.lower().strip()
    
    tokenizer = RegexpTokenizer(r"\w+")
    raw_tokens_list = tokenizer.tokenize(doc)
    
    cleaned_tokens_list = [word for word in raw_tokens_list if word not in stop_words]
    
    non_rare_tokens = [word for word in cleaned_tokens_list if word not in rare_words]
    
    large_words = [word for word in non_rare_tokens if len(word) >= min_len_word]
    
    if force_is_alpha:
        alpha_tokens = [word for word in large_words if word.isalpha()]
    else:
        alpha_tokens = large_words
        
    if lem_or_stem == "lem":
        trans = WordNetLemmatizer()
        trans_text = [trans.lemmatize(i) for i in alpha_tokens]
    else:
        trans = PorterStemmer()
        trans_text = [trans.stem(i) for i in alpha_tokens]
        
    if context:
        new_text = [word for word in trans_text if word not in context]
    else:
        new_text = trans_text
        
    if english_words:
        english_text = [i for i in new_text if i in english_words]
    else:
        english_text = new_text
        
    if rejoin:
        return " ".join(english_text)
    else:
        return english_text

In [9]:
def display_tokens_info(tokens):
    """Display info about corpus"""
    print(f"Nb tokens : {len(tokens)}")
    print(f"Nb tokens uniques : {len(set(tokens))}")
    print(tokens[:30])

## Process corpus

In [10]:
stop_words = set(stopwords.words('english'))

In [11]:
corpus_bad_tokens = preprocess_text(corpus_bad, stop_words=stop_words)

In [12]:
display_tokens_info(corpus_bad_tokens)

Nb tokens : 368787
Nb tokens uniques : 11543
['late', 'post', 'loufest', 'group', 'decid', 'nourish', 'delici', 'greasi', 'food', 'bed', 'courtesi', 'diner', 'next', 'door', 'pack', 'opt', 'steak', 'shake', 'amaz', 'cheap', 'serious', 'everyth', 'like', 'littl', 'nicer', 'mcdonald', 'imo', 'got', 'frisco', 'melt']


In [13]:
corpus_good_tokens = preprocess_text(corpus_good, stop_words=stop_words)

In [14]:
display_tokens_info(corpus_good_tokens)

Nb tokens : 299513
Nb tokens uniques : 11642
['mickey', 'sweetheart', 'fun', 'spirit', 'sandwich', 'best', 'huge', 'sandwhich', 'critic', 'got', 'italian', 'wait', 'tri', 'london', 'broil', 'next', 'best', 'vegan', 'place', 'area', 'alway', 'come', 'new', 'stuff', 'keep', 'around', 'classic', 'know', 'anyth', 'tri']


In [15]:
corpus_sample_tokens = preprocess_text(corpus_sample, stop_words=stop_words)

In [16]:
display_tokens_info(corpus_sample_tokens)

Nb tokens : 1072372
Nb tokens uniques : 21825
['late', 'post', 'loufest', 'group', 'decid', 'nourish', 'delici', 'greasi', 'food', 'bed', 'courtesi', 'diner', 'next', 'door', 'pack', 'opt', 'steak', 'shake', 'amaz', 'cheap', 'serious', 'everyth', 'like', 'littl', 'nicer', 'mcdonald', 'imo', 'got', 'frisco', 'melt']


## Rare words

In [17]:
word_counts = pd.Series(corpus_sample_tokens).value_counts()
word_counts

food           16511
order          12392
place          11935
good           11897
time            8899
               ...  
perfecta           1
combinación        1
panni              1
willamett          1
flint              1
Length: 21825, dtype: int64

In [18]:
unique_words = word_counts[word_counts==1]

In [19]:
len(unique_words)

9215

In [20]:
unique_words = list(unique_words.index)

In [21]:
less_5_words = list(word_counts[word_counts<5].index)

In [22]:
len(less_5_words)

14494

In [23]:
less_10_words = list(word_counts[word_counts<10].index)

In [24]:
len(less_10_words)

16789

## Without Rare words

In [25]:
corpus_bad_tokens_2 = preprocess_text(corpus_bad,
                            stop_words=stop_words,
                            rare_words=less_5_words)

In [26]:
corpus_good_tokens_2 = preprocess_text(corpus_good,
                             stop_words=stop_words,
                             rare_words=less_5_words)

In [27]:
corpus_sample_tokens_2 = preprocess_text(corpus_sample,
                               stop_words=stop_words,
                               rare_words=less_5_words)

In [28]:
display_tokens_info(corpus_bad_tokens_2)

Nb tokens : 362250
Nb tokens uniques : 8902
['late', 'post', 'group', 'decid', 'nourish', 'delici', 'greasi', 'food', 'bed', 'courtesi', 'diner', 'next', 'door', 'pack', 'opt', 'steak', 'shake', 'amaz', 'cheap', 'serious', 'everyth', 'like', 'littl', 'nicer', 'mcdonald', 'imo', 'got', 'frisco', 'melt', 'cheap']


In [29]:
display_tokens_info(corpus_good_tokens_2)

Nb tokens : 289429
Nb tokens uniques : 8485
['mickey', 'sweetheart', 'fun', 'spirit', 'sandwich', 'best', 'huge', 'sandwhich', 'critic', 'got', 'italian', 'wait', 'tri', 'london', 'broil', 'next', 'best', 'vegan', 'place', 'area', 'alway', 'come', 'new', 'stuff', 'keep', 'around', 'classic', 'know', 'anyth', 'tri']


In [30]:
display_tokens_info(corpus_sample_tokens_2)

Nb tokens : 1043943
Nb tokens uniques : 13735
['late', 'post', 'group', 'decid', 'nourish', 'delici', 'greasi', 'food', 'bed', 'courtesi', 'diner', 'next', 'door', 'pack', 'opt', 'steak', 'shake', 'amaz', 'cheap', 'serious', 'everyth', 'like', 'littl', 'nicer', 'mcdonald', 'imo', 'got', 'frisco', 'melt', 'cheap']


### Supprimer le contexte

In [32]:
n = 20
context_words = [i for i in pd.Series(corpus_bad_tokens_2).value_counts().head(n).index
            if i in pd.Series(corpus_good_tokens_2).value_counts().head(n).index]
context_words

['order',
 'food',
 'time',
 'get',
 'place',
 'one',
 'like',
 'servic',
 'good',
 'back']

In [34]:
corpus_bad_tokens_3 = preprocess_text(corpus_bad,
                            stop_words=stop_words,
                            rare_words=less_5_words,
                            context=context_words)

In [35]:
corpus_good_tokens_3 = preprocess_text(corpus_good,
                             stop_words=stop_words,
                             rare_words=less_5_words,
                             context=context_words)

In [36]:
corpus_sample_tokens_3 = preprocess_text(corpus_sample,
                               stop_words=stop_words,
                               rare_words=less_5_words,
                               context=context_words)

## Export

Les traitements étant longs, on sauvegarde les corpus.

In [40]:
RESULTS_PATH = '../data/2_preprocessed_data/'

In [41]:
with open(RESULTS_PATH + "corpus_bad_tokens", "wb") as fp:
    pickle.dump(corpus_bad_tokens_2, fp)
    
with open(RESULTS_PATH + "corpus_good_tokens", "wb") as fp:
    pickle.dump(corpus_good_tokens_2, fp)
    
with open(RESULTS_PATH + "corpus_sample_tokens", "wb") as fp:
    pickle.dump(corpus_sample_tokens_2, fp)

In [43]:
with open(RESULTS_PATH + "corpus_bad_without_context", "wb") as fp:
    pickle.dump(corpus_bad_tokens_3, fp)
    
with open(RESULTS_PATH + "corpus_good_without_context", "wb") as fp:
    pickle.dump(corpus_good_tokens_3, fp)
    
with open(RESULTS_PATH + "corpus_sample_without_context", "wb") as fp:
    pickle.dump(corpus_sample_tokens_3, fp)