In [1]:
!pwd

/Users/ronlodetti/Documents/Flatiron/capstone/airline_sentiment_analysis/hidden


In [None]:
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import src.code
%load_ext autoreload
%autoreload 2

In [None]:
# Load only the columns used in this analysis.
df = pd.read_csv('../data/Airline_review.csv')[['Review_Title','Review','Recommended']]

# Counting number of words and characters in original review
# (vectorized .str accessors replace the per-row lambdas)
df['word_count'] = df['Review'].str.split().str.len()
df['char_count'] = df['Review'].str.len()

# Process and add the review titles after removing quotes
df['review_title'] = df['Review_Title'].str.replace('"', '', regex=False)

# Combine review titles and content, then clean and tokenize
# NOTE(review): clean_tokenize was referenced as a bare name, which `import src.code`
# does not bind (NameError on a fresh kernel); presumably it is defined in src.code — confirm.
combined_text = df['review_title'] + ' ' + df['Review']
df['tokens'] = combined_text.apply(src.code.clean_tokenize)

# Join the tokens back into a cleaned review
df['clean_review'] = df['tokens'].apply(' '.join)

# Rename without inplace=True so the cell produces a new frame instead of mutating in place,
# then keep only the columns used downstream.
df = df.rename(columns={'Recommended': 'recommended','Review':'review'})
df = df[['review','clean_review','tokens','word_count','char_count','recommended']].copy()

In [None]:
# Preview the first rows of the prepared review frame.
df.head()

In [None]:
# Print the frame's column summary (dtypes and non-null counts).
df.info()

# Descriptive Analysis
- Ratio of "yes" to "no"
- Word and character count
- Most common words & phrases
- Vectorization (e.g., TF-IDF, word embeddings)

### Ratio of "Yes" to "No"

In [None]:
# Share of each 'recommended' label; normalize=True returns proportions instead of raw counts.
df['recommended'].value_counts(normalize=True)

### Word and Character Length

In [None]:
# Distribution of the per-review word counts.
sns.histplot(data=df, x='word_count')

In [None]:
# Distribution of the per-review character counts.
sns.histplot(data=df, x='char_count')

In [None]:
# Summary statistics of word_count per 'recommended' label, transposed so each label is a column.
word_count_description = df.groupby('recommended')['word_count'].describe().T
# Suffix the label columns so they stay distinguishable from the char_count table.
word_count_description.columns = [f'{col}_word_count' for col in word_count_description.columns]
word_count_description

In [None]:
# Summary statistics of char_count per 'recommended' label, transposed so each label is a column.
char_count_description = df.groupby('recommended')['char_count'].describe().T
# Suffix the label columns so they stay distinguishable from the word_count table.
char_count_description.columns = [f'{col}_char_count' for col in char_count_description.columns]
char_count_description

### Most common words & phrases

In [None]:
# Flatten the per-review token lists into one flat list per recommendation label.
yes_tokens = df.loc[df['recommended'] == 'yes', 'tokens'].explode().tolist()
no_tokens = df.loc[df['recommended'] == 'no', 'tokens'].explode().tolist()

In [None]:
# NOTE: FreqDist is imported here rather than in the notebook's import cell.
from  nltk import FreqDist
# Tally token frequencies across the 'yes' reviews and keep the 25 most frequent entries.
freqdist = FreqDist(yes_tokens)
yes_common = freqdist.most_common(25)
yes_common

In [None]:
# Rebinds freqdist (previously built from yes_tokens) to the 'no' review tokens,
# then keeps the 25 most frequent entries.
freqdist = FreqDist(no_tokens)
no_common = freqdist.most_common(25)
no_common

In [None]:
# Keep only the token from each (token, count) pair returned by most_common.
common_yes_words = [token for token, _count in yes_common]
common_no_words = [token for token, _count in no_common]

In [None]:
# Print the top 'yes' words that do not appear among the top 'no' words.
for i in common_yes_words:
    if i not in common_no_words:
        print(i)

In [None]:
# Print the top 'no' words that do not appear among the top 'yes' words.
for i in common_no_words:
    if i not in common_yes_words:
        print(i)

Do we remove stop words or not?
Do we stem or lemmatize our text data, or leave the words as is?
Is basic tokenization enough, or do we need to support special edge cases through the use of regex?
Do we use the entire vocabulary, or just limit the model to a subset of the most frequently used words? If so, how many?
Do we engineer other features, such as bigrams, or POS tags, or Mutual Information Scores?
What sort of vectorization should we use in our model? Boolean Vectorization? Count Vectorization? TF-IDF? More advanced vectorization strategies such as Word2Vec?

In [None]:
# for count vectorization
def count_vectorize(tokenized_song):
    """Count how many times each token occurs in one tokenized document.

    :param tokenized_song: iterable of hashable tokens for a single document
    :return: plain dict mapping each distinct token to its number of occurrences,
             with keys in first-seen order
    """
    from collections import Counter  # local import keeps this cell self-contained

    # Counter tallies in a single pass and yields a reproducible first-seen key order
    # (the earlier set-based version followed arbitrary set iteration order).
    # Converting back keeps the plain-dict return type.
    return dict(Counter(tokenized_song))

In [None]:
#tf_idf
def inverse_document_frequency(list_of_token_songs):
    """Compute the inverse document frequency of every token in a corpus.

    :param list_of_token_songs: list of tokenized documents (each an iterable of tokens)
    :return: dict mapping each distinct token to
             log(number of documents / number of documents containing the token);
             an empty corpus yields an empty dict
    """
    num_docs = len(list_of_token_songs)

    # Document frequency: each token counts at most once per document, hence the set().
    # One pass over the corpus replaces scanning every document for every vocabulary word.
    docs_with_word = {}
    for song_tokens in list_of_token_songs:
        for word in set(song_tokens):
            docs_with_word[word] = docs_with_word.get(word, 0) + 1

    # Every key was seen in at least one document, so the divisor is never zero.
    return {word: np.log(num_docs / count) for word, count in docs_with_word.items()}

def tf_idf(list_of_token_songs):
    """Compute TF-IDF weights for every document in a corpus.

    :param list_of_token_songs: list of tokenized documents (each a list of tokens)
    :return: list with one dict per document, in input order; each dict maps every
             token of the corpus vocabulary to (raw count in the document * idf),
             or 0 when the token does not occur in that document
    """
    unique_words = {word for song_tokens in list_of_token_songs for word in song_tokens}

    idf = inverse_document_frequency(list_of_token_songs)

    tf_idf_list_of_dicts = []
    for song_tokens in list_of_token_songs:
        song_tf = count_vectorize(song_tokens)
        # song_tf has a key for every token of this document, so dict membership
        # replaces the linear `word in song_tokens` list scan per vocabulary word.
        doc_tf_idf = {
            word: song_tf[word] * idf[word] if word in song_tf else 0
            for word in unique_words
        }
        tf_idf_list_of_dicts.append(doc_tf_idf)

    return tf_idf_list_of_dicts

def main(filenames):
    """Read the given files from data/, tokenize them, and return their TF-IDF weights.

    NOTE(review): clean_song is not defined in any visible cell — confirm where it comes from.

    :param filenames: iterable of file names located under the data/ directory
    :return: the result of tf_idf over the tokenized documents
    """
    # Read every file first; each entry is the list of lines returned by readlines().
    lyrics_per_file = []
    for name in filenames:
        with open(f'data/{name}') as handle:
            lyrics_per_file.append(handle.readlines())

    # Clean and tokenize each document, preserving file order.
    token_lists = [word_tokenize(clean_song(lines)) for lines in lyrics_per_file]

    return tf_idf(token_lists)

# NOTE(review): filenames is not defined in any visible cell — confirm before running.
tf_idf_all_docs = main(filenames)

In [None]:
#Normalized Word Frequency
# NOTE(review): macbeth_stopped_freqdist is not defined in any visible cell; presumably a
# FreqDist/Counter-like object carried over from another notebook — confirm before running.
# Sum of all counts, used as the normalizing denominator.
total_word_count = sum(macbeth_stopped_freqdist.values())
# The 50 most frequent entries.
macbeth_top_50 = macbeth_stopped_freqdist.most_common(50)
print(f'{"Word":10} Normalized Frequency')
for word in macbeth_top_50:
    # Each entry is indexed as a pair: [0] is printed as the word, [1] is used as its count.
    normalized_frequency = word[1] / total_word_count
    print(f'{word[0]:10} {normalized_frequency:^20.4}')

In [None]:
#Bigrams
# Explicit imports replace the wildcard so it is clear where each name comes from.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# The bare name `nltk` is never imported in this notebook, so the class is used directly
# instead of through nltk.collocations (which raised NameError on a fresh kernel).
bigram_measures = BigramAssocMeasures()
# NOTE(review): `tokens` is not assigned at notebook level in any visible cell — confirm its source.
macbeth_finder = BigramCollocationFinder.from_words(tokens)
# Score every bigram by its raw frequency.
macbeth_scored = macbeth_finder.score_ngrams(bigram_measures.raw_freq)

In [None]:
#Pointwise Information
# NOTE(review): macbeth_words_stopped is not defined in any visible cell — confirm before running.
# Relies on BigramCollocationFinder and bigram_measures from the bigram cell.
macbeth_pmi_finder = BigramCollocationFinder.from_words(macbeth_words_stopped)
# Apply a frequency filter of 5 before scoring.
macbeth_pmi_finder.apply_freq_filter(5)
# Score the remaining bigrams with the PMI measure.
macbeth_pmi_scored = macbeth_pmi_finder.score_ngrams(bigram_measures.pmi)
macbeth_pmi_scored

In [None]:
# doc preparer
def doc_preparer(doc, stop_words=sw):
    '''
    Clean one raw document and return it as a single normalized string.

    :param doc: raw document text
    :param stop_words: collection of lowercase words to drop; defaults to the
            notebook-level ``sw``
    :return: a document string with words which have been
            lemmatized,
            parsed for stopwords,
            made lowercase,
            and stripped of punctuation and numbers.
    '''

    # Keep alphabetic runs, optionally followed by a curly-apostrophe suffix.
    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    lowered = [word.lower() for word in regex_token.tokenize(doc)]
    # Honor the stop_words argument; the global `sw` used to be consulted regardless of
    # what the caller passed. The leftover debugging print of the token list is gone too.
    kept = [word for word in lowered if word not in stop_words]
    # NOTE(review): pos_tag and get_wordnet_pos come from outside this cell; presumably
    # get_wordnet_pos maps a tagger tag to a WordNet POS constant — confirm.
    tagged = [(word, get_wordnet_pos(tag)) for word, tag in pos_tag(kept)]
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word, pos) for word, pos in tagged)

In [None]:
# Secondary train-test split to build our best model
# NOTE(review): token_docs, y_train, train_test_split and CountVectorizer are not defined or
# imported in any visible cell — confirm before running.
X_t, X_val, y_t, y_val = train_test_split(token_docs, y_train,
                                          test_size=0.25, random_state=42)
# Restrict the vocabulary to 5 features (max_features=5).
cv = CountVectorizer(max_features=5)

# Fit the vectorizer on the training split, then wrap the sparse result in a DataFrame.
X_t_vec = cv.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
# Label the columns with the learned vocabulary terms in sorted order.
X_t_vec.columns = sorted(cv.vocabulary_)
# Re-use y_t's index on the feature frame.
X_t_vec.set_index(y_t.index, inplace=True)

1. Load the Data
Use pandas and sklearn.datasets to load the train and test data into appropriate data structures. Then get a sense of what is in this dataset by visually inspecting some samples.

2. Perform Data Cleaning and Exploratory Data Analysis with nltk
Standardize the case of the data and use a tokenizer to convert the full posts into lists of individual words. Then compare the raw word frequency distributions of each category.

3. Build and Evaluate a Baseline Model with TfidfVectorizer and MultinomialNB
Ultimately all data must be in numeric form in order to be able to fit a scikit-learn model. So we'll use a tool from sklearn.feature_extraction.text to convert all data into a vectorized format.

Initially we'll keep all of the default parameters for both the vectorizer and the model, in order to develop a baseline score.

4. Iteratively Perform and Evaluate Preprocessing and Feature Engineering Techniques
Here you will investigate four techniques, to determine whether they should be part of our final modeling process:

Removing stopwords
Using custom tokens
Domain-specific feature engineering
Increasing max_features
5. Evaluate a Final Model on the Test Set
Once you have chosen a final modeling process, fit it on the full training data and evaluate it on the test data.

In [None]:
# NOTE: RegexpTokenizer is imported here rather than in the notebook's import cell.
from nltk.tokenize import RegexpTokenizer

# Matches runs of two or more word characters bounded by word boundaries.
basic_token_pattern = r"(?u)\b\w\w+\b"

tokenizer = RegexpTokenizer(basic_token_pattern)
# NOTE(review): politics_sample is not defined in any visible cell — confirm before running.
# Show the first 10 tokens.
tokenizer.tokenize(politics_sample)[:10]

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

def visualize_top_10(freq_dist, title):
    """Draw a bar chart of the ten most frequent tokens in a frequency distribution.

    :param freq_dist: object exposing ``most_common(n)`` that returns (token, count) pairs
    :param title: text for the plot title
    :return: None; the figure is created as a side effect
    """

    # Extract data for plotting
    top_10 = freq_dist.most_common(10)
    # Building the two columns separately also works for an empty distribution;
    # the earlier zip(*...)[0] lookup raised IndexError in that case.
    tokens = [token for token, _ in top_10]
    counts = [count for _, count in top_10]

    # Set up plot and plot data
    fig, ax = plt.subplots()
    ax.bar(tokens, counts)

    # Customize plot appearance
    ax.set_title(title)
    ax.set_ylabel("Count")
    # Counts are integers, so keep the y ticks on whole numbers.
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.tick_params(axis="x", rotation=90)
    
# NOTE(review): example_freq_dist is not defined in any visible cell — confirm before running.
visualize_top_10(example_freq_dist, "Top 10 Word Frequency for Example Tokens")

In [None]:
# NOTE(review): train_sample is not defined in any visible cell — confirm before running.
# Flatten the "text_tokenized" column with explode() and tally the token frequencies.
sample_freq_dist = FreqDist(train_sample["text_tokenized"].explode())
visualize_top_10(sample_freq_dist, "Top 10 Word Frequency for 5 Samples")

In [None]:

# Import the relevant vectorizer class
# NOTE: imported here rather than in the notebook's import cell.
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate a vectorizer with max_features=10
# (we are using the default token pattern)
tfidf = TfidfVectorizer(max_features=10)

# Fit the vectorizer on X_train["text"] and transform it
# NOTE(review): X_train is not defined in any visible cell — confirm before running.
X_train_vectorized = tfidf.fit_transform(X_train["text"])

# Visually inspect the vectorized data
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names_out())

In [None]:

# Import relevant class and function
# NOTE: imported here rather than in the notebook's import cell.
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Instantiate a MultinomialNB classifier
baseline_model = MultinomialNB()

# Evaluate the classifier on X_train_vectorized and y_train
# NOTE(review): y_train is not defined in any visible cell — confirm before running.
baseline_cv = cross_val_score(baseline_model, X_train_vectorized, y_train)
baseline_cv

In [None]:
def remove_stopwords(token_list):
    """
    Filter a list of tokens against the notebook-level stopwords_list.

    Returns a new list holding, in their original order, the tokens
    that are not present in stopwords_list.
    """
    kept_tokens = []
    for candidate in token_list:
        if candidate in stopwords_list:
            continue
        kept_tokens.append(candidate)
    return kept_tokens

In [None]:
# Instantiate the vectorizer
# NOTE(review): stopwords_list is not defined in any visible cell — confirm before running.
# This rebinds tfidf and X_train_vectorized from the earlier baseline cell.
tfidf = TfidfVectorizer(
    max_features=10,
    stop_words=stopwords_list
)

# Fit the vectorizer on X_train["text"] and transform it
X_train_vectorized = tfidf.fit_transform(X_train["text"])

# Visually inspect the vectorized data
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names_out())


In [None]:
# Compare the mean cross-validation scores of the two preprocessing variants.
# NOTE(review): stopwords_removed_cv is never assigned in any visible cell — confirm where it is computed.
print("Baseline:         ", baseline_cv.mean())
print("Stopwords removed:", stopwords_removed_cv.mean())
Baseline:          0.4013364135429863
Stopwords removed: 0.41756464714211183

In [None]:
# num sentence
# NOTE: sent_tokenize is imported here rather than in the notebook's import cell.
from nltk.tokenize import sent_tokenize

# Split the "text" of the row at position 100 of X_train into sentences.
sent_tokenize(X_train.iloc[100]["text"])
