## Data Preparation
***
To prepare my data for modeling I:
- Reorganized the image files.
- Imported my training data, standardizing aspect ratio.
- Applied scaling to the training data for normalization.
- Imported validation data to be used to test the model's performance.
- Created a specialized dataset specifically formatted for use with the ResNet50 classifier.

In [1]:
import sys
import os

# Path to the directory where your Jupyter Notebook is located
notebook_dir = '/Users/ronlodetti/Documents/Flatiron/capstone/airline_sentiment_analysis/hidden'

# Get the parent directory (where your `src` directory is located)
parent_dir = os.path.dirname(notebook_dir)

# Add the parent directory to sys.path
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Now you can import your module
from src import data_understanding as du
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from src import data_prep as dp

In [4]:
# Read the raw reviews, then keep only the two text fields and the label column.
df = pd.read_csv('../data/Airline_review.csv')
df = df[['Review_Title', 'Review', 'Recommended']]

# Model input: review title and review body joined by a single space.
X = df['Review_Title'] + ' ' + df['Review']

# Binary target: 'yes' -> 1, 'no' -> 0.
y = df['Recommended'].map({'yes': 1, 'no': 0})

# Hold out 20% of the rows for testing, stratified on the target, with a fixed
# seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Bag of Words Prep

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.pipeline import Pipeline
def _sparse_to_dense(x):
    """Return the sparse matrix `x` as a dense 2-D NumPy array."""
    return x.toarray()


# Step 1: project text cleaner, configured with lemmatize=True and stop_words=None.
bow_text_cleaner = dp.TextCleanerTransformer(stop_words=None, lemmatize=True)

# Step 2: TF-IDF over unigrams and bigrams; terms seen in fewer than 2 documents
# or in more than 95% of documents are dropped.
tf_idf = TfidfVectorizer(
    decode_error='replace',
    strip_accents='unicode',
    stop_words=None,
    ngram_range=(1, 2),
    max_df=0.95,
    min_df=2,
)

# Step 3: keep the 20,000 highest-scoring features (default scoring function).
k_best = SelectKBest(k=20000)

# Step 4: densify the selected sparse features.
to_dense_transformer = FunctionTransformer(_sparse_to_dense, accept_sparse=True)

bow_steps = [
    ('text_cleaner', bow_text_cleaner),
    ('tf_idf', tf_idf),
    ('feature_selection', k_best),
    ('to_dense', to_dense_transformer),
]
bow_pipe = Pipeline(bow_steps)

# Fit every step on the training split only, then apply the fitted pipeline to
# the test split.
X_train_clean_bow = bow_pipe.fit_transform(X_train, y_train)
X_test_clean_bow = bow_pipe.transform(X_test)

In [17]:
# Sanity check on the bag-of-words training matrix: one row per training
# sample, one column per feature kept by SelectKBest (k=20000).
X_train_clean_bow.shape

(18536, 20000)

In [12]:
from tensorflow.keras.layers import TextVectorization
# Cleaner for the sequence input: same project transformer as the bag-of-words
# pipeline, but with lemmatize=False.
sequence_text_cleaner = dp.TextCleanerTransformer(lemmatize=False, stop_words=None)

# Vocabulary cap and fixed sequence length passed to the vectorization layer.
MAX_TOKENS = 20000
SEQUENCE_LENGTH = 200

# standardize=None: the layer applies no standardization of its own to the text
# it receives; output_mode='int' yields integer token indices.
text_vectorization = TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=SEQUENCE_LENGTH,
    output_mode='int',
    standardize=None,
)

2024-04-07 15:18:08.290708: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
# Clean the raw review text for the sequence model: fit_transform on the
# training split, then transform (no refit) on the test split.
# NOTE(review): TextCleanerTransformer comes from src.data_prep; whether its
# fit step learns any state, and what type it returns, is not visible here.
X_train_clean = sequence_text_cleaner.fit_transform(X_train,y_train)
X_test_clean = sequence_text_cleaner.transform(X_test)

In [14]:
# Adapt the vectorization layer on the cleaned training text only; the test
# split is never passed to adapt().
text_vectorization.adapt(X_train_clean)
# Run both splits through the adapted layer (configured above with
# output_mode='int' and output_sequence_length=200).
X_train_clean_seq = text_vectorization(X_train_clean)
X_test_clean_seq = text_vectorization(X_test_clean)