# Data Cleaning & Feature Engineering Notebook

## Environment Set-up

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd "/content/drive/MyDrive/Dsa4263/data"

/content/drive/MyDrive/Dsa4263/data


In [3]:
# loading packages
import pandas as pd
import numpy as np
import re

# nltk
!pip install nltk
import nltk
nltk.download('punkt')

# stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
english_stop_words = stopwords.words('english')

# tokenizing
from nltk.tokenize import word_tokenize

# normalizing
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# feature engineering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
df = pd.read_csv('reviews.csv', encoding= 'unicode_escape')
df.head()

Unnamed: 0,Sentiment,Time,Text
0,positive,18/6/21,This is a very healthy dog food. Good for thei...
1,positive,7/7/21,I've been very pleased with the Natural Balanc...
2,positive,18/6/21,"Before I was educated about feline nutrition, ..."
3,positive,7/7/21,"My holistic vet recommended this, along with a..."
4,positive,1/7/21,I bought this coffee because its much cheaper ...


## Data Cleaning

In [5]:
# remove underscore
def remove_underscore(text):
  return re.sub('_', ' ', text)
  
# convert to lower case
def to_lowercase(text):
  return text.lower()

# remove html tags
def remove_html(text):
  return re.sub(r'<[^>]+>', ' ', text)

# remove punctuation
def remove_punc(text):
  res = re.sub(r'\'', '', text) # apostrophe removed without splitting word
  res = re.sub(r'[^\w\s]', ' ', res)
  return res

# remove numbers
def remove_num(text):
  return re.sub(r'\w*\d\w*', ' ', text)

# remove extra spaces
def remove_whitespace(text):
  return re.sub(r' {2,}', ' ', text).strip()

# remove stopwords
def remove_stopwords(text):
  text_tokens = word_tokenize(text)
  tokens_without_sw = [word for word in text_tokens if not word in english_stop_words]
  return ' '.join(tokens_without_sw)

# stemming
def stem_text(text):
  tokens = [stemmer.stem(word) for word in text.split()]
  result = ' '.join(tokens)
  return result

def get_cleantext(text, stemming=False):
  """
  Applies all text cleaning steps on the input String. Returns clean text String.
  Stemming is skipped by default, set stemming=True to stem text
  """
  removed_extra_chars =  remove_underscore(remove_whitespace(remove_num(remove_punc(remove_html(to_lowercase(text))))))
  res = remove_stopwords(removed_extra_chars)
  if stemming:
    res = stem_text(res)
  return res

In [6]:
df['clean_text'] = df['Text'].apply(lambda x: get_cleantext(x))
#df['clean_text']  = df['clean_text'].apply(lambda x: x.split())
df['Sentiment_num'] = df.Sentiment.map({"positive": 1, "negative": 0})
df['stem_clean_text'] = df['Text'].apply(lambda x: get_cleantext(x, stemming=True))

In [7]:
df

Unnamed: 0,Sentiment,Time,Text,clean_text,Sentiment_num,stem_clean_text
0,positive,18/6/21,This is a very healthy dog food. Good for thei...,healthy dog food good digestion also good smal...,1,healthi dog food good digest also good small p...
1,positive,7/7/21,I've been very pleased with the Natural Balanc...,ive pleased natural balance dog food dogs issu...,1,ive pleas natur balanc dog food dog issu dog f...
2,positive,18/6/21,"Before I was educated about feline nutrition, ...",educated feline nutrition allowed cats become ...,1,educ felin nutrit allow cat becom addict dri c...
3,positive,7/7/21,"My holistic vet recommended this, along with a...",holistic vet recommended along brands tried ca...,1,holist vet recommend along brand tri cat prefe...
4,positive,1/7/21,I bought this coffee because its much cheaper ...,bought coffee much cheaper ganocafe organic re...,1,bought coffe much cheaper ganocaf organ reishi...
...,...,...,...,...,...,...
5439,negative,26/2/21,"This is an okay gift box, only if you like med...",okay gift box like mediocre cheese summer saus...,0,okay gift box like mediocr chees summer sausag...
5440,negative,18/12/19,It looks llike I just walked into a raw deal. ...,looks llike walked raw deal item intolerably s...,0,look llike walk raw deal item intoler stale re...
5441,negative,19/1/20,Thank god that i tasted the metal before i swa...,thank god tasted metal swallowed dont even get...,0,thank god tast metal swallow dont even get got...
5442,negative,13/9/20,This product was very good when I began buying...,product good began buying lately terrible tast...,0,product good began buy late terribl tast that ...


## Feature Engineering

### BoW


In [8]:
def bow(X, ngram_range=(1, 1)):
    """
    ngram_range is set to (1,1) in default to extract only individual words (unigrams)
    can change to (2,2) for bigrams or (1,2) for both ungrams and bigrams
    """
    # Create an instance of the CountVectorizer class
    vectorizer = CountVectorizer(ngram_range=ngram_range)

    # Fit the vectorizer on the text data and transform it into a matrix
    bow_matrix = vectorizer.fit_transform(X)

    X = bow_matrix.toarray()

    return X


### TF-IDF


In [9]:
def tf_idf(X):

    # Create an instance of the TfidfVectorizer class, can modify its parameters such as ngram
    # https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    vectorizer = TfidfVectorizer()

    # Fit the vectorizer on the text data and transform it into a matrix
    matrix = vectorizer.fit_transform(X)
    X = matrix.toarray()

    return X

### word2vec

In [10]:
# use pre-trained word2vec model
import gensim.downloader as api
#wv = api.load('word2vec-google-news-300')
#wv.save('/content/drive/MyDrive/Dsa4263/vectors.kv')
from gensim.models import KeyedVectors # shared in drive or could uncomment above lines
wv = KeyedVectors.load('/content/drive/MyDrive/Dsa4263/vectors.kv')

In [40]:
def get_mean_vector(text, wv):
  """
  numerical representation for the sentence = mean(words in the sentence)
  """
  vector_size = wv.vector_size
  wv_res = np.zeros(vector_size)
  ctr = 0
  for w in text:
    if w in wv:
      ctr += 1
      wv_res += wv[w]
  if ctr == 0:
    return wv_res
  else:
    wv_res = wv_res/ctr
    return wv_res

def word2vec(X):
  x_split = list(map(lambda x: x.split(),X))
  full_list = list(map(lambda text: get_mean_vector(text,wv), x_split))
  X = np.array(full_list)
  


  return X

## XGBoost


In [41]:
from sklearn.model_selection import train_test_split
import xgboost

model = xgboost.XGBRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.5,
    colsample_bytree=0.5,
    random_state=42
)


In [42]:
model.fit(X, y)

In [58]:
X = df['clean_text'].to_list()
y = df['Sentiment_num'].values
X = word2vec(X)

X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size = 0.2, stratify = y, random_state = 4263
)

In [71]:

# create a sample dataframe
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

# extract column 'B' and convert to numpy array
col_B = df['B'].values

type(col_B)

numpy.ndarray

In [60]:
model.fit(X_train, y_train)

In [67]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_prob = model.predict(X_test)
threshold = 0.5  # set the threshold value
y_pred = (y_pred_prob > threshold).astype(int) 

In [72]:
type(y_pred)

numpy.ndarray

In [69]:
accuracy_score(y_test, y_pred)

0.8347107438016529

In [None]:
# Hyperparameters for optimization
params = {
    "learning_rate":[0.001, 0.01, 0.1, 1],
    "max_depth":[3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight" :[1, 3, 5, 7],
    "gamma":[0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree":[0.3, 0,4, 0.5, 0.7]

}

In [None]:
# Hyperparameter optimization using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost

In [None]:
classifier = xgboost.XGBClassifier()



In [None]:
random_search = RandomizedSearchCV(classifier, param_distributions = params, n_iter = 10 ,scoring = "accuracy", verbose = 3)
random_search.fit(X,y)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END colsample_bytree=0, gamma=0.2, learning_rate=1, max_depth=3, min_child_weight=1;, score=0.728 total time=  28.2s
[CV 2/5] END colsample_bytree=0, gamma=0.2, learning_rate=1, max_depth=3, min_child_weight=1;, score=0.734 total time=  18.7s
[CV 3/5] END colsample_bytree=0, gamma=0.2, learning_rate=1, max_depth=3, min_child_weight=1;, score=0.736 total time=  21.9s
[CV 4/5] END colsample_bytree=0, gamma=0.2, learning_rate=1, max_depth=3, min_child_weight=1;, score=0.732 total time=  23.5s
[CV 5/5] END colsample_bytree=0, gamma=0.2, learning_rate=1, max_depth=3, min_child_weight=1;, score=0.739 total time=  19.4s
[CV 1/5] END colsample_bytree=4, gamma=0.0, learning_rate=0.1, max_depth=6, min_child_weight=3;, score=nan total time=   1.3s
[CV 2/5] END colsample_bytree=4, gamma=0.0, learning_rate=0.1, max_depth=6, min_child_weight=3;, score=nan total time=   1.3s
[CV 3/5] END colsample_bytree=4, gamma=0.0, learning_rate

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.9/dist-packages/xgboost/sklearn.py", line 1490, in fit
    self._Booster = train(
  File "/usr/local/lib/python3.9/dist-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.9/dist-packages/xgboost/training.py", line 185, in t

In [None]:
random_search.best_estimator_

In [None]:
random_search.best_params_

{'min_child_weight': 1,
 'max_depth': 15,
 'learning_rate': 1,
 'gamma': 0.0,
 'colsample_bytree': 0.7}