In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from torchtext.data.utils import get_tokenizer
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw customer-review export and preview the first rows.
reviews = pd.read_csv("data/customer_review_text.csv")
reviews.head()


This data is extremely messy. Although it is complete, every field (Text, Sentiment, Source, Date Time, User ID, Location, and Confidence score) is packed into a single column. We will use regular expressions to split them into separate columns.

In [None]:
# Show the full, untruncated content of the first row's only column.
reviews.iloc[0,0]

In [None]:
# (rows, columns) of the raw frame.
reviews.shape

In [None]:
# Print the first five raw rows in full so the field layout can be inspected.
for i in range(5):
    raw_row = reviews.iloc[i, 0]
    print(raw_row)

First, store the name of the single raw column in a variable for ease of use later in the code.

In [None]:
# Name of the first (raw) column; every extraction below reads from it.
col = reviews.columns[0]
col

In [None]:
# One single-capture-group regex per output field. Each pattern is applied
# independently to the raw column; rows where a pattern does not match get NaN.
field_patterns = {
    "text": r'"([^"]+)"',
    "sentiment": r'(Positive|Negative)',
    "source": r'\S+, \S+, (\S+),',
    "date": r'(\d{4}-\d{2}-\d{2})',
    "time": r'(\d{2}:\d{2}:\d{2})',
    "user": r'\d{2}:\d{2}:\d{2}, (\S+)',
    "city": r'\d{2}:\d{2}:\d{2}, \S+, (.+?), 0\.\d{2}$',
    "confidence": r'(0\.\d{2}$)',
}

raw_column = reviews[col]
df_clean = pd.DataFrame({
    field: raw_column.str.extract(pattern, expand=False)
    for field, pattern in field_patterns.items()
})

# Combined "date time" string; NaN whenever either part is missing.
df_clean["dateTime"] = df_clean["date"] + " " + df_clean["time"]


In [None]:
# Preview the extracted columns.
df_clean.head()

In [None]:
# Inspect the dtypes produced by the regex extraction before casting.
df_clean.dtypes

### Changing DataTypes

In [None]:
# Distribution of the extracted sentiment labels (value_counts() omits NaN by default).
df_clean["sentiment"].value_counts()

In [None]:
# Work on a copy so df_clean keeps the original string labels.
reviews_df = df_clean.copy()

# Encode the label: Positive -> 1, Negative -> 0, anything else -> NaN.
is_positive = reviews_df["sentiment"] == "Positive"
is_negative = reviews_df["sentiment"] == "Negative"
reviews_df["sentiment"] = np.where(is_positive, 1, np.where(is_negative, 0, np.nan))
reviews_df

In [None]:
# Cast every extracted column to its target dtype in a single astype() call.
from datetime import datetime
categories = dict(
    text="string",
    sentiment="category",
    source="category",
    date="datetime64[ns]",
    time="string",
    dateTime="datetime64[ns]",
    user="category",
    city="category",
    confidence="float",
)

# astype() returns a new frame; reviews_df itself is left unchanged.
reviews_clean = reviews_df.astype(categories)
reviews_clean.head()


In [None]:
# Confirm the dtype casts were applied.
reviews_clean.dtypes

In [None]:
import missingno as msno

# Draw the missing-value matrix. The call itself renders the figure; wrapping
# it in print() only added the textual repr of the returned Axes to the output.
msno.matrix(reviews_clean)

# Number of missing values per column, shown as the cell's result.
reviews_clean.isna().sum()



Given the number of missing values per column, where they occur, and in particular how many are in `source`, we'll use `.dropna()` restricted (via `subset`) to every column except `source`.

In [None]:
# Drop rows that are missing any field other than `source`;
# rows missing only `source` are kept.
required_cols = reviews_clean.columns.difference(['source'])
reviews_clean = reviews_clean.dropna(subset=required_cols)

# Remaining missing values per column, followed by the new (rows, columns) shape.
print(reviews_clean.isna().sum(),
reviews_clean.shape)


In [None]:
# Convert to a plain list of [text, sentiment] pairs for the modelling code.
# NOTE(review): this rebinds `reviews`, which held the raw DataFrame loaded at
# the top of the notebook, so earlier cells cannot be re-run after this one.
reviews = reviews_clean[["text", "sentiment"]].to_numpy().tolist()
reviews[:10]

Modelling with a CNN followed by a fully connected linear layer, starting with an embedding layer.

In [None]:
# Download the NLTK stop-word list used by the preprocessing step.
import nltk
import ssl

# Workaround for certificate errors during nltk.download(): replace the default
# HTTPS context factory with the unverified one when this Python build has it.
# NOTE(review): this turns off certificate verification for every HTTPS context
# created through ssl._create_default_https_context for the rest of the kernel
# session, not only for this download -- confirm that this is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No unverified-context factory available: keep the default behaviour.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')

In [None]:
# Imports and shared resources for the text-preprocessing pipeline defined below.
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# torchtext's "basic_english" tokenizer and a Porter stemmer, shared by preprocess_sentences().
tokenizer = get_tokenizer("basic_english")
stemmer = PorterStemmer() 

class CustomerReview(Dataset):
    """Minimal map-style Dataset that wraps an indexable collection of samples."""

    def __init__(self, data):
        # Stored as-is: no copying and no conversion.
        self.data = data

    def __len__(self):
        """Return the number of wrapped samples."""
        samples = self.data
        return len(samples)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        samples = self.data
        return samples[idx]


def preprocess_sentences(sentences):
    """Normalise the text of each (sentence, label) pair.

    Every sentence is lower-cased, tokenized, stripped of stop words and
    stemmed; the surviving stems are joined with single spaces.

    Args:
        sentences: iterable of (sentence, label) pairs. The label is unpacked
            but not used or returned.

    Returns:
        list[str]: one processed string per input pair, in input order.
    """
    def _normalise(raw_text):
        kept = [tok for tok in tokenizer(raw_text.lower()) if tok not in stop_words]
        return ' '.join(stemmer.stem(tok) for tok in kept)

    return [_normalise(raw_text) for raw_text, _label in sentences]

def encode_sentences(sentences, vectorizer=None):
    """Encode preprocessed sentences as dense bag-of-words count vectors.

    Args:
        sentences: iterable of preprocessed strings.
        vectorizer: optional, already fitted CountVectorizer. When given it is
            reused through ``transform`` so the output columns follow its
            vocabulary. When ``None`` (the default) a new CountVectorizer is
            fitted on ``sentences``.

    Returns:
        tuple: ``(counts, vectorizer)`` where ``counts`` is the dense array from
        ``toarray()`` with one row per sentence.
    """
    if vectorizer is None:
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(sentences)
    else:
        X = vectorizer.transform(sentences)
    return X.toarray(), vectorizer


def text_processing_pipeline(sentences, vectorizer=None):
    """Turn (sentence, label) pairs into a DataLoader of (features, label) samples.

    Previously only the encoded sentences were placed in the dataset, so the
    labels were lost and unpacking a batch as ``inputs, labels`` split two
    feature rows of the same batch instead.

    Args:
        sentences: iterable of (sentence, label) pairs. One-shot iterables such
            as ``zip`` are accepted. Labels must be convertible with ``int()``.
        vectorizer: optional fitted CountVectorizer to reuse (for example the
            one fitted on the training data). ``None`` fits a new one.

    Returns:
        tuple: ``(dataloader, vectorizer)``. Each batch of the dataloader is a
        ``[features, labels]`` pair.
    """
    # Materialise first: the pairs are read twice (labels, then text).
    sentences = list(sentences)
    # Integer class indices, the target format CrossEntropyLoss expects.
    labels = [int(label) for _, label in sentences]

    processed_sentences = preprocess_sentences(sentences)
    encoded_sentences, vectorizer = encode_sentences(processed_sentences, vectorizer)

    # One (feature row, label) sample per review.
    dataset = CustomerReview(list(zip(encoded_sentences, labels)))
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    return dataloader, vectorizer


### Creating a CNN Network for the reviews

In [None]:
# dict() keeps one label per distinct review text: if the same text occurs more
# than once, the label of the last occurrence is the one that is kept.
reviews_dict = dict(reviews)
review_texts = list(reviews_dict.keys())
review_labels = list(reviews_dict.values())

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    review_texts, review_labels, test_size=0.3, random_state=123
)


In [None]:
# Materialise the (text, label) pairs as lists. A bare zip() is a one-shot
# iterator, so after the pipeline consumed it train_data / test_data were empty.
train_data = list(zip(X_train, y_train))
test_data = list(zip(X_test, y_test))

# NOTE(review): each call below fits its own CountVectorizer, so the train and
# test feature columns come from two different vocabularies. Confirm whether
# the test set should instead be encoded with the training vocabulary.
train_dataloader, train_vectorizer = text_processing_pipeline(train_data)
test_dataloader, test_vectorizer = text_processing_pipeline(test_data)


In [None]:
# Number of held-out test labels.
print(len(y_test))

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F


class TextClassificationCNN(nn.Module):
    """Embedding -> 1-D convolution -> mean pooling -> linear classifier (2 classes).

    Args:
        vocab_size: number of rows in the embedding table; every input index
            must be smaller than this.
        embed_dim: embedding width, also used as the number of convolution
            channels and as the input width of the final linear layer.

    NOTE(review): ``forward`` expects integer token indices. The notebook
    encodes text with CountVectorizer, which yields per-word counts rather than
    index sequences -- confirm that the encoding matches this model.
    """

    def __init__(self, vocab_size, embed_dim):
        super(TextClassificationCNN, self).__init__()
        # Initialize the embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # After the permute in forward() the embedding dimension is the channel
        # axis, so the convolution must accept embed_dim input channels (not 1).
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(embed_dim, 2)

    def forward(self, text):
        """Return class logits of shape (batch, 2).

        Args:
            text: integer tensor of indices, shape (batch, seq_len). A single
                1-D sequence is treated as a batch of one.
        """
        if text.dim() == 1:
            text = text.unsqueeze(0)
        # (batch, seq_len) -> (batch, seq_len, embed_dim) -> (batch, embed_dim, seq_len)
        embedded = self.embedding(text).permute(0, 2, 1)
        # Pass the embedded text through the convolutional layer and apply a ReLU
        conved = F.relu(self.conv(embedded))
        # Average over the sequence axis to get one vector per sample.
        conved = conved.mean(dim=2)
        return self.fc(conved)

    def reset_parameters(self):
        """Re-initialise every trainable parameter uniformly in [-0.1, 0.1]."""
        for param in self.parameters():
            if param.requires_grad:
                param.data.uniform_(-0.1, 0.1)




In [None]:
import torch.optim as optim  
import torch.nn.functional as F

# Loss, model and optimizer shared by the training and evaluation cells.
criterion = nn.CrossEntropyLoss()
model = TextClassificationCNN(vocab_size=len(train_vectorizer.get_feature_names_out()),embed_dim=50)
optimizer = optim.SGD(model.parameters(), lr=0.01)
epochs = 20
def training_loop(epochs):
    """Train the global ``model`` on ``train_dataloader`` for ``epochs`` passes.

    Uses the module-level ``criterion`` and ``optimizer``. Training continues
    from the model's current weights; nothing is reset between calls.

    Args:
        epochs: number of full passes over the training data.
    """
    for epoch in range(epochs):
        model.train()
        for inputs, labels in train_dataloader:
            # NOTE(review): clamping forces every label into [0, 1] and would
            # hide out-of-range labels -- confirm this is still needed.
            labels = torch.clamp(labels, 0, 1)
            # Flatten to 1-D integer class indices. squeeze() turned a final
            # batch of size 1 into a 0-d tensor, which the loss rejects.
            targets = labels.reshape(-1).long()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()



In [None]:
# Testing loop
accuracy_dict={}
def testing_reviews():
    model.eval()
    correct_predictions = 0
    total_samples = 0
    i =0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == labels.squeeze()).sum().item()
                total_samples += labels.size(0)
                accuracy = correct_predictions / total_samples
                i+=1
                if i == 11:
                    break
                return accuracy

In [None]:
def _accuracy_on(dataloader):
    """Fraction of samples in ``dataloader`` that the global ``model`` classifies correctly."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            labels = labels.reshape(-1)
            _, predicted = torch.max(model(inputs), 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    return correct / total if total else 0.0


# Accuracy after 10, 11, ..., 20 cumulative training epochs.
# Two defects of the previous version are fixed here:
#   * train accuracy is now measured on train_dataloader (it used to call
#     testing_reviews(), which reads the test data), and
#   * only the missing epochs are trained at each step, so the dictionary key
#     equals the number of epochs the model has actually been trained for
#     (calling training_loop(10), training_loop(11), ... trained cumulatively).
test_acc={}
train_acc={}
epochs_done = 0
for i in np.arange(10,21,1):
    training_loop(int(i) - epochs_done)
    epochs_done = int(i)
    test_acc[i] = _accuracy_on(test_dataloader)
    train_acc[i] = _accuracy_on(train_dataloader)

In [None]:
import matplotlib.pyplot as plt

# One entry per curve: the accuracy history and the line styling for it.
accuracy_curves = (
    (test_acc, dict(marker='o', markerfacecolor='darkgreen', label='Test Accuracy')),
    (train_acc, dict(marker='v', c='red', markerfacecolor='black', label='Train Accuracy')),
)
for history, line_style in accuracy_curves:
    plt.plot(history.keys(), history.values(), **line_style)

# Axis labelling and legend.
plt.xlabel("Epoch")
plt.xticks(rotation=45)
plt.ylabel('Accuracy')
plt.title("Epoch vs. Accuracy for Model")
plt.legend()
plt.show()



In [None]:
# Epoch count with the highest test accuracy. The lookup previously went
# through accuracy_dict, which is never filled, so it raised KeyError.
max_epoch = max(test_acc, key=test_acc.get)
max_accuracy = test_acc[max_epoch]

# The keys of test_acc are already epoch counts, so no +1 adjustment is needed.
print(f"Maximum accuracy ({max_accuracy * 100:.2f}%) achieved at epoch {max_epoch}")
