In [1]:
import pandas as pd
nRowsRead = 1000 # specify 'None' if want to read whole file
# Womens Clothing E-Commerce Reviews.csv may have more rows in reality, but we are only loading/previewing the first 1000 rows
df = pd.read_csv(r'C:\Users\sheet\Downloads\Womens Clothing E-Commerce Reviews\Womens Clothing E-Commerce Reviews.csv', delimiter=',', nrows = nRowsRead)
df.dataframeName = 'Womens Clothing E-Commerce Reviews.csv'
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 1000 rows and 11 columns


In [2]:
df.drop(columns=['Unnamed: 0'],inplace=True)
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


# Handling Missing Values: 
Check for missing values Decide whether to drop or fill missing values based on the dataset size and context.

In [3]:
df.isnull().sum()

Clothing ID                  0
Age                          0
Title                      190
Review Text                 42
Rating                       0
Recommended IND              0
Positive Feedback Count      0
Division Name                0
Department Name              0
Class Name                   0
dtype: int64

In [4]:
#filled the missing title with review text
df['Title'].fillna(df['Review Text'], inplace=True)
df.isnull().sum()


Clothing ID                 0
Age                         0
Title                      42
Review Text                42
Rating                      0
Recommended IND             0
Positive Feedback Count     0
Division Name               0
Department Name             0
Class Name                  0
dtype: int64

In [5]:
#further dropping the 42 rows, as it comprises only 4% of the total data
df = df.dropna(subset=['Review Text'])
df.isnull().sum()

Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
dtype: int64

## Text Cleaning:

1) **Lowercasing**: Convert all text to lowercase to ensure uniformity.

2) **Special Characters and Numbers**: Remove any special characters, numbers, and symbols from the text.

3) **Tokenization**: Break down the text into individual words or tokens.

4) **Stop Words**: Remove common stop words (e.g., 'the', 'and', 'is') as they usually don't contribute much to sentiment.

5) *Lemmatization*: Reduce words to their base or root form. This helps in consolidating words with similar meanings.

Depending on your analysis goals, you might choose to drop columns that are not relevant for sentiment analysis. Check for the difference between lemmetization and stemming

In [6]:
# !pip install nltk

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Function to preprocess text
def preprocess_text(text):
    # Check for NaN values
    if pd.isnull(text):
        return ''
    
    # Lowercasing
    text = text.lower()
    
    # Removing special characters and numbers
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

# Apply preprocessing to 'Review Text'
df['Review Text'] = df['Review Text'].apply(preprocess_text)

# Apply preprocessing to 'Title'
df['Title'] = df['Title'].apply(preprocess_text)
df

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,absolutely wonderful silky sexy comfortable,absolutely wonderful silky sexy comfortable,4,1,0,Initmates,Intimate,Intimates
1,1080,34,love dress sooo pretty happened find store im ...,love dress sooo pretty happened find store im ...,5,1,4,General,Dresses,Dresses
2,1077,60,major design flaw,high hope dress really wanted work initially o...,3,0,0,General,Dresses,Dresses
3,1049,50,favorite buy,love love love jumpsuit fun flirty fabulous ev...,5,1,0,General Petite,Bottoms,Pants
4,847,47,flattering shirt,shirt flattering due adjustable front tie perf...,5,1,6,General,Tops,Blouses
...,...,...,...,...,...,...,...,...,...,...
995,1047,70,received christmas gift daughter wonderful war...,received christmas gift daughter wonderful war...,5,1,3,General,Bottoms,Pants
996,936,37,gorgeous,every year around time beloved retailer come w...,5,1,8,General,Tops,Sweaters
997,936,36,gorgeous,tried sweater store immediately ordered size g...,5,1,1,General,Tops,Sweaters
998,854,29,soft,super soft comfortable run little large cozy,5,1,0,General Petite,Tops,Knits


In [8]:
df.describe()

Unnamed: 0,Clothing ID,Age,Rating,Recommended IND,Positive Feedback Count
count,958.0,958.0,958.0,958.0,958.0
mean,909.411273,43.653445,4.179541,0.814196,2.728601
std,199.349768,12.422842,1.082355,0.389151,5.796933
min,2.0,20.0,1.0,0.0,0.0
25%,850.0,35.0,4.0,1.0,0.0
50%,907.0,41.0,5.0,1.0,1.0
75%,1060.0,52.0,5.0,1.0,3.0
max,1196.0,93.0,5.0,1.0,84.0


We see that there are no outliers in the numerical columns

In [9]:
# categorize ratings as positive, negative, or neutral
def categorize_rating(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

# Apply the categorization to create a new 'Sentiment' column
df['Sentiment'] = df['Rating'].apply(categorize_rating)
df

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Sentiment
0,767,33,absolutely wonderful silky sexy comfortable,absolutely wonderful silky sexy comfortable,4,1,0,Initmates,Intimate,Intimates,positive
1,1080,34,love dress sooo pretty happened find store im ...,love dress sooo pretty happened find store im ...,5,1,4,General,Dresses,Dresses,positive
2,1077,60,major design flaw,high hope dress really wanted work initially o...,3,0,0,General,Dresses,Dresses,neutral
3,1049,50,favorite buy,love love love jumpsuit fun flirty fabulous ev...,5,1,0,General Petite,Bottoms,Pants,positive
4,847,47,flattering shirt,shirt flattering due adjustable front tie perf...,5,1,6,General,Tops,Blouses,positive
...,...,...,...,...,...,...,...,...,...,...,...
995,1047,70,received christmas gift daughter wonderful war...,received christmas gift daughter wonderful war...,5,1,3,General,Bottoms,Pants,positive
996,936,37,gorgeous,every year around time beloved retailer come w...,5,1,8,General,Tops,Sweaters,positive
997,936,36,gorgeous,tried sweater store immediately ordered size g...,5,1,1,General,Tops,Sweaters,positive
998,854,29,soft,super soft comfortable run little large cozy,5,1,0,General Petite,Tops,Knits,positive


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['Review Text'], df['Sentiment'], test_size=0.3, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=650)  # You can adjust max_features based on your dataset size
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_vectorized, y_train)

# Predictions
svm_predictions = svm_model.predict(X_test_vectorized)

# Evaluation
print("Support Vector Machines (SVM) Results:")
print("Accuracy:", accuracy_score(y_test, svm_predictions))
print(classification_report(y_test, svm_predictions))


Support Vector Machines (SVM) Results:
Accuracy: 0.7881944444444444
              precision    recall  f1-score   support

    negative       0.90      0.31      0.46        29
     neutral       0.17      0.03      0.04        39
    positive       0.80      0.99      0.88       220

    accuracy                           0.79       288
   macro avg       0.62      0.44      0.46       288
weighted avg       0.72      0.79      0.73       288



In [73]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack
# Selecting features
selected_features = ['Age', 'Review Text', 'Rating','Title', 'Positive Feedback Count','Recommended IND', 'Division Name', 'Department Name', 'Class Name']
X = df[selected_features]
y = df['Sentiment']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Vectorize text data
vectorizer_text = TfidfVectorizer(max_features=650)
X_train_text_vectorized = vectorizer_text.fit_transform(X_train['Review Text'])
X_test_text_vectorized = vectorizer_text.transform(X_test['Review Text'])

# Vectorize title data
vectorizer_title = TfidfVectorizer(max_features=200)  # You can adjust max_features based on your dataset size
X_train_title_vectorized = vectorizer_title.fit_transform(X_train['Title'])
X_test_title_vectorized = vectorizer_title.transform(X_test['Title'])

# Combine text features with other selected features
X_train_vectorized = hstack([X_train_text_vectorized, X_train_title_vectorized, X_train[['Recommended IND','Positive Feedback Count']]])
X_test_vectorized = hstack([X_test_text_vectorized, X_test_title_vectorized, X_test[['Recommended IND','Positive Feedback Count']]])

# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_vectorized, y_train)

# Predictions
svm_predictions = svm_model.predict(X_test_vectorized)

# Evaluation
print("Support Vector Machines (SVM) Results:")
print("Accuracy:", accuracy_score(y_test, svm_predictions))
print(classification_report(y_test, svm_predictions))


Support Vector Machines (SVM) Results:
Accuracy: 0.8611111111111112
              precision    recall  f1-score   support

    negative       0.57      0.69      0.62        29
     neutral       0.50      0.23      0.32        39
    positive       0.93      1.00      0.96       220

    accuracy                           0.86       288
   macro avg       0.67      0.64      0.63       288
weighted avg       0.84      0.86      0.84       288



In [74]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# CatBoost
catboost_model = CatBoostClassifier(iterations=400, depth=5, learning_rate=0.1, loss_function='MultiClass')
catboost_model.fit(X_train_vectorized, y_train)

# Predictions
catboost_predictions = catboost_model.predict(X_test_vectorized)

# Evaluate the CatBoost model
print("\nCatBoost Results:")
print("Accuracy:", accuracy_score(y_test, catboost_predictions))
print(classification_report(y_test, catboost_predictions))


0:	learn: 0.9712611	total: 21.2ms	remaining: 8.44s
1:	learn: 0.8747219	total: 38.7ms	remaining: 7.7s
2:	learn: 0.7972876	total: 54.5ms	remaining: 7.21s
3:	learn: 0.7344461	total: 70.7ms	remaining: 7s
4:	learn: 0.6804152	total: 86.3ms	remaining: 6.82s
5:	learn: 0.6342612	total: 102ms	remaining: 6.69s
6:	learn: 0.5964269	total: 117ms	remaining: 6.59s
7:	learn: 0.5630736	total: 133ms	remaining: 6.53s
8:	learn: 0.5354551	total: 149ms	remaining: 6.46s
9:	learn: 0.5114977	total: 164ms	remaining: 6.39s
10:	learn: 0.4902582	total: 179ms	remaining: 6.34s
11:	learn: 0.4710860	total: 195ms	remaining: 6.3s
12:	learn: 0.4603218	total: 211ms	remaining: 6.27s
13:	learn: 0.4452511	total: 227ms	remaining: 6.26s
14:	learn: 0.4378049	total: 242ms	remaining: 6.22s
15:	learn: 0.4261011	total: 258ms	remaining: 6.19s
16:	learn: 0.4137371	total: 273ms	remaining: 6.16s
17:	learn: 0.4087807	total: 289ms	remaining: 6.13s
18:	learn: 0.3985743	total: 303ms	remaining: 6.08s
19:	learn: 0.3911176	total: 318ms	remaini

170:	learn: 0.2063347	total: 2.62s	remaining: 3.51s
171:	learn: 0.2051407	total: 2.64s	remaining: 3.5s
172:	learn: 0.2045984	total: 2.65s	remaining: 3.48s
173:	learn: 0.2040380	total: 2.67s	remaining: 3.47s
174:	learn: 0.2034715	total: 2.68s	remaining: 3.45s
175:	learn: 0.2031690	total: 2.7s	remaining: 3.43s
176:	learn: 0.2027309	total: 2.71s	remaining: 3.42s
177:	learn: 0.2022708	total: 2.73s	remaining: 3.4s
178:	learn: 0.2018716	total: 2.74s	remaining: 3.38s
179:	learn: 0.2012926	total: 2.75s	remaining: 3.37s
180:	learn: 0.2001498	total: 2.77s	remaining: 3.35s
181:	learn: 0.1997724	total: 2.78s	remaining: 3.33s
182:	learn: 0.1992852	total: 2.8s	remaining: 3.32s
183:	learn: 0.1989709	total: 2.81s	remaining: 3.3s
184:	learn: 0.1987268	total: 2.83s	remaining: 3.29s
185:	learn: 0.1983392	total: 2.85s	remaining: 3.27s
186:	learn: 0.1980714	total: 2.86s	remaining: 3.26s
187:	learn: 0.1977649	total: 2.88s	remaining: 3.24s
188:	learn: 0.1972624	total: 2.89s	remaining: 3.23s
189:	learn: 0.196

329:	learn: 0.1410034	total: 5.03s	remaining: 1.07s
330:	learn: 0.1407420	total: 5.04s	remaining: 1.05s
331:	learn: 0.1405444	total: 5.06s	remaining: 1.04s
332:	learn: 0.1403682	total: 5.07s	remaining: 1.02s
333:	learn: 0.1400961	total: 5.09s	remaining: 1s
334:	learn: 0.1397194	total: 5.1s	remaining: 990ms
335:	learn: 0.1395319	total: 5.12s	remaining: 975ms
336:	learn: 0.1388751	total: 5.13s	remaining: 959ms
337:	learn: 0.1386390	total: 5.15s	remaining: 944ms
338:	learn: 0.1384799	total: 5.16s	remaining: 929ms
339:	learn: 0.1381193	total: 5.17s	remaining: 913ms
340:	learn: 0.1379233	total: 5.19s	remaining: 898ms
341:	learn: 0.1376609	total: 5.2s	remaining: 883ms
342:	learn: 0.1374825	total: 5.22s	remaining: 867ms
343:	learn: 0.1370295	total: 5.23s	remaining: 852ms
344:	learn: 0.1367870	total: 5.25s	remaining: 837ms
345:	learn: 0.1364118	total: 5.26s	remaining: 822ms
346:	learn: 0.1362739	total: 5.28s	remaining: 806ms
347:	learn: 0.1361275	total: 5.29s	remaining: 791ms
348:	learn: 0.135

In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vectorized, y_train)

# Predictions
rf_predictions = rf_model.predict(X_test_vectorized)

# Evaluation
print("Random Forest Results:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print(classification_report(y_test, rf_predictions))


Random Forest Results:
Accuracy: 0.875
              precision    recall  f1-score   support

    negative       0.65      0.76      0.70        29
     neutral       0.60      0.31      0.41        39
    positive       0.93      0.99      0.96       220

    accuracy                           0.88       288
   macro avg       0.73      0.69      0.69       288
weighted avg       0.86      0.88      0.86       288



In [12]:
# !pip install transformers
# !pip install torch


In [13]:
# import torch
# from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from transformers import BertTokenizer, BertForSequenceClassification, AdamW
# from sklearn.preprocessing import LabelEncoder # to encode target class,+ve ,-ve  and neutral


# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_test_encoded = label_encoder.transform(y_test)



# # Load the pre-trained BERT model and tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Assuming 3 classes for sentiment

# # Tokenize and encode the training and testing data
# X_train_tokens = tokenizer.batch_encode_plus(X_train.tolist(), padding=True, truncation=True, return_tensors='pt')
# X_test_tokens = tokenizer.batch_encode_plus(X_test.tolist(), padding=True, truncation=True, return_tensors='pt')

# # Create TensorDatasets
# train_dataset = TensorDataset(X_train_tokens['input_ids'], X_train_tokens['attention_mask'], torch.tensor(y_train_encoded,dtype=torch.long))
# test_dataset = TensorDataset(X_test_tokens['input_ids'], X_test_tokens['attention_mask'], torch.tensor(y_test_encoded,dtype=torch.long))



In [14]:

# # Create DataLoader
# batch_size = 32
# train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
# test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# # Set up GPU/CPU device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Set up optimizer and training parameters
# optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# epochs = 3


In [15]:

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation loop
    model.eval()
    val_loss = 0
    val_accuracy = 0

    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        val_loss += outputs.loss.item()

        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        val_accuracy += torch.sum(preds == labels).item()

    avg_val_loss = val_loss / len(test_dataloader)
    val_accuracy = val_accuracy / len(X_test)

    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Training Loss: {avg_train_loss}')
    print(f'Validation Loss: {avg_val_loss}')
    print(f'Validation Accuracy: {val_accuracy}')


NameError: name 'epochs' is not defined

In [16]:

# Evaluation
model.eval()
predictions = []

for batch in test_dataloader:
    input_ids, attention_mask, _ = batch
    input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    predictions.extend(preds.cpu().numpy())

# Evaluation metrics
print("BERT Results:")
print("Accuracy:", accuracy_score(y_test_encoded, predictions))
print(classification_report(y_test_encoded, predictions))

NameError: name 'model' is not defined

In [17]:
#!pip install dash dash-bootstrap-components pandas plotly


In [28]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px

# Assume you have a DataFrame called 'sentiment_data' with columns: 'Feedback', 'Sentiment', 'Date'
from wordcloud import WordCloud
import base64
# Sample Data
import pandas as pd

import plotly.express as px
# Generate a Stacked Bar Chart for Department Name by Sentiment
stacked_bar_chart = px.bar(df, x='Department Name', color='Sentiment', title='Sentiment Distribution by Department', barmode='stack')
# Generate a Heatmap for correlation between numeric columns
correlation_heatmap = px.imshow(df.corr(), x=df.corr().columns, y=df.corr().columns, title='Correlation Heatmap')

# Count plot of Division Name and Department Name
count_plot = px.histogram(df, x='Division Name', color='Department Name', title='Count Plot of Division Name and Department Name')

# Group by Rating and Department Name, and then count the occurrences
rating_department_count = df.groupby(['Rating', 'Department Name']).size().reset_index(name='Count')

# Stacked bar chart of Department count for each Rating
rating_department_count_chart = px.bar(rating_department_count, x='Rating', y='Count', color='Department Name',
                                       title='Department Count for Each Rating', barmode='stack')

recommended_sentiment_count = df.groupby(['Sentiment', 'Recommended IND']).size().reset_index(name='Count')
timent = px.bar(df, x='Sentiment', color='Recommended IND', title='Count of Recommended Products by Sentiment', barmode='group')
# Group by Sentiment and Recommended IND, and then count the occurrences
recommended_sentiment_count = df.groupby(['Sentiment', 'Recommended IND']).size().reset_index(name='Count')


app = dash.Dash(__name__)

#Define the layout of your dashboard
app.layout = html.Div([
    html.H1("Customer Feedback Dashboard"),
    
    dcc.Graph(
        id='sentiment-pie-chart',
        figure=px.pie(df, values='Rating', names='Sentiment', title='Sentiment Distribution')
    ),
    
    # Count plot of Division Name and Department Name
    dcc.Graph(
        id='count-plot',
        figure=count_plot
    ),
    
     dcc.Graph(
        id='age-histogram',
        figure=px.histogram(df, x='Age', nbins=10, title='Histogram of Age')
    ),
     
     
    # Stacked Bar Chart of Department Name by Sentiment
    dcc.Graph(
        id='sentiment-stacked-bar-chart',
        figure=stacked_bar_chart
    ),
    # Heatmap of Correlation
    dcc.Graph(
        id='correlation-heatmap',
        figure=correlation_heatmap
    ),
     # Stacked bar chart of Department count for each Rating
    dcc.Graph(
        id='rating-department-count-chart',
        figure=rating_department_count_chart
    )
])

if __name__ == '__main__':
    app.run_server(debug=True,mode="inline")
# Expected one of ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 
# 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name', 'Sentiment'] 


In [None]:
#!pip install wordcloud
